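# douban_book_scraper.py
# Scrapes a Douban book doulist (https://www.douban.com/doulist/1264675/) for
# each book's title, rating, rating count, cover image URL, press, publish
# year and ISBN, then writes the results to a CSV file.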
# -*- coding: utf-8 -*-
import csv
import re
import urllib.request
import time
import random

topnum = 1  # running rank counter across all pages

# Fetch a URL and return the decoded HTML text, or '' on failure.
def getHtml(url):
    try:
        page = urllib.request.urlopen(url)
        html = page.read().decode("utf-8")
    except Exception as e:
        print("failed to get url:", url, e)
        return ''
    return html
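# Note (not part of the original script): if the site rejects the default
# urllib User-Agent, a browser-like header can be passed instead, e.g.:
#   req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
#   page = urllib.request.urlopen(req)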
# Extract each book's title on the page via regex (anchors with target="_blank").
def getTitle(html):
    nameList = re.findall(r'<a href="https.*?".*?target="_blank">(.*?)</a>', html, re.S)
    newNameList = []
    global topnum
    for item in nameList:
        if item.find("img") == -1:  # anchors containing <img> are cover links, not titles
            # each page yields one junk text anchor; skip every 26th match to keep 25 titles
            if topnum % 26 != 0:
                newNameList.append(item.strip())  # strip the surrounding newlines
            topnum += 1
    return newNameList
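# For reference, a doulist title anchor looks roughly like this (assumed
# markup for illustration, not copied verbatim from Douban):
#   <a href="https://book.douban.com/subject/XXXXXX/" target="_blank">
#   书名
#   </a>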
# Collect each book's detail-page URL. Each subject link is matched twice per
# book (cover image and title), so keep every second occurrence.
def getDetail(html):
    detailList = re.findall(r'<a href="(https.*?)".*?target="_blank">.*?</a>', html, re.S)
    newDetailList = []
    for index, item in enumerate(detailList):
        if item.find("subject") != -1 and index % 2 != 0:
            newDetailList.append(item)
    return newDetailList
# Extract the publication year from a book's detail page.
def getPublishYear(html):
    return re.findall(r'<span class="pl">出版年.*?</span>(.*?)<br/>', html, re.S)

# Extract the publisher.
def getPress(html):
    return re.findall(r'<span class="pl">出版社.*?</span>(.*?)<br/>', html, re.S)

# Extract the ISBN.
def getIsbn(html):
    return re.findall(r'<span class="pl">ISBN.*?</span>(.*?)<br/>', html, re.S)
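# The three extractors above assume detail-page metadata of roughly this
# shape (reconstructed for illustration, not copied from Douban):
#   <span class="pl">出版社:</span> 某出版社<br/>
#   <span class="pl">出版年:</span> 2006-5<br/>
#   <span class="pl">ISBN:</span> 9787020000000<br/>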
# Extract each book's cover image URL, filtering out site assets (js/css/icons).
def getImg(html):
    imgList = re.findall(r'img.*?width=.*?src="(http.*?)"', html, re.S)
    newImgList = []
    for item in imgList:
        if all(key not in item for key in ("js", "css", "dale", "icon", "png")):
            newImgList.append(item)
    return newImgList
# Extract each book's rating.
def getScore(html):
    return re.findall(r'<span.*?class="rating_nums">(.*?)</span>', html, re.S)

# Extract each book's rating count: keep spans whose text contains "评价"
# ("ratings"), e.g. "(12345人评价)".
def getComment(html):
    commentList = re.findall(r'<span>(.*?)</span>', html, re.S)
    newcommentList = []
    for item in commentList:
        if item.find("评价") >= 1:
            newcommentList.append(item)
    return newcommentList
# Save the collected rows to a CSV file.
def saveInfo(infoList):
    with open('D:/book_scraper.csv', 'w+', newline='', encoding='gb18030') as fp:
        a = csv.writer(fp, delimiter=',')  # delimiter separates the fields of each row
        a.writerow(['Title', 'Score', 'Ratings', 'Image URL', 'Press', 'Publish Year', 'ISBN'])
        a.writerows(infoList)
    print('save finished')
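# gb18030 is presumably chosen so the CSV opens cleanly in Excel on a
# Chinese-locale Windows machine; 'utf-8-sig' is a common alternative.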
# Initialize the result containers.
namesUrl = []
imgsUrl = []
scoresUrl = []
commentsUrl = []
introductionsUrl = []
isbnsUrl = []
publishYearsUrl = []
newPresssUrl = []
allInfo = []
print("Starting Main: scrape start time")
print(time.ctime(time.time()))
# Page through the list, 25 books per page.
for page in range(0, 450, 25):
    url = "https://www.douban.com/doulist/1264675/?start={}".format(page)
    html = getHtml(url)
    if html == '':
        # append (not extend) a placeholder so the columns stay aligned
        namesUrl.append('none')
        imgsUrl.append('none')
        scoresUrl.append('none')
        commentsUrl.append('none')
        introductionsUrl.append('none')
    else:
        namesUrl.extend(getTitle(html))
        imgsUrl.extend(getImg(html))
        scoresUrl.extend(getScore(html))
        commentsUrl.extend(getComment(html))
        introductionsUrl.extend(getDetail(html))
namesUrl.pop()  # drop the trailing junk element picked up on the last page
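# range(0, 450, 25) requests 18 pages; at 25 books per page each list should
# hold 450 entries when every page loads (449 titles after the pop above).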
# Visit each book's detail page for press, publish year and ISBN.
for index, item in enumerate(introductionsUrl):
    print(item)
    html_detail = getHtml(item)  # fetch once; reused for both the check and the parsing
    if html_detail == '':  # the link may be dead
        newPresssUrl.append("link not found")
        publishYearsUrl.append("link not found")
        isbnsUrl.append("link not found")
    else:
        press = getPress(html_detail)
        year = getPublishYear(html_detail)
        isbn = getIsbn(html_detail)
        # keep the first match, stripped, so each CSV cell holds text rather
        # than a Python list
        newPresssUrl.append(press[0].strip() if press else '')
        publishYearsUrl.append(year[0].strip() if year else '')
        isbnsUrl.append(isbn[0].strip() if isbn else '')
    time.sleep(random.randint(1, 2))  # polite delay between requests
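# With ~450 detail pages and a 1-2 s delay each, this detail pass alone takes
# roughly 8-15 minutes.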
# Assemble one row per book and report the list lengths as a sanity check.
for i in range(0, len(namesUrl)):
    allInfo.append([namesUrl[i], scoresUrl[i], commentsUrl[i], imgsUrl[i],
                    newPresssUrl[i], publishYearsUrl[i], isbnsUrl[i]])
print(len(namesUrl))
print(len(commentsUrl))
print(len(imgsUrl))
print(len(scoresUrl))
print(len(newPresssUrl))
print(len(publishYearsUrl))
print(len(isbnsUrl))
saveInfo(allInfo)
print("Exiting Main: scrape end time")
print(time.ctime(time.time()))
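# Run directly with: python douban_book_scraper.py
# (the output path D:/book_scraper.csv is hard-coded in saveInfo above)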