-
Notifications
You must be signed in to change notification settings - Fork 3
/
bilibili_info.py
77 lines (58 loc) · 2 KB
/
bilibili_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# scrape Bilibili video info
from acrawler import Crawler, Item, Request, register
from acrawler.handlers import ItemToMongo, ItemToRedis
# Inclusive lower/upper bounds of the channel ids (sent as the `rid` query
# value) that BiliInfoCrawler.start_requests iterates over.
MIN_TID = 1
MAX_TID = 250
class ChannelItem(Item):
    """Item describing one channel.

    Instances are created in ``BiliInfoCrawler.parse_json`` with the extra
    fields ``tid``, ``tname``, ``count`` and ``url``.
    """

    # NOTE(review): presumably switches on acrawler's per-item logging for
    # this item type; confirm against the ``Item`` base class.
    log = True
class OneItem(Item):
    """Item carrying a single listing-page URL under the ``url`` field.

    Instances are created in ``BiliInfoCrawler.parse_json``, one per page.
    """
class BiliInfoCrawler(Crawler):
    """Probe candidate channel ids and emit channel info plus page URLs.

    For every id in ``MIN_TID..MAX_TID`` (inclusive) one single-entry
    request is sent to the ``newlist`` endpoint.  When the response confirms
    the id, a ``ChannelItem`` describing the channel is yielded, followed by
    one ``OneItem`` per paginated listing URL of that channel.
    """

    config = {"DOWNLOAD_DELAY": 0.2, "MAX_REQUESTS": 3, "MAX_WORKERS": 20}

    async def start_requests(self):
        """Yield one probing ``Request`` per candidate channel id.

        Each request asks for a single entry (``pn=1&ps=1``) and carries the
        probed id in ``meta["tid"]`` so ``parse_json`` can compare it with
        the id reported in the response.
        """
        for tid in range(MIN_TID, MAX_TID + 1):
            url = f"http://api.bilibili.com/x/web-interface/newlist?rid={tid}&pn=1&ps=1"
            yield Request(url, callback=self.parse_json, meta={"tid": tid})

    def parse_json(self, response):
        """Turn a probe response into a ``ChannelItem`` and ``OneItem``s.

        Nothing is yielded when the reported video count is 1 or less, or
        when the first archive's ``tid`` differs from the probed id.

        Args:
            response: Response of a request created by ``start_requests``;
                its ``meta["tid"]`` is the probed id and its ``json`` is the
                decoded API payload.

        Yields:
            One ``ChannelItem`` (fields ``tid``, ``tname``, ``count``,
            ``url``), then one ``OneItem`` (field ``url``) per listing page.
        """
        tid = response.meta["tid"]
        data = response.json["data"]
        count = data["page"]["count"]

        # NOTE(review): the threshold is `> 1`, so a channel holding exactly
        # one video is skipped; kept as in the original, confirm it is
        # intended.
        if count <= 1:
            return

        newest = data["archives"][0]
        # Only accept the channel when the entry returned really belongs to
        # the id that was probed.
        if newest["tid"] != tid:
            return

        channel = {
            "tid": tid,
            "tname": newest["tname"],
            "count": count,
            "url": f"http://api.bilibili.com/x/web-interface/newlist?rid={tid}",
        }
        yield ChannelItem(extra=channel)

        for pn, ps in self.get_pn_ps(count):
            url = f"http://api.bilibili.com/x/web-interface/newlist?rid={tid}&pn={pn}&ps={ps}"
            yield OneItem(extra={"url": url})

    def get_pn_ps(self, count, ps=50):
        """Yield ``(page_number, page_size)`` pairs covering ``count`` entries.

        Page numbers start at 1.  The number of pages is the ceiling of
        ``count / ps``, so a count that is an exact multiple of the page size
        no longer produces a trailing empty page (the previous
        ``count // 50 + 1`` did).

        Args:
            count: Total number of entries to cover.
            ps: Page size; defaults to 50, the value previously hard-coded.

        Yields:
            Tuples ``(pn, ps)`` for ``pn`` in ``1..ceil(count / ps)``.
        """
        last_page = (count + ps - 1) // ps  # integer ceiling division
        for pn in range(1, last_page + 1):
            yield pn, ps
@register()
class BiliToRedis(ItemToRedis):
    """Handler for the ``OneItem`` family that collects page URLs in Redis."""

    family = "OneItem"
    items_key = "bili:vsurls"

    async def handle_after(self, item):
        """Add the item's ``url`` field to the Redis set named by ``items_key``."""
        await self.redis.sadd(self.items_key, item["url"])
@register()
class BiliToMongo(ItemToMongo):
    """Handler for the ``ChannelItem`` family, configured for MongoDB.

    Only configuration is declared here; the storing logic comes from the
    ``ItemToMongo`` base class.
    """

    family = "ChannelItem"

    # Target database and collection names.
    db_name = "bili"
    col_name = "channel"

    # NOTE(review): presumably the field the base handler uses to identify an
    # existing document; confirm against ``ItemToMongo``.
    primary_key = "tid"
if __name__ == "__main__":
    # Start the crawl only when this file is executed as a script.
    crawler = BiliInfoCrawler()
    crawler.run()