crawl_wiki.py

"""
Wikipedia crawler service (downloader)
@author TaoPR (github.com/starcolon)
---
The service runs in background and keeps crawling raw knowledge data.
The downloaded data from Wikipedia is senquentially pushed to
the specified RabbitMQ.
"""
import sys
import asyncio
import argparse
from functools import reduce
from termcolor import colored
from pylib.spider import wiki as Wiki
from pylib.knowledge.datasource import MineDB

arguments = argparse.ArgumentParser()
arguments.add_argument('--verbose', dest='verbose', action='store_true', help='Turn verbose output on.')
arguments.add_argument('--depth', type=int, default=4, help='Indicate maximum depth of crawling level')
args = vars(arguments.parse_args(sys.argv[1:]))
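
# Example invocation (flags as defined by the parser above; depth defaults to 4):
#   python crawl_wiki.py --verbose --depth 3
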
"""
Initialise a lazy connection to the crawling record collection
"""
def init_crawl_collection():
    crawl_collection = MineDB('localhost','vor','crawl')
    return crawl_collection
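
# NOTE: the MineDB('localhost','vor','crawl') arguments are assumed here to be
# (host, database, collection), i.e. a local MongoDB database 'vor' with a
# 'crawl' collection; see pylib.knowledge.datasource.MineDB for the actual
# signature.
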
"""
Save the content of a Wikipedia page
"""
def save_content(crawl_collection,title,content):
    if crawl_collection.count({'title': title})>0:
        crawl_collection.update(
            {'title': title},
            {'$set':{'downloaded':True, 'content': content}}
        )
    else:
        crawl_collection.insert({'title': title, 'downloaded': True, 'content':content})

"""
Check whether a wiki page has already been downloaded
"""
def is_downloaded(crawl_collection,title):
    return crawl_collection.count({'title': title, 'downloaded': True})>0

"""
Mark a wiki page as downloaded
"""
def mark_as_downloaded(crawl_collection,title):
    if crawl_collection.count({'title': title})>0:
        crawl_collection.update({'title': title},{'$set':{'downloaded':True}})
    else:
        crawl_collection.insert({'title': title, 'downloaded': True, 'content':None})

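"""
Yield wiki page titles still pending download: pending records first,
then not-yet-downloaded links of already downloaded pages (capped at
max_samples). A fresh crawl is seeded with /wiki/Graph_theory.
"""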
def list_crawl_pending(crawl_collection,max_samples):
    n = 0
    # Major pending list
    majors = [t['title'] for t in crawl_collection.query({'downloaded': False})]
    # If fresh new crawl, no pending downloads,
    # initialise with a seed
    if len(majors)==0:
        print(colored('Fresh new crawling...','yellow'))
        yield '/wiki/Graph_theory'
    for m in majors:
        n += 1
        yield m
    # List rels of downloaded major pages
    # which are not yet downloaded
    for t in crawl_collection.query({'downloaded': True}):
        content = t['content']
        # Guard against records marked as downloaded without stored content
        rels = content['rels'] if content else []
        for r in rels:
            # Skip the downloaded links
            if not is_downloaded(crawl_collection,r):
                # Short circuit if maximum number of samples to generate exceeded
                if n>max_samples:
                    return
                n += 1
                yield r

"""
Register a fresh wiki page as not yet downloaded
"""
def add_pending(crawl_collection,title):
    if crawl_collection.count({'title': title})==0:
        crawl_collection.insert({'title': title, 'downloaded': False, 'content':None})
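
# For reference, a crawl record written by the helpers above is assumed to
# look roughly like this (the exact 'content' structure is whatever
# Wiki.download_wiki returns; only the 'rels' key is relied upon here):
#
#   {'title': '/wiki/Graph_theory',
#    'downloaded': True,
#    'content': {'rels': ['/wiki/Vertex_(graph_theory)', ...], ...}}
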
"""
Execute a crawling subprocess on the destination wiki page title
"""
async def crawl(crawl_collection,title,depth,verbose):
    # Crawl the content
    add_pending(crawl_collection, title)
    # Skip if already downloaded or the maximum recursion depth is reached
    if depth>0 and not is_downloaded(crawl_collection, title):
        content = Wiki.download_wiki('https://en.wikipedia.org' + title, verbose)
        # Store the downloaded content in MongoDB
        save_content(crawl_collection, title, content)
        # Now recursively download the related links
        subtasks = []
        for rel in content['rels']:
            subtasks.append(asyncio.ensure_future(
                crawl(crawl_collection, rel, depth-1, verbose)
            ))
        # Await the child crawls here instead of calling run_until_complete
        # on the event loop, which is already running at this point
        if subtasks:
            await asyncio.wait(subtasks)

if __name__ == '__main__':
    depth = args['depth']
    print(colored('# Max depth to run: ','cyan'), depth)
    loop = asyncio.get_event_loop()
    # Prepare the crawling record handler
    crawl_collection = init_crawl_collection()
    # Load the list of pending wiki pages
    pendings = list_crawl_pending(crawl_collection,max_samples=32)
    # For each title in the pending list, spawn a new crawler task
    # to download its data
    tasks = []
    for title in pendings:
        print(colored('Pending for crawling: ','green'), title)
        tasks.append(asyncio.ensure_future(
            crawl(crawl_collection, title, depth, args['verbose'])
        ))
    # Wait until all top-level async crawling tasks end
    if tasks:
        loop.run_until_complete(asyncio.wait(tasks))
    loop.close()