header = '''
#####################################################################################################################
#Program: docbao crawler
#Author: hailoc12
#Version: 1.0.0
#Date: 14/06/2019
#Repository: http://github.com/hailoc12/docbao_crawler
#File: crawl_multiprocessing.py
#Function: demo of multiprocessed crawling using the Firefox browser
#####################################################################################################################
'''
# IMPORT LIB
from lib import *
import multiprocessing
import os
import time
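
# Note: the wildcard import above is assumed to provide BrowserWrapper, WebConfig,
# utils and etree (lxml), which are the only names from lib used in this demo.
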
def crawler_process(process_name, lock, browser_list, crawl_queue, crawled_data):
    # Function: works as a worker in the multiprocessed crawl
    # Input:
    #     lock: lock to acquire/release around access to the shared queues
    #     browser_list: shared queue of browsers, kept so they can be released on timeout
    #     crawl_queue: shared queue of "crawl tasks" (WebConfig objects)
    #     crawled_data: shared queue that collects the crawled data
    # Output:
    #     crawled_data: receives the newly crawled items
    print("Crawler %s has been started" % process_name)

    # each worker owns one browser; register it in the shared list
    browser = BrowserWrapper()
    lock.acquire()
    browser_list.put(browser)
    lock.release()

    try:
        while True:
            print("Crawler %s is running" % process_name)

            # get a web config from crawl_queue
            webconfig = None
            lock.acquire()
            if not crawl_queue.empty():  # there are more jobs
                webconfig = crawl_queue.get()
                lock.release()

                # crawl data
                print("Crawler %s is crawling page %s" % (process_name, webconfig.get_webname()))
                url = webconfig.get_crawl_url()
                html = utils.read_url_source(url, webconfig, browser)
                html_etree = etree.HTML(html)
                title = html_etree.xpath('//title/text()')[0]
                crawled_data.put({'webname': webconfig.get_webname(), 'title': title})
            else:
                # no job left: release the lock, quit the browser and stop this worker
                lock.release()
                if browser is not None:
                    print("Quit browser in Crawler %s" % process_name)
                    browser.quit()
                print("Crawler %s is putting crawled data to main queues" % process_name)
                print("Crawler %s has finished" % process_name)
                return None
    except Exception:
        print("There are some errors in crawler %s" % process_name)
        if browser is not None:
            print("Quit browser in Crawler %s" % process_name)
            browser.quit()
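
# Design note: each worker keeps pulling tasks from the shared crawl_queue until the
# queue is empty, then quits its own Firefox instance and exits, so at most
# max_crawler browsers are open at any time.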

# PROGRAM STARTS HERE!
print(header)

### Change this!
crawl_urls = [{'webname': 'Dân Trí', 'crawl_url': 'http://dantri.com.vn'},
              {'webname': 'Vietnamnet', 'crawl_url': 'http://vietnamnet.vn'},
              {'webname': 'Thanh Niên', 'crawl_url': 'http://thanhnien.com.vn'}]
max_crawler = 3  # maximum number of Firefox browsers used to crawl; depends on server resources

# Create a Manager proxy to host shared data for multiprocessed crawling
with multiprocessing.Manager() as manager:
    # data shared between processes
    crawl_queue = manager.Queue()
    crawled_data = manager.Queue()
    new_blacklists = manager.Queue()  # not used in this demo
    browser_list = manager.Queue()  # keep all Firefox browsers so they can be released on timeout
    lock = manager.Lock()
    timeout_flag = manager.Value('i', 0)  # shared flag to inform processes when a timeout happens (not used in this demo)
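    # Note: these are Manager proxies, so the Queue/Lock/Value objects above can be
    # shared safely between the worker processes spawned below.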

    # Init crawl queue
    number_of_job = 0
    for crawl_url in crawl_urls:
        webconfig = WebConfig()
        webconfig.set_webname(crawl_url['webname'])
        webconfig.set_config('crawl_url', crawl_url['crawl_url'])
        webconfig.set_config('use_browser', True)
        # set other configs here, see crawl_login_page.py for details
        #webconfig.set_config('browser_fast_load', True)
        #webconfig.set_config('browser_profile', 'test_profile')
        webconfig.set_config('display_browser', False)  # note: display_browser=True won't work in SSH mode
        crawl_queue.put(webconfig)
        number_of_job += 1

    # Start crawler processes
    time.sleep(1)
    print("%s crawlers are set to be run in parallel" % str(max_crawler))
    crawler_processes = []
    time.sleep(1)
    print("Init %s crawlers" % str(max_crawler))
    start = time.time()
    for i in range(max_crawler):
        crawler = multiprocessing.Process(target=crawler_process, args=(str(i + 1), lock, browser_list, crawl_queue, crawled_data))
        crawler_processes.append(crawler)
        crawler.start()
        time.sleep(1)
        print("Start crawler number %s (pid: %s)" % (str(i + 1), crawler.pid))
    running = True
    running_crawler = ""
    count = 0
    while running:
        running = False
        count = 0
        running_crawler = ""
        for crawler in crawler_processes:
            count += 1
            if crawler.is_alive():
                running_crawler = running_crawler + " %s " % str(count)
                running = True
        print("Running crawler:")
        print(running_crawler)
        time.sleep(20)

    time.sleep(1)
    print("Finish crawling")
    print("Crawling took %s seconds" % str(round(time.time() - start, 1)))
    time.sleep(1)

    # Print crawled data
    print("Crawled data")
    while not crawled_data.empty():
        item = crawled_data.get()
        print("Page: %s" % item['webname'])
        print("Crawled title: %s" % item['title'])
        print()
    print("FINISH")