# crawl2_test.py
'''
Web Crawler for Shopping.com
'''
import sys
import os
from lxml import html
import requests
import string
import unittest
class featureCollect:
    '''
    Counts products shown on Shopping.com result pages.

    Attributes:
        productTree: dict created empty by the constructor; no method of
            this class reads or writes it afterwards.
        count: running product total, kept as an int (starts at 0).
    '''

    def __init__(self):
        self.productTree = dict()
        self.count = 0

    def collect(self, url):
        '''
        Fetch url and add its number of "productName" elements to count.

        Returns 1 when the page contained at least one such element and
        0 otherwise.
        '''
        headers = {'user-agent': 'Mozilla/5.0 (compatible; CrawlBot/2.1; +http://www.google.com/bot.html)'}
        page = requests.get(url, headers=headers)
        tree = html.fromstring(page.text)
        found = len(tree.find_class("productName"))
        if found > 0:
            self.count = self.count + found
            return 1
        return 0

    @staticmethod
    def _parse_total(text):
        '''
        Return the last whitespace-separated token of text as an int.

        Thousands separators (",") are dropped before conversion. Returns
        None when text has no tokens or the last token is not a number.
        '''
        tokens = text.split()
        if not tokens:
            return None
        try:
            return int(tokens[-1].replace(',', ''))
        except ValueError:
            return None

    def directCollect(self, url):
        '''
        Fetch url and set count from its "numTotalResults" element(s).

        For every element with that class, the last token of its first
        text node is parsed as an integer and stored in count (the last
        parseable element wins). Elements with no text, or whose trailing
        token is not numeric, are skipped so that count always stays an
        int and can still be accumulated by collect().
        '''
        page = requests.get(url)
        tree = html.fromstring(page.text)
        for node in tree.find_class("numTotalResults"):
            texts = node.xpath('text()')
            if not texts:
                # Element without a direct text node: nothing to parse.
                continue
            total = self._parse_total(texts[0])
            if total is not None:
                self.count = total
def crawl(page, product):
    '''
    Count Shopping.com products for a keyword and print the total.

    Args:
        page: 0 to walk through every result page; any other value
            fetches only the result page with that number.
        product: search keyword. It is concatenated into the KW query
            parameter as-is, without URL escaping.

    Prints a banner line followed by the collected product count.
    Returns None.
    '''
    base_url = "http://www.shopping.com/products"
    collector = featureCollect()

    if page == 0:
        # The first result page has no "~PG-" suffix. Its outcome does not
        # stop the walk: page 2 is always requested afterwards.
        collector.collect(base_url + '?KW=' + product)
        page_no = 2
        # collect() returns 1 while a page still lists products.
        while collector.collect(base_url + '~PG-' + str(page_no) + '?KW=' + product) == 1:
            page_no += 1
    else:
        collector.collect(base_url + '~PG-' + str(page) + '?KW=' + product)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(':::::: Total product count ::::::')
    print(collector.count)
class parseTests(unittest.TestCase):
    '''
    Live tests that call crawl() against Shopping.com (network required).

    NOTE(review): crawl() in this file has no return statement, so both
    assertions below are evaluated against None: testOne fails and testTwo
    passes regardless of what was crawled. Confirm the intended return
    contract of crawl() before relying on these results.
    '''

    def testOne(self):
        '''Expects a truthy result from crawling result page 2 for "deo".'''
        self.assertTrue(crawl(2, "deo"))

    def testTwo(self):
        '''Expects a falsy result from crawling all pages for "deo".'''
        self.assertFalse(crawl(0, "deo"))
def main():
    '''Hand control to the unittest command-line test runner.'''
    unittest.main()


# Run the tests only when this file is executed as a script.
if __name__ == "__main__":
    main()