-
Notifications
You must be signed in to change notification settings - Fork 1
/
watchbuy.py
194 lines (138 loc) · 6.81 KB
/
watchbuy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
'''
Watchbuyer's Companion
Author: Jasen Chan
TODO:
- add regular expressions for crown&caliber
- show which marketplace has better prices
- change prices for new, box, papers etc
- OPTIMIZE: this is just a barebones working version of the program; many optimizations can be made on the regexs, and possibly on stringbuilding
COMPLETED:
- ** fix bug where searches don't work for models chrono24 already has defined...
- fix 'orient bambino glitch'
- solution: problem was that old regex wouldn't work when there weren't commas in the price; new regex created
- when searching e.g. 'omega seamaster', chrono24 will redirect from the search url to another url with the watch brand and model defined
- add convert to price string i.e. '12333' -> '$12,333'
'''
import re
import urllib2
siteRegex = {
'chrono24':{
'redirect':'\-\-mod\d*(?!\&)',
'totalResults':'(?<=<span class=\"total-count\">)(?<!,)(\d{1,3}(?:,\d{3})*)(?!,)',
'itemSplit':'<div class=\"article-item-container\">[\r\n]*.*[\r\n]*.*[\r\n]*.*[\r\n]*.*[\r\n]*.*[\r\n]*.*[\r\n]*.*[\r\n]*.*[\r\n]*.*[\r\n]*.*[\r\n]*.*[\r\n]*.*[\r\n]*.*',
'priceSplit':'(?<=<span class=\"currency\">\$<\/span>)(?<!,)(\d{1,3}(?:,\d{3})*)(?!,)'
}
}
siteURLs = {
'chrono24':{
'main':'https://www.chrono24.com/search/index.htm?accessoryTypes=&dosearch=true&query=',
'pageIndex':'&showpage=',
'condition_new':'&usedOrNew=new',
'condition_used':'&usedOrNew=used',
'condition_all':'&usedOrNew=used&usedOrNew=new'
}
}
def commaSeparatedPrice(num): # TODO: make this function take into account decimal places or standardize the way decimal places are handled
numList = list(format(num,'0.2f'))
counter = 1
numCommas = 0
for i in range(0,len(numList) - 4): # -4 to take into account decimal and padding
if counter % 3 == 0:
numList.insert(len(numList) - 4 - i - numCommas, ',')
counter = 1
numCommas = numCommas + 1
continue
counter = counter + 1
return "".join(numList)
# convert thousands separated number e.g. '1,234' to int e.g. '1234'
def commaStringToInt(value):
return int("".join(str(value).split(",")))
# get average of a list
def getAverage(valueList):
return sum(valueList)/len(valueList)
# get the median value of a list
def getMedian(valueList):
valueList.sort()
if len(valueList) < 1:
return 0
valueList.sort()
centerIndexFloat = len(valueList)/2.0
centerIndexInt = len(valueList)/2
if centerIndexFloat > centerIndexInt:
return valueList[centerIndexInt]
else:
return (valueList[centerIndexInt] + valueList[centerIndexInt-1])/2.0
# scrape prices recursively off of a url; currently configured for chrono24; returns a list with 2 values: [# of valid watches found, summed price of all watches]
def scrapePrices(searchString,iteration,sourceSite,condition='condition_all',numIncluded=0,totalPrice=0):
# reference global variables
global allPrices, allPrices_new, allPrices_old
# reformat the query from "xx xx xxx" to "xx+xx+xxx"
searchStringReformatted = "+".join(searchString.split(' '))
if sourceSite == 'chrono24':
targetURL = siteURLs['chrono24']['main'] + searchStringReformatted + siteURLs['chrono24']['pageIndex'] + str(iteration) + siteURLs['chrono24'][condition]
#print "Attempting to scrape from page " + targetURL
req = urllib2.Request(targetURL, headers = {"User-Agent": "Mozilla/5.0"})
urlData = urllib2.urlopen(req)
# handle case where target page gets redirected
if sourceSite == 'chrono24':
if len(re.findall(siteRegex[sourceSite]['redirect'],urlData.geturl())) > 0:
#print "Redirected to " + urlData.geturl()
splitUrl = re.split(siteRegex[sourceSite]['redirect'],urlData.geturl())
targetMod = re.search(siteRegex[sourceSite]['redirect'],urlData.geturl()).group(0)
targetURL = splitUrl[0] + targetMod + "-" + str(iteration) + splitUrl[1]
#print "Attempting to scrape from page " + targetURL
urlData = urllib2.urlopen(targetURL)
rawData = urlData.read()
# create progress update
totalResultsCount = re.findall(siteRegex[sourceSite]['totalResults'],rawData)
# handle case where page after last has no counts
try:
totalResultsCount = totalResultsCount[0]
except IndexError:
totalResultsCount = '0'
# display percentage completed
if iteration == 1:
totalResults = commaStringToInt(totalResultsCount)
print "Found a total of " + str(totalResults) + (" watch" if totalResults == 1 else " watches")
else:
totalResults = commaStringToInt(totalResultsCount)
print "Analyzed " + str(numIncluded) + " out of " + str(totalResults) + (" watch" if totalResults == 1 else " watches") + "(" + format(100 * (float(numIncluded)/float((totalResults if totalResults > 0 else 1))),'0.1f') + "%)"
# split the target watches on the page with all data; ends at article-price-container div
pageItemsSplit = re.findall(siteRegex[sourceSite]['itemSplit'],rawData)
#print pageItemsSplit, len(pageItemsSplit)
# turn the split items into a single string for further regex analysis
pageItemsString = "".join(pageItemsSplit)
# parse the prices from the containers
rawPrices = re.findall(siteRegex[sourceSite]['priceSplit'],pageItemsString)
# reformat the prices from '\d\d,\d\d\d' to int \d\d\d\d\d
prices = [commaStringToInt(i) for i in rawPrices]
allPrices += prices
#print prices, len(prices), iteration, numIncluded, totalPrice
print "Analyzing " + str(len(prices)) + " watches on page " + str(iteration) + "..."
# if watch prices are returned, recursively call this function again
if len(prices) <= 0:
return [numIncluded,totalPrice]
#print iteration, numIncluded, totalPrice
return scrapePrices(searchString, iteration+1, sourceSite, condition, numIncluded + len(prices), totalPrice + sum(prices))
def generateReport(data):
print "Average price of " + str(data[0]) + " \'" + query + "\' watch" + ("" if data[0] == 1 else "es") + " is $" + commaSeparatedPrice(data[1]/(data[0] if data[0] > 0 else 1))
#print allPrices
#getMedian(allPrices)
print "Median price of " + str(data[0]) + " \'" + query + "\' watch" + ("" if data[0] == 1 else "es") + " is $" + commaSeparatedPrice(getMedian(allPrices))
print "Lowest price of " + str(data[0]) + " \'" + query + "\' watch" + ("" if data[0] == 1 else "es") + " is $" + commaSeparatedPrice(allPrices[0] if len(allPrices) > 0 else 0)
print "Highest price of " + str(data[0]) + " \'" + query + "\' watch" + ("" if data[0] == 1 else "es") + " is $" + commaSeparatedPrice(allPrices[len(allPrices)-1] if len(allPrices) > 0 else 0)
# declare global list with all the prices seen
allPrices = []
# get input query
query = raw_input("Query: ")
# scrape website
print "Initiation chrono24 scrape..."
# scrape new watches
data_new = scrapePrices(query, 1, 'chrono24','condition_new')
print "New " + query + " watches"
generateReport(data_new)
# scrape used watches
allPrices = []
data_used = scrapePrices(query, 1, 'chrono24', 'condition_used')
print "Used " + query + " watches"
generateReport(data_used)