-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawl_reddit.py
132 lines (103 loc) · 3.59 KB
/
crawl_reddit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python
import urllib2
import json
import time
import psycopg2
import psycopg2.extras
import httplib
import sys
class psql:
def __init__(self):
#
self.db = []
def connect(self):
conn_string = ''
print "Connecting to database\n ->%s" % (conn_string)
self.conn = psycopg2.connect(conn_string)
self.cursor = self.conn.cursor()
def grab(url):
while True:
try:
print 'grabbing ' + url
req = urllib2.Request(url)
req.add_header('User-Agent','crawl bot v0.1')
response = urllib2.urlopen(req)
the_page = response.read()
d = json.loads(the_page)
except (ValueError):
print "ValueError"
time.sleep(10)
continue
except (urllib2.HTTPError, urllib2.URLError):
return None
except httplib.IncompleteRead:
print "incomplete read"
return None
break
return d.get("data")
# Listing endpoint: newest posts across all of reddit, 100 per page.
# The caller appends a base36 post id after the 't3_' kind prefix to page
# backwards through the listing.
get_url = 'http://www.reddit.com/r/all/new/.json?sort=new&limit=100&after=t3_'
def main():
db = psql()
db.connect()
g_after = '1vj6ey'
while True:
try:
data = grab(get_url + g_after)
children = data.get("children")
for item in children:
listing = item.get("data")
id = listing.get("id")
subreddit = listing.get("subreddit")
title = listing.get("title")
author = listing.get("author")
created = listing.get("created")
url = listing.get("url")
domain = listing.get("domain")
permalink = listing.get("permalink")
ups = listing.get("ups")
downs = listing.get("downs")
comments = listing.get("num_comments")
scraped = time.time()
insert_data = (id,subreddit,title,author,created,url,domain,permalink,ups,downs,comments,scraped)
print insert_data
try:
db.cursor.execute("INSERT INTO posts VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);", insert_data)
except psycopg2.IntegrityError:
print "Integrity Error!"
db.conn.commit()
db.conn.commit()
next = data.get("after")
if next == None:
break
g_after = next[3:]
print g_after
#bail if reached already committed threads
if base36decode(g_after) <= int('1v6761', 36):
break
#pause and continue
time.sleep(2)
except AttributeError:
after_int = base36decode(g_after)
g_after = base36encode(after_int-50).lower()
time.sleep(2)
continue
#http://en.wikipedia.org/wiki/Base_36#Python_implementation
def base36encode(number, alphabet='0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
    """Converts an integer to a base36 string.

    Negative numbers are prefixed with '-'. Raises TypeError for
    non-integer input. Works on both Python 2 and Python 3 (the original
    referenced the Python-2-only `long` type).
    """
    try:
        integer_types = (int, long)  # Python 2
    except NameError:
        integer_types = (int,)       # Python 3: int covers all integers
    if not isinstance(number, integer_types):
        raise TypeError('number must be an integer')
    base36 = ''
    sign = ''
    if number < 0:
        sign = '-'
        number = -number
    # Single-digit fast path.
    if 0 <= number < len(alphabet):
        return sign + alphabet[number]
    while number != 0:
        number, i = divmod(number, len(alphabet))
        base36 = alphabet[i] + base36
    return sign + base36
def base36decode(number):
    """Decode a base36 string (as produced by base36encode) to an integer."""
    value = int(number, 36)
    return value
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()