conservative_party.py
import re
import urllib2
import email
import tempfile
from subprocess import Popen, PIPE, STDOUT

import dateutil.parser
import lxml.html

from base import BaseScraper

# Scrape the press release archive from the Conservative Party website.
#
# The site commits a litany of atrocities, which makes it a good
# example of a typical web site.
#
# Requirements:
#   antiword                 (Word -> text converter)
#   Email::Outlook::Message  (CPAN module)
#
# As if press releases in Word docs weren't bad enough, a lot (~75%?)
# of the Conservative releases are in .msg files, which is an
# undocumented Microsoft Exchange (or Outlook?) format.
#
# Luckily, Email::Outlook::Message on CPAN can parse them and turn
# them into something sane.
#
# To install it from CPAN (please forgive the Perl naivety here):
#
#   $ sudo perl -MCPAN -e shell
#   cpan> install Email::Outlook::Message
#
# It depends in turn on OLE::Storage_Lite and IO::All - the cpan shell
# should ask whether you want to install them first.
#
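
# A quick sanity check for the external dependencies described above - a
# minimal sketch, not part of the scraper proper. It assumes 'antiword'
# and 'perl' are on the PATH; run it by hand if conversions mysteriously
# come back empty.
def check_dependencies():
    # antiword prints usage and exits non-zero when run with no input;
    # here we only care whether the binary can be launched at all.
    try:
        Popen(['antiword'], stdout=PIPE, stderr=STDOUT).communicate()
    except OSError:
        print "antiword not found - install it first"
    # perl exits 0 only if the module loads cleanly.
    p = Popen(['perl', '-MEmail::Outlook::Message', '-e', '1'],
              stdout=PIPE, stderr=STDOUT)
    p.communicate()
    if p.returncode != 0:
        print "Email::Outlook::Message not installed - see CPAN notes above"
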
class Scraper(BaseScraper):
    long_name = "The Conservative Party"

    def run(self):
        # grab the latest 100 releases from the archive listing
        start_url = 'http://www.conservatives.com/Activist_centre/Press_Releases.aspx?take=100'
        links_page = self.get_url(start_url)
        for link in links_page.cssselect('.results .clfx h2 a'):
            page_url = link.get('href')
            self.extract(page_url)

    def extract(self, url):
        html = urllib2.urlopen(url).read()
        # Convert to unicode. The page's meta tags _claim_ it's iso-8859-1,
        # but the HTTP headers say utf-8. Sigh. (The headers are right.)
        html = html.decode('utf-8')
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        date_txt = unicode(doc.cssselect(".lg-content .info")[0].text_content()).strip()
        title = unicode(doc.cssselect(".lg-content h1")[0].text_content()).strip()
        published = dateutil.parser.parse(date_txt)
        # Annoyingly, the press release text can be either:
        # - on the page, as you'd expect
        # - attached as a Word doc. Grr.
        # - attached as a .msg file. WTF?
        attached = doc.cssselect('.lnklist a')
        if attached:
            # press release is in an attachment
            attachment_url = attached[0].get('href')
            txt = self.text_from_attachment(attachment_url)
        else:
            # press release is on the page; strip the link cruft first
            maintxt = doc.cssselect('.main-txt')[0]
            for cruft in maintxt.cssselect('.botlnks'):
                cruft.getparent().remove(cruft)
            txt = unicode(maintxt.text_content())
        self.upsert_press_release({
            'published': published,
            'title': title,
            'text': txt,
            'source_link': url,
        })

    def text_from_attachment(self, attachment_url):
        attachment_url = attachment_url.replace(' ', '%20')
        f = urllib2.urlopen(attachment_url)
        data = f.read()
        m = re.search(r'filename=(.*)', f.info()['Content-Disposition'])
        filename = m.group(1)
        if filename.lower().endswith('.doc'):
            # Word file - convert to text with antiword.
            p = Popen(['antiword', '-m', 'UTF-8', '-'],
                      stdout=PIPE, stdin=PIPE, stderr=STDOUT)
            txt = p.communicate(input=data)[0]
            # kill the table at the top
            pat = re.compile(r'^\s*[|].*?$', re.DOTALL | re.MULTILINE)
            txt = pat.sub('', txt).strip()
            return unicode(txt, 'utf-8')
        if filename.lower().endswith('.msg'):
            # It's an Exchange ".msg" file! Oh, the humanity...
            # First step - convert the silly .msg format into a standard
            # MIME message via Email::Outlook::Message (see notes up top).
            tmp = tempfile.NamedTemporaryFile()
            tmp.write(data)
            tmp.flush()
            perlcmd = ("use Email::Outlook::Message; "
                       "print new Email::Outlook::Message('%s')"
                       "->to_email_mime->as_string;" % tmp.name)
            devnull = open('/dev/null', 'w')
            p = Popen(['perl', '-w', '-e', perlcmd],
                      stdout=PIPE, stdin=PIPE, stderr=devnull)
            mimedata = p.communicate()[0]
            tmp.close()
            # Second step - find the email text (just use the first
            # 'text/plain' part).
            msg = email.message_from_string(mimedata)
            for part in msg.walk():
                if part.get_content_type() == 'text/plain':
                    # KLUDGE: should honour part.get_content_charset(),
                    # but some messages lie about it.
                    txt = part.get_payload().decode('utf-8', 'ignore')
                    return txt.strip()
            assert False, "no text/plain part found in .msg attachment"
        assert False, "unhandled attachment type: %s" % filename
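
# Note: BaseScraper (in base.py, not shown here) is assumed to provide
# get_url() - fetch a URL and parse it into an lxml tree - and
# upsert_press_release() - insert-or-update a stored release record.
# Those descriptions are inferred from the calls above rather than from
# base.py itself.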

if __name__ == "__main__":
    scraper = Scraper()
    scraper.run()