-
Notifications
You must be signed in to change notification settings - Fork 0
/
inciweb.py
150 lines (140 loc) · 6.35 KB
/
inciweb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import feedparser
import urllib2
import httplib
import re
from bs4 import BeautifulSoup
from dateutil.parser import parse
from sqlalchemy.sql import exists
from orm_mapper import FireMap
from fires_orm import Fires
from rss_name_maps import rss_to_db
from Logger import my_logger
from config_methods import config_section_map
# Pull the InciWeb RSS endpoint from configuration and parse the feed.
inciweb_rss = config_section_map('inciweb')['rss_url']
inciweb = feedparser.parse(inciweb_rss)
# feedparser sets the "bozo" bit when the feed was malformed or unreachable;
# log it but keep going (entries may still be partially usable).
if inciweb.bozo:
    my_logger("could not open inciweb feed %s" % inciweb.bozo_exception)
# Open a database session (connection pool) for the upsert work below.
db = FireMap()
# Objects queued here are bulk-inserted at the end via session.add_all().
information_objects_list = []
# iterate over each incident
for idx, incident in enumerate(inciweb.entries):
# this is used to map our objects to the Fires class (the table metadata)
inciweb_details = Fires()
inciweb_details.__setattr__('source', 'inciweb')
# we don't want to collect data on prescribed burns
if incident.title.lower().find('wildfire') == -1:
continue
# Gather some elements from the RSS feed before opening the link and
# Scraping the web page for the remaining elements (columns)
for key, value in incident.iteritems():
formatted_key = rss_to_db(key)
if formatted_key:
if formatted_key == "inciweb_published_date":
value = parse(value).isoformat()
# Sometimes lat and lon are received as "-" or " "
# we need to check for those cases by trying to cast to float
# because they will cause errors when inserting into a Numeric column
if formatted_key == "lat" or formatted_key == "lon":
try:
inciweb_details.__setattr__(formatted_key, float(value))
except Exception:
value = None
inciweb_details.__setattr__(formatted_key, value)
else:
inciweb_details.__setattr__(formatted_key, value.encode("utf-8"))
if hasattr(incident, 'link'):
link = incident.link
try:
# the ID can be found at the end of the URL i.e https://inciweb.nwcg.gov/incident/5409/
# we only want the digits at the end - Using anchors to increase performance
inciweb_id = re.search('^.*/(\d+)/$', link).group(1)
inciweb_details.__setattr__('inciweb_id', inciweb_id.encode("utf-8"))
except AttributeError:
my_logger("Could not parse inciweb id %s" % link)
else:
continue
try:
page = urllib2.urlopen(link)
except urllib2.HTTPError, e:
my_logger('HTTPError = %s - %s' % (str(e.code), link))
continue
except urllib2.URLError, e:
my_logger('URLError = %s - %s' % (str(e.reason), link))
continue
except httplib.HTTPException, e:
my_logger('HTTPException - %s' % link)
continue
except Exception:
import traceback
my_logger('generic exception: ' + traceback.format_exc())
continue
parsed = BeautifulSoup(page, 'html.parser')
content_div = parsed.find_all('div', attrs={'id': 'content'})
if content_div is None:
my_logger("Could not find content div %s" % link)
continue
for tag in content_div:
tables = tag.find_all('table')
# Sometimes, for some odd reason, I can't find any tables in the content div (even though they are there)
# So I just have to search the whole document for tables then iterate over them.
if not tables:
tables = parsed.find_all('table')
elif tables is None:
my_logger("Could not find content tables %s" % link)
continue
for trTag in tables:
rows = trTag.find_all('tr')
if rows is None:
continue
for row in rows:
trLabel = row.find('th').get_text()
trValue = row.find('td')
# parse the javascript date script
if hasattr(trValue, 'contents') and trValue.contents[0].name == 'script':
try:
value = re.search('Date\(\"(.*?)\"\)', trValue.contents[0].text).group(1)
except AttributeError:
my_logger("Could not parse date %s" % link)
# continue to next table row
continue
formatted_key = rss_to_db(trLabel)
if formatted_key:
inciweb_details.__setattr__(formatted_key, value.encode("utf-8"))
else:
formatted_key = rss_to_db(trLabel)
if formatted_key:
if formatted_key == 'acres':
try:
# parse int from string
value = re.search(r'\d+(?:,\d+)?', trValue.text.replace(',', '')).group()
except Exception as e:
value = None
inciweb_details.__setattr__(formatted_key, value)
else:
inciweb_details.__setattr__(formatted_key, trValue.text.encode("utf-8"))
# check to see if this inciweb_id is already in the DB
if db.session.query(exists().where(Fires.inciweb_id == inciweb_details.inciweb_id)).scalar():
# UPDATE
# because we are dynamically iterating over all of the columns and updating the row with the most
# current information, we have to get the ID because that comes from the DB and can't insert Null
id = db.session.query(Fires).filter(Fires.inciweb_id == inciweb_details.inciweb_id).first().id
inciweb_details.__setattr__('id', id)
db.session.query(Fires).filter_by(inciweb_id=inciweb_details.inciweb_id).update(
{column: getattr(inciweb_details, column) for column in Fires.__table__.columns.keys()})
else:
# INSERT
information_objects_list.append(inciweb_details)
# Bulk-insert the queued new rows, then commit everything (inserts + updates).
try:
    db.session.add_all(information_objects_list)
except Exception as e:
    my_logger("Could not add Inciweb rows to DB")
    print(e)
try:
    db.session.commit()
except Exception as e:
    # BUG FIX: roll back on a failed commit so the pooled connection is not
    # left stuck in a broken transaction.
    db.session.rollback()
    my_logger("Could not commit DB session")
    print(e)
finally:
    # Always return the connection to the pool, even if commit blew up.
    db.session.close()