import csv
import sys
import os
import ujson as json
import datetime
import argparse
"""
this will be a dictionary of the sources we've loaded.
since fec2json only deals with one form at a time it'll only
be one version, but to prevent re-loading each sked's headers
over and over, we'll cache the ones we've already loaded
in the FEC_SOURCES global
"""
FEC_SOURCES = {}
PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__))
CSV_FILE_DIRECTORY = '{}/fec-csv-sources'.format(PROJECT_ROOT)
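#for reference, a sketch of what the cache ends up holding once get_header_columns has run
#(the header names and column numbers here are made up; only the shape is meaningful):
#  FEC_SOURCES = {
#      'SchA': {'form_type': 1, 'contributor_last_name': 8, ...},
#  }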
def process_electronic_filing(path, filing_id=None, dump_full=True, fec_file=False):
    #if dump_full is true, you'll get the whole filing, and "itemizations"
    #will include all itemizations grouped by category;
    #otherwise "itemizations" will be an iterator
    filing_dict = {}
    with open(path, 'r', errors='replace') as f:
        if fec_file:
            #.fec files are delimited by the non-printable ASCII 28 (file separator) character
            reader = csv.reader(f, delimiter=chr(28))
        else:
            reader = csv.reader(f)
        fec_header = next(reader)
        fec_version_number = fec_header[2].strip()

        #these fields come from the first row of the fec file
        filing_dict['record_type'] = list_get(fec_header, 0)
        filing_dict['electronic_filing_type'] = list_get(fec_header, 1)
        filing_dict['fec_version_number'] = list_get(fec_header, 2)
        filing_dict['software_name'] = list_get(fec_header, 3)
        filing_dict['software_version'] = list_get(fec_header, 4)
        filing_dict['report_id'] = list_get(fec_header, 5)
        filing_dict['report_type'] = list_get(fec_header, 6)
        filing_dict['header_comment'] = list_get(fec_header, 7)

        summary_row = next(reader)
        processed_summary = process_summary_row(summary_row, fec_version_number)
        assert processed_summary, "Summary could not be processed"
        filing_dict.update(processed_summary)

    if filing_dict['amendment']:
        filing_dict['amends_filing'] = filing_dict['report_id'].replace('FEC-', '')
    else:
        filing_dict['amends_filing'] = None

    itemizations = itemization_iterator(path, filing_id, fec_version_number, fec_file=fec_file)
    if dump_full:
        filing_dict['itemizations'] = {}
        for itemization in itemizations:
            form_type = get_itemization_type(itemization.get('form_type'))
            if not form_type:
                form_type = get_itemization_type(itemization.get('rec_type'))
            if not form_type:
                continue
            if form_type not in filing_dict['itemizations']:
                filing_dict['itemizations'][form_type] = []
            filing_dict['itemizations'][form_type].append(itemization)
    else:
        filing_dict['itemizations'] = itemizations

    return filing_dict

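#a minimal usage sketch (the path is the same test file used in the commented-out block at the
#bottom of this module): with dump_full=True you get a plain dict ready to serialize; with
#dump_full=False, "itemizations" is a generator you have to consume yourself:
#
#  filing = process_electronic_filing('test_csvs/1205803.csv', dump_full=False)
#  for itemization in filing['itemizations']:
#      print(itemization['filing_id'], itemization.get('form_type'))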
def itemization_iterator(path, filing_id, fec_version_number, fec_file=False):
    with open(path, 'r', errors='replace') as f:
        if fec_file:
            #.fec files are delimited by the non-printable ASCII 28 (file separator) character
            reader = csv.reader(f, delimiter=chr(28))
        else:
            reader = csv.reader(f)

        #skip the header and summary rows; they're handled in process_electronic_filing
        fec_header = next(reader)
        summary_row = next(reader)

        while True:
            try:
                line = next(reader)
            except StopIteration:
                print("reached end of file")
                break
            except Exception:
                print('bad line')
                continue
            if line:
                form_type = get_itemization_type(line[0])
                if not form_type:
                    print('bad itemization line')
                    continue
                itemization = process_itemization_line(line, fec_version_number)
                if not itemization:
                    print('itemization failed, skipping')
                    continue
                if not filing_id:
                    try:
                        filing_id = path.strip('/').split('/')[-1].split('.')[0]
                    except Exception:
                        filing_id = None
                itemization['filing_id'] = filing_id
                yield itemization

def process_summary_row(summary_row, fec_version_number):
    #processes the second row of the filing, which is the form summary/topline row
    form_type = summary_row[0]
    if form_type.endswith('N'):
        amendment = False
        form = form_type.rstrip('N')
    elif form_type.endswith('A'):
        amendment = True
        form = form_type.rstrip('A')
    else:
        #a summary form type we don't recognize as either new or amended; bail out
        return None
    processed_fields = process_line(summary_row, fec_version_number, form)
    if processed_fields:
        processed_fields['amendment'] = amendment
        processed_fields['form'] = form
    return processed_fields

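#for example, a summary row whose first field is 'F3XN' comes back with amendment=False and
#form='F3X', while 'F3XA' comes back with amendment=True and form='F3X' (this just traces the
#endswith checks above)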
def process_itemization_line(line, fec_version_number):
    #processes a single itemization row
    form_type = get_itemization_type(line[0])
    if form_type:
        return process_line(line, fec_version_number, form_type)
    return None

def get_header_columns(fec_version_number, form_type):
    #if we haven't seen this form before, pull the correct version out of fec sources.
    #note that these files were written to be used with regex,
    #but this was fast and easy so voila.
    #(also you should see the old regex code!)
    #but I'll comment this carefully.

    #open the fec source for the relevant form
    try:
        f = open('{}/{}.csv'.format(CSV_FILE_DIRECTORY, form_type), 'r')
    except FileNotFoundError:
        print('could not find headers for form type {} in {}'.format(form_type, CSV_FILE_DIRECTORY))
        raise
    csv_headers = csv.reader(f)
    versions = next(csv_headers) #this top row lists the fec software versions

    i = 0
    while i < len(versions):
        version_list = versions[i].replace("^", "").split("|") #split the versions by pipe
        if fec_version_number in version_list:
            #if we find the version we're looking for, set the column number and get out of this loop
            col_number = i
            break
        i += 1
    else:
        #if we never break out of the loop, we end up here.
        #we should probably write better errors
        assert False, "unsupported version of fec file"

    header_to_col = {}
    #this is going to be a dictionary from header name to column number
    for line in csv_headers:
        try:
            value_column = int(line[col_number])
        except ValueError:
            #this takes care of the fact that fields for previous or new FEC versions
            #are in there with no number for the current version and isn't a concern
            continue
        header_to_col[line[0]] = value_column

    f.close() #let's get out of that file

    #add this dictionary to the global FEC_SOURCES dict so we only have to do this once per line type
    FEC_SOURCES[form_type] = header_to_col

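#rough sketch of the layout these fec-csv-sources files are expected to have (the values here are
#invented; only the structure matters to the parser above): the first row holds caret-wrapped,
#pipe-separated version groups, and each later row maps a header name to its 1-based column
#number for each version group:
#
#  canonical_name,^8.3|8.2^,^8.1^
#  form_type,1,1
#  contributor_last_name,8,9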
def process_line(line, fec_version_number, form_type):
    #for any line, find the headers for the form type and return the line as a header:value dict
    if form_type not in FEC_SOURCES:
        try:
            get_header_columns(fec_version_number, form_type)
        except FileNotFoundError:
            return None
    header_dict = FEC_SOURCES[form_type]
    processed_fields = {}
    for k, v in header_dict.items():
        try:
            #column numbers in the source files are 1-based, hence v-1
            processed_fields[k] = list_get(line, v-1) or None #turns blanks into Nones
        except IndexError:
            print(header_dict)
            print(line)
    return processed_fields

def get_itemization_type(line_type):
    if not line_type:
        return None
    #figure out the itemization type based on the FEC description of the line
    if line_type == "TEXT":
        return "TEXT"
    if line_type.startswith('SA3L'):
        return "SchA3L"
    if line_type.startswith('SC1'):
        return "SchC1"
    if line_type.startswith('SC2'):
        return "SchC2"
    if line_type.startswith('H'):
        return line_type
    if line_type.startswith('F'):
        return line_type
    return "Sch" + line_type[1]

def list_get(l, i, default=None):
    #like dict.get, but for a list: returns the item at index i, or the default if it doesn't exist
    return l[i] if i < len(l) else default

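#e.g. list_get(['a', 'b'], 1) returns 'b', but list_get(['a', 'b'], 5) returns None instead of
#raising IndexError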
def write_file(outpath, content):
    #eventually we'll probably want to make this write to S3 or google
    with open(outpath, 'w') as f:
        f.write(json.dumps(content, indent=2))

def main():
    #do some argparse stuff
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', help='path to the fec file we want to load')
    parser.add_argument('--fecfile', action='store_true', default=False, help="indicates we're using a .fec file instead of the fec's .csv file. .csv is the default and recommended for messy whitespace reasons")
    parser.add_argument('--filing_id', help='if not provided, assume the filing id is the filename minus the extension.')
    args = parser.parse_args()

    if args.fecfile:
        fec_file = True
        print("processing as .fec file")
    else:
        fec_file = False
        print('processing as .csv file')

    content = process_electronic_filing(args.path, args.filing_id, fec_file=fec_file)
    sys.stdout.write(json.dumps(content))

if __name__ == '__main__':
    main()

"""
start_time = datetime.datetime.now()
filing_dict = process_electronic_filing('test_csvs/1205803.csv')
write_file('test_csvs/output_test.json', filing_dict)
end_time = datetime.datetime.now()
time_diff = end_time-start_time
print("processing took {} seconds".format(time_diff.seconds))
"""