-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdfparser.py
150 lines (141 loc) · 5.58 KB
/
pdfparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from collections import OrderedDict
import hashlib
from os import listdir
from os.path import isfile, join
import sys
import csv
import os
def extractFirstPageContent(extractedText):
    """Parse the first-page text of a report into per-group key/value tables.

    The text is scanned line by line (each line stripped) with a small
    state machine:

    * ``'DV PROTECTIVE'`` switches key collection on.
    * Any group header (``'DV PROTECTIVE'``, ``'JUVENILE PEACE'``,
      ``'PEACE'``) starts a fresh ``OrderedDict`` for that group and
      remembers the order in which groups appeared.
    * ``'Count'`` ends key collection and starts value collection for the
      next remembered group (first ``'Count'`` -> first group, and so on).
    * ``'Total All:'`` ends value collection.
    * While collecting values, each line fills the first key of the current
      group that still holds ``None``; extra lines are ignored.
    * While collecting keys, each line except ``'Sex'`` becomes a key whose
      value is ``None`` until a value is assigned.

    Args:
        extractedText: Newline-separated text of the page.

    Returns:
        A dict mapping each group header seen to an ``OrderedDict`` of
        ``key -> value`` where the value is the raw text line, or ``None``
        if no value was found for that key.
    """
    group_names = ('DV PROTECTIVE', 'JUVENILE PEACE', 'PEACE')
    po_dict = {}
    groups_seen = []      # group headers in order of appearance
    next_group = 0        # index into groups_seen for the next 'Count' section
    current_group = ''
    reading_keys = False
    reading_values = False

    for raw_line in extractedText.split("\n"):
        line = raw_line.strip()

        # Deliberately a separate `if`: the header line must also fall
        # through to the group-header branch below so it gets recorded.
        # NOTE(review): key collection only starts at 'DV PROTECTIVE'; a page
        # without that header yields groups with no keys -- confirm intended.
        if line == 'DV PROTECTIVE':
            reading_keys = True

        if line in group_names:
            current_group = line
            groups_seen.append(line)
            po_dict[current_group] = OrderedDict()
        elif line == 'Count':
            reading_keys = False
            if next_group < len(groups_seen):
                current_group = groups_seen[next_group]
                next_group += 1
                reading_values = True
            else:
                # A 'Count' section with no matching group header used to
                # raise IndexError; there is nowhere to store its values, so
                # skip them instead.
                reading_values = False
        elif line == 'Total All:':
            reading_values = False
        elif reading_values:
            # Assign this line to the first key that has no value yet.
            group = po_dict[current_group]
            for key, value in group.items():
                if value is None:
                    group[key] = line
                    break
        elif line == 'Sex':
            continue
        elif reading_keys:
            po_dict[current_group][line] = None

    return po_dict
def getTextFromFirstPage(filename):
    """Return the concatenated text of the first page of a PDF file.

    Only the first page yielded by the document is processed. The text of
    every ``LTTextBox`` / ``LTTextLine`` layout object on that page is
    concatenated in layout order.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The extracted text, or ``''`` if the document yields no pages or the
        first page has no text objects.
    """
    # The with-block guarantees the handle is closed even if pdfminer raises
    # while parsing; parsing must finish inside it because pages are read
    # from the open file.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        text_parts = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            text_parts.extend(
                lt_obj.get_text()
                for lt_obj in layout
                if isinstance(lt_obj, (LTTextBox, LTTextLine))
            )
            # Only the first page is wanted.
            break

    return ''.join(text_parts)
def appendToCSV(dvdata, county, year, month, results_dir='Results'):
    """Append one row per order type to that type's CSV file.

    There is one CSV per order type (``'DV PROTECTIVE'``,
    ``'JUVENILE PEACE'``, ``'PEACE'``), named ``<type>.csv`` inside
    *results_dir*. A file that does not exist yet is created with a header
    row first. A type missing from *dvdata* gets a row of zeros, so every
    county/month is present in every file.

    Args:
        dvdata: Mapping of order type -> mapping with optional ``'MALE'``,
            ``'FEMALE'`` and ``'UNKNOWN'`` counts (anything ``int()``
            accepts). It is not modified.
        county: Value written to the County column.
        year: Value written to the Year column.
        month: Value written to the Month column.
        results_dir: Folder holding the CSV files; created if missing.

    Raises:
        ValueError, TypeError: If a present count cannot be converted by
            ``int()``.
    """
    dvtypes = ('DV PROTECTIVE', 'JUVENILE PEACE', 'PEACE')
    header = ['Year', 'County', 'Month', 'Male', 'Female', 'Unknown', 'Total']

    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    for dvtype in dvtypes:
        csv_path = os.path.join(results_dir, dvtype + '.csv')
        needs_header = not os.path.exists(csv_path)

        if dvtype in dvdata:
            counts = dvdata[dvtype]
            # .get() rather than .setdefault(): reading the counts must not
            # insert keys into the caller's data.
            male = counts.get('MALE', 0)
            female = counts.get('FEMALE', 0)
            unknown = counts.get('UNKNOWN', 0)
            total = int(male) + int(female) + int(unknown)
            row = [year, county, month, male, female, unknown, total]
        else:
            # Zeros for months where this order type has no results.
            row = [year, county, month, 0, 0, 0, 0]

        # newline='' lets the csv module control line endings itself.
        with open(csv_path, 'a', newline='') as fd:
            csvwriter = csv.writer(fd, delimiter=',', quotechar='|',
                                   quoting=csv.QUOTE_MINIMAL)
            if needs_header:
                csvwriter.writerow(header)
            csvwriter.writerow(row)
if __name__ == '__main__':
    # Walk DVCases/<county>/ for every county below, parse the first page of
    # each report from 2018 onwards, and append the counts to the result CSVs.
    counties = (
        'Allegany',
        'Carroll',
        'Harford',
        'Saint_Marys',
        'Anne_Arundel',
        'Cecil',
        'Howard',
        'Somerset',
        'Baltimore_City',
        'Charles',
        'Kent',
        'Talbot',
        'Baltimore',
        'Dorchester',
        'Montgomery',
        'Washington',
        'Calvert',
        'Frederick',
        'Prince_Georges',
        'Wicomico',
        'Caroline',
        'Garrett',
        'Queen_Annes',
        'Worcester',
    )
    for county in counties:
        # join() instead of hard-coded backslashes, so the script is not
        # tied to Windows path separators.
        path = join('DVCases', county)
        onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
        for filename in onlyfiles:
            print(filename)
            # Month: the two characters before the 4-character extension,
            # minus a leading '_' for single-digit months.
            month = filename[-6:-4].strip("_")
            # Year: the second-to-last '_'-separated part of the name.
            year = filename.split('_')[-2]
            # The format changed starting in 2017 and the parser only works
            # for 2018 and above.
            if int(year) >= 2018:
                text = getTextFromFirstPage(join(path, filename))
                po = extractFirstPageContent(text)
                appendToCSV(po, county, year, month)