forked from tamslo/gims-parsing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetRawData.py
30 lines (26 loc) · 822 Bytes
/
getRawData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import pdfplumber
def _getGenotypeTable(pdfPath):
    """Return the first table in the PDF whose header row contains 'Genotyp'.

    Pages are scanned in order, and on each page the tables are checked in
    the order ``page.extract_tables()`` yields them. The first row of a
    table is treated as its header.

    Args:
        pdfPath: Passed straight to ``pdfplumber.open``.

    Returns:
        The first matching table (as returned by ``extract_tables``), or
        ``None`` when no table in the document matches.
    """
    with pdfplumber.open(pdfPath) as document:
        for pdfPage in document.pages:
            for candidate in pdfPage.extract_tables():
                # Membership test on the header row: for a list of cells
                # this matches a cell equal to 'Genotyp', not a substring.
                if 'Genotyp' in candidate[0]:
                    # Returning from inside the ``with`` still closes the PDF.
                    return candidate
    return None
def getRawData(pdfPath):
    """Extract per-gene diplotype and phenotype text from a PDF.

    Looks up the genotype table with ``_getGenotypeTable`` and, for every
    row after the header row, maps the row's first cell to the text of its
    second and third cells.

    Args:
        pdfPath: Path of the PDF; forwarded to ``_getGenotypeTable``.

    Returns:
        A dict keyed by the first cell of each data row. Each value is a
        dict with the keys ``'diplotype'`` (second cell) and ``'phenotype'``
        (third cell). If the same first cell appears in several rows, the
        last row wins. A table with only a header row yields ``{}``.

    Raises:
        ValueError: If no genotype table is found in the PDF.
        IndexError: If a data row has fewer than three cells.
    """
    genotypeTable = _getGenotypeTable(pdfPath)
    if genotypeTable is None:
        # Without this guard the slice below failed with an opaque
        # "'NoneType' object is not subscriptable" TypeError.
        raise ValueError(
            'No genotype table found in {!r}'.format(pdfPath)
        )
    # Row 0 is the header; every following row is (gene, diplotype, phenotype, ...).
    return {
        row[0]: {
            'diplotype': row[1],
            'phenotype': row[2]
        }
        for row in genotypeTable[1:]
    }