-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathadaptRawData.py
32 lines (28 loc) · 1.1 KB
/
adaptRawData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import re
def _adaptDiplotype(diplotypeText, phenotypeText):
    """Normalize a raw diplotype string into a list of diplotype strings.

    Steps, in order:
      1. Every 'wildtype' is rewritten to '*1'.
      2. Every ', xN' is rewritten to 'xN' (the suffix is glued to the
         preceding text).
      3. If the result is exactly 'X', the first '*<digits>/*<digits>'
         occurrence in ``phenotypeText`` is used instead. If the phenotype
         text contains no such pattern, 'X' is kept unchanged rather than
         failing with an AttributeError on a missing match.
      4. The text is split on ' or ' so alternatives become separate items.

    Args:
        diplotypeText: Raw diplotype text for one gene.
        phenotypeText: Raw phenotype text for the same gene; only consulted
            when the diplotype text is exactly 'X'.

    Returns:
        A list of strings with at least one element.
    """
    # str.replace is already a no-op when the substring is absent, so no
    # membership checks are needed.
    diplotypeText = diplotypeText.replace('wildtype', '*1')
    diplotypeText = diplotypeText.replace(', xN', 'xN')

    if diplotypeText == 'X':
        # NOTE(review): 'X' looks like a placeholder meaning "the diplotype is
        # embedded in the phenotype text" - confirm against the data source.
        # Raw string: '\*' and '\d' are invalid escapes in a normal literal.
        match = re.search(r'\*\d+\/\*\d+', phenotypeText)
        if match is not None:
            diplotypeText = match.group(0)

    # split() returns a one-element list when the separator is absent, which
    # covers the single-diplotype case.
    return diplotypeText.split(' or ')
def _adaptPhenotype(gene, phenotypeText):
    """Normalize whitespace in a phenotype string and drop the gene name.

    Every run of whitespace (newlines, repeated spaces, tabs, non-breaking
    spaces) is collapsed to a single space. If the text contains
    'METABOLIZER', every occurrence of ``gene`` is removed, whitespace is
    collapsed again (removing the gene can leave a double space behind) and
    the result is stripped of leading/trailing whitespace.

    Args:
        gene: Gene name to remove from metabolizer phenotypes.
        phenotypeText: Raw phenotype text for that gene.

    Returns:
        The normalized phenotype string. Texts without 'METABOLIZER' are
        whitespace-collapsed but not stripped.
    """
    phenotypeText = re.sub(r'\s+', ' ', phenotypeText)
    if 'METABOLIZER' in phenotypeText:
        withoutGene = phenotypeText.replace(gene, '')
        # Collapse again: cutting the gene out of the middle of the text
        # leaves the spaces on both sides of it adjacent.
        phenotypeText = re.sub(r'\s+', ' ', withoutGene).strip()
    return phenotypeText
def adaptRawData(rawData):
    """Build the adapted per-gene record for every gene in ``rawData``.

    Args:
        rawData: Mapping of gene name to a record that provides the keys
            'diplotype' and 'phenotype'.

    Returns:
        A new dict keyed by the same gene names. Each value is a dict with
        'diplotype' (the result of ``_adaptDiplotype``) and 'phenotype'
        (the result of ``_adaptPhenotype``).
    """
    def _convert(geneName):
        # Read 'diplotype' before 'phenotype' so a record missing both keys
        # reports the same missing key first.
        record = rawData[geneName]
        rawDiplotype = record['diplotype']
        rawPhenotype = record['phenotype']
        return {
            'diplotype': _adaptDiplotype(rawDiplotype, rawPhenotype),
            'phenotype': _adaptPhenotype(geneName, rawPhenotype),
        }

    return {geneName: _convert(geneName) for geneName in rawData}