-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprocessDataBC.py
104 lines (93 loc) · 3.32 KB
/
processDataBC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import sys
import pandas as pd
import statistics as st
# Command-line entry: the first argument is the path of the training CSV.
# `process` records whether a path was supplied, so the CSV is only read
# when there is something to read.
process = False
try:
    trainingFile = sys.argv[1]
except IndexError:
    # sys.argv holds nothing at index 1 when the script is run without
    # arguments. Only that case is handled here; anything else (for example
    # KeyboardInterrupt) is allowed to propagate.
    print('Must provide Training CSV file as argument.')
else:
    process = True
if process:
    data = pd.read_csv(trainingFile)
def catReplace(f):
    """Fill placeholder categories in ``f`` with the most common real one.

    Entries equal to 'None', 'not reported' or 'stage x' are treated as
    placeholders and overwritten in place with the mode of the remaining
    entries.

    Args:
        f: Mutable, integer-indexable sequence of category labels; it must
            support ``len(f)``, ``f[i]`` and ``f[i] = value``.

    Returns:
        The same object ``f`` after in-place replacement. When every entry
        is a placeholder (or ``f`` is empty) there is nothing to impute
        from, and ``f`` is returned unchanged.
    """
    placeholders = ('None', 'not reported', 'stage x')
    # The mode is taken over real categories only. Counting placeholders as
    # well would let a frequent placeholder win and be "replaced" by itself.
    # Index-based access is kept on purpose so list and Series inputs are
    # read exactly the way they are written back below.
    known = [f[i] for i in range(len(f)) if f[i] not in placeholders]
    if not known:
        return f
    fill = st.mode(known)
    for i in range(len(f)):
        if f[i] in placeholders:
            f[i] = fill
    return f
def quantReplace(f):
    """Fill missing numeric entries of ``f`` with the median of the others.

    An entry is missing when it equals the string 'None' or is a NaN float.
    All other entries are converted with ``float()`` to compute the median,
    but are themselves left untouched in ``f``; only the missing positions
    are overwritten (with the float median).

    Args:
        f: Mutable, integer-indexable sequence; it must support ``len(f)``,
            ``f[i]`` and ``f[i] = value``.

    Returns:
        The same object ``f`` after in-place replacement. When no entry is
        usable (all missing, or ``f`` is empty) there is no median to
        compute and ``f`` is returned unchanged.

    Raises:
        ValueError: If a non-missing entry cannot be converted by ``float()``.
    """
    def is_missing(value):
        # NaN is the only value that compares unequal to itself; checking it
        # here keeps NaN out of the median and gets it filled like 'None'.
        return value == 'None' or value != value

    observed = [float(f[i]) for i in range(len(f)) if not is_missing(f[i])]
    if not observed:
        # statistics.median raises StatisticsError on empty data.
        return f
    med = st.median(observed)
    for i in range(len(f)):
        if is_missing(f[i]):
            f[i] = med
    return f
def quantRace(f):
    """Translate race labels into integer codes.

    Mapping: 'white' -> 0, 'asian' -> 1, 'black or african american' -> 2,
    'american indian or alaska native' -> 3.

    Args:
        f: Integer-indexable sequence of labels supporting ``len(f)`` and
            ``f[i]``.

    Returns:
        A new list with one element per entry of ``f``. A label outside the
        mapping is reported with a 'race warning: ...' line on stdout and
        encoded as ``None``, so the result stays aligned with ``f``.
    """
    codes = {
        'white': 0,
        'asian': 1,
        'black or african american': 2,
        'american indian or alaska native': 3,
    }
    qf = []
    for i in range(len(f)):
        label = f[i]
        if label in codes:
            qf.append(codes[label])
        else:
            # str() keeps the warning from raising on non-string values.
            print('race warning: ' + str(label))
            # Placeholder keeps len(qf) == len(f); without it every later
            # code would shift to the wrong row.
            qf.append(None)
    return qf
def quantEthnicity(f):
    """Translate ethnicity labels into integer codes.

    Mapping: 'not hispanic or latino' -> 0, 'hispanic or latino' -> 1.

    Args:
        f: Integer-indexable sequence of labels supporting ``len(f)`` and
            ``f[i]``.

    Returns:
        A new list with one element per entry of ``f``. A label outside the
        mapping is reported with an 'ethnicity warning: ...' line on stdout
        and encoded as ``None``, so the result stays aligned with ``f``.
    """
    codes = {
        'not hispanic or latino': 0,
        'hispanic or latino': 1,
    }
    qf = []
    for i in range(len(f)):
        label = f[i]
        if label in codes:
            qf.append(codes[label])
        else:
            # str() keeps the warning from raising on non-string values.
            print('ethnicity warning: ' + str(label))
            # Placeholder keeps len(qf) == len(f); without it every later
            # code would shift to the wrong row.
            qf.append(None)
    return qf
def quantDiagnose(f):
    """Return the entries of ``f`` with every 'C34.' substring removed.

    Args:
        f: Integer-indexable sequence of strings supporting ``len(f)`` and
            ``f[i]``.

    Returns:
        A new list of strings, one per entry of ``f``, in the same order.
    """
    return [f[position].replace('C34.', '') for position in range(len(f))]
def quantTumor(f):
    """Translate tumor-stage labels into integer codes.

    Mapping:
        'stage i', 'stage ia'     -> 0
        'stage ib'                -> 1
        'stage ii', 'stage iia'   -> 2
        'stage iib'               -> 3
        'stage iii', 'stage iiia' -> 4
        'stage iiib'              -> 5
        'stage iiic'              -> 6
        'stage iv', 'stage iva'   -> 7
        'stage ivb'               -> 8

    Args:
        f: Integer-indexable sequence of labels supporting ``len(f)`` and
            ``f[i]``.

    Returns:
        A new list with one element per entry of ``f``. A label outside the
        mapping is reported with a 'tumor warning: ...' line on stdout and
        encoded as ``None``, so the result stays aligned with ``f``.
    """
    codes = {
        'stage i': 0,
        'stage ia': 0,
        'stage ib': 1,
        'stage ii': 2,
        'stage iia': 2,
        'stage iib': 3,
        'stage iii': 4,
        'stage iiia': 4,
        'stage iiib': 5,
        'stage iiic': 6,
        'stage iva': 7,
        'stage iv': 7,
        'stage ivb': 8,
    }
    qf = []
    for i in range(len(f)):
        label = f[i]
        if label in codes:
            qf.append(codes[label])
        else:
            # str() keeps the warning from raising on non-string values.
            print('tumor warning: ' + str(label))
            # Placeholder keeps len(qf) == len(f); without it every later
            # code would shift to the wrong row.
            qf.append(None)
    # Length sanity check; kept so the script's stdout is unchanged.
    print(len(qf) == len(f))
    return qf
# Column clean-up and export. Everything here needs `data`, which only
# exists when a training file was supplied, hence the guard on `process`.
if process:
    # The helper functions assign into column objects element by element;
    # this turns off pandas' chained-assignment check for those writes.
    pd.options.mode.chained_assignment = None

    # Columns kept in the output file, in output order.
    fields_of_interest = [
        'year_of_birth',
        'race',
        'ethnicity',
        'tumor_stage',
        'age_at_diagnosis',
        'time',
        'survivalEstimate',
        'days_to_last_follow_up',
        'simple_somatic_mutations',
        'genes_with_simple_somatic_mutations',
    ]

    # Numeric columns: fill 'None' with the column median.
    # Categorical columns: fill placeholders with the mode, then encode.
    # 'time' and 'survivalEstimate' are passed through unchanged.
    data['year_of_birth'] = quantReplace(data['year_of_birth'])
    data['race'] = quantRace(catReplace(data['race']))
    data['ethnicity'] = quantEthnicity(catReplace(data['ethnicity']))
    data['tumor_stage'] = quantTumor(catReplace(data['tumor_stage']))
    data['age_at_diagnosis'] = quantReplace(data['age_at_diagnosis'])
    data['days_to_last_follow_up'] = quantReplace(data['days_to_last_follow_up'])
    data['simple_somatic_mutations'] = quantReplace(data['simple_somatic_mutations'])
    data['genes_with_simple_somatic_mutations'] = quantReplace(data['genes_with_simple_somatic_mutations'])

    data = data[fields_of_interest]
    data.to_csv('processedDataBC.csv')