forked from molgenis/ngs-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
convertDoseGonl.py
executable file
·186 lines (147 loc) · 6.04 KB
/
convertDoseGonl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#this script converts .dose files formatted in beagle 'single dosage' format, value 0..2.00
#in this format the header has 2 columns per individual for fam+ind id
#each data row has only one column per individual
#using plink you can read this via plink --fam d.fam --dosage a.txt list format=1
#the a.txt file contains a list of all .dose files
#######################
#create argument parser
#######################
import optparse
parser = optparse.OptionParser(usage='usage: %prog -s subsfile -d dosagefile -o outputfile\nChoose option -h for extensive help')
parser.add_option('-s', '--subsetFile', metavar='FILE', help='tab delimited file with two columns defining selectedIds and pseudoIds')
parser.add_option('-d', '--doseFile', metavar='FILE', help='.dose file in beagle single dosage format, i.e. header: SNP A1 A2 F1 I1 F2 I2.. and data rs123 A T 1.99 0.11 (so value per 2 headers)')
parser.add_option('-o', '--outFile', metavar='FILE', help='where the output will be written to')
parser.add_option('-t', '--test', help='when this option is set to True then only 1000 lines will be converted',default=False)
(options, args) = parser.parse_args()
print options
if options.subsetFile==None or options.doseFile==None or options.outFile==None:
parser.print_help()
exit()
###################################################
#read subsetFile into two lists, of old and new ids
###################################################
import csv
# Python 2 csv: files are opened in binary mode ('rb')
csvReader = csv.reader(open(options.subsetFile, 'rb'), delimiter='\t')
selectedIds = []   # original individual ids to keep
pseudoIds = {}     # original id -> pseudo (anonymised) id
for row in csvReader:
    # NOTE(review): columns 1 and 3 are read, which implies a 4-column layout
    # (e.g. family/id pairs per side), but the -s help text says 'two columns'
    # - confirm the actual subsetFile format against a real input file
    selectedIds.append(row[1])
    pseudoIds[row[1]]=row[3]
if options.test:
    # in test mode, dump the whole id mapping for manual inspection
    for el in selectedIds:
        print el+'='+pseudoIds[el]
##########################################################################################
#iterate through dose file, and rename columns in first row remember what indexes to keep.
#expected header= SNP A1 A2 F1 I1 F2 I2 (one header cell per individual, holding 'FAMILY ID')
#expected data=rs1234 A T 1.99 0.69 (one column per individual)
##########################################################################################
# QUOTE_NONE + skipinitialspace: dosage fields are plain tab-separated numbers;
# each header cell beyond index 2 is expected to be 'FAMILY<space>ID' (see the
# partition(" ") calls below). The commented-out 'tab' logic was an abandoned
# attempt to also support tab-separated family/id pairs.
csvReader = csv.reader(open(options.doseFile, 'rb'), delimiter='\t', skipinitialspace=True,quoting=csv.QUOTE_NONE)
csvWriter = csv.writer(open(options.outFile, 'wb'), delimiter='\t')
count = 0                      # row counter; row 0 is the header
selectedColHeaders = [0,1,2]   # header column indexes kept (written but never read back in this script)
selectedCols = [0,1,2]         # data column indexes to keep; SNP, A1, A2 are always included
noElements = 0                 # number of columns in the header row
#check if a familySEPid is split by " " or "\t"
#tab = True
for row in csvReader:
    if count == 0:
        #if first row then find out what columns to keep and rename those with new ids
        print 'filtering data headers'
        noElements = len(row)
        idx = 0
        #select cols
        myheader = [row[0],row[1],row[2]]
        for col in row:
            #print 'testing col: '+col+' on idx '+str(idx)
            #[0,1,2] are always included
            if idx > 2:
                print 'business '+str(idx)
                #are the family and identifiers seperated by tab (skip=true) or space?
                #family = row[idx-1]
                #id = row[idx]
                #if " " in col:
                #    tab = False
                # split 'FAMILY ID' on the first space: family before, id after
                id = col.partition(" ")[2]
                family = col.partition(" ")[0]
                print 'testing id ['+id+']'
                if id in selectedIds:
                    #because of single dose format the data column idx is half of header column idx
                    #if tab:
                    #    selectedCols.append( 3 + (idx-3)/2)
                    #else:
                    selectedCols.append(idx)
                    print 'mapping '+id
                    #if tab:
                    #    selectedColHeaders.append(idx-1)
                    selectedColHeaders.append(idx)
                    # keep the family id, replace the individual id with its pseudo id
                    myheader.append(family + " " +pseudoIds[id])
                    #myheader.append(pseudoIds[id])
                else:
                    print 'skipping '+id
            idx = idx+1
            #progress monitoring
            if idx % 1000 == 0:
                print 'filtered header index: index'+str(idx)
        #write the header
        csvWriter.writerow(myheader)
        #debug info
        #if tab:
        #    print 'selected samples: '+str(len(selectedIds))+', available geno samples: '+str((idx-3)/2)+', filtered geno samples: '+str( (len(selectedCols)-3) /2 )
        #else:
        print 'selected samples: '+str(len(selectedIds))+', available geno samples: '+str(idx - 3)+', filtered geno samples: '+str(len(selectedCols) - 3)
    else:
        #filter the row into 'myvalues' and write to csv
        myvalues = []
        for col in selectedCols:
            myvalues.append(row[col])
        csvWriter.writerow(myvalues)
        #debug info
        if count % 1000 == 0:
            print 'converting row '+str(count)
    #for testing purpose you can break early
    # NOTE(review): the --test help text promises 1000 lines but this stops at 100 - confirm intent
    if options.test and count >= 100:
        break
    count=count+1
##############################################################################################
#QC: check if the headers in outfile match our selectedIds (and do not contain any previousIds)
##############################################################################################
print 'check of output whether all ids are properly converted'
import os
# re-open the file we just wrote and inspect only the header (and peek at row 1)
csvReader = csv.reader(open(options.outFile, 'rb'), delimiter='\t', skipinitialspace=True,quoting=csv.QUOTE_NONE)
foundPseudoIds = []                       # pseudo ids actually present in the output header
expectedPseudoIds = pseudoIds.values()    # every pseudo id from the subsetFile mapping
count = 0
for row in csvReader:
    if count == 0:
        #check headers
        idx=0
        for col in row:
            #for each col >2 should be pseudoIds
            if idx >2:
                #if not found we delete outfile and give error
                # header cell is 'FAMILY PSEUDOID'; take the part after the first space
                id = col.partition(" ")[2]
                if id not in expectedPseudoIds:
                    # NOTE(review): removal of the output is disabled in this branch
                    # but enabled in the original-id branch below - confirm which is intended
                    #os.remove(options.outFile)
                    print 'conversion FAILED: id \''+col+'\' not a pseudoId'
                    exit()
                #remember id so we can count
                foundPseudoIds.append(id)
            else:
                #verify no original ids are in there
                if col in selectedIds:
                    os.remove(options.outFile)
                    print 'conversion FAILED: id \''+col+'\' not a pseudoId'
                    exit()
            idx=idx+1
    if count == 1:
        #check values todo
        print 'have to check values still!'
    if count > 1:
        # only the header and the first data row are examined
        break
    count=count+1
#check for missing
# warn (but do not fail) for selected ids that never appeared in the dosage file header
if len(foundPseudoIds) != len(expectedPseudoIds):
    for key in pseudoIds.keys():
        if pseudoIds[key] not in foundPseudoIds:
            print 'WARNING: mapping '+key+'='+pseudoIds[key]+' not in dosage file'
print 'conversion completed'