1
1
import os
2
2
import glob
3
3
import csv
4
+ import json
5
+ import re
6
+
7
+ SKIP_FULL_TABLE_COVERAGE_CHECK = True
4
8
5
9
class Table :
6
10
@@ -13,6 +17,7 @@ def __init__(self, name):
13
17
self .mapped_fields = set ()
14
18
self .unmapped_fields = set ()
15
19
self .mapping = {}
20
+ self .flybase_id_re = re .compile ("^(\S+:)?(FB[a-zA-Z]{2}[0-9]{5,10})$" )
16
21
17
22
def set_header (self , header ):
18
23
self .header = [h .strip () for h in header ]
@@ -23,8 +28,16 @@ def set_header(self, header):
23
28
self .unmapped_fields .add (key )
24
29
assert len (self .unmapped_fields ) == len (self .header )
25
30
26
- def add_row (self , row ):
27
- assert len (self .header ) == len (row )
31
def process_row_value(self, v):
    """Normalize one raw cell value before it is stored in a row.

    Surrounding whitespace is removed first.  If the stripped text matches
    ``self.flybase_id_re``, only capture group 2 of that match is returned;
    otherwise the stripped text itself is returned.
    """
    stripped = v.strip()
    match = self.flybase_id_re.search(stripped)
    return stripped if match is None else match.group(2)
37
+
38
+ def add_row (self , pre_row ):
39
+ row = [self .process_row_value (value ) for value in pre_row ]
40
+ assert len (self .header ) == len (row ), f"header = { self .header } row = { row } "
28
41
self .rows .append (row )
29
42
for key , value in zip (self .header , row ):
30
43
if value :
@@ -45,10 +58,45 @@ def check_field_value(self, sql_table, sql_field, value):
45
58
tag = tuple ([key , value ])
46
59
sql_tag = tuple ([sql_table , sql_field ])
47
60
self .covered_by [key ][value ].add (sql_tag )
48
- if all (sql_tag in s for s in self .covered_by [key ].values ()):
49
- self .unmapped_fields .remove (key )
50
- self .mapped_fields .add (key )
51
- self .mapping [key ] = sql_tag
61
+ if not SKIP_FULL_TABLE_COVERAGE_CHECK :
62
+ if all (sql_tag in s for s in self .covered_by [key ].values ()):
63
+ self .unmapped_fields .remove (key )
64
+ self .mapped_fields .add (key )
65
+ self .mapping [key ] = sql_tag
66
+
67
def print_near_match(self):
    """Print candidate SQL columns for every field that is still unmapped.

    For each unmapped field, count how many of its values are covered by
    each ``(sql_table, sql_field)`` tag recorded in ``self.covered_by``.
    Every tag whose count divided by ``len(self.values[field])`` is at
    least 0.8 is printed as one line holding the ratio, this table's name,
    the field, the SQL table and the SQL field.
    """
    for key in self.unmapped_fields:
        hits_per_tag = {}
        for tags in self.covered_by[key].values():
            for sql_tag in tags:
                hits_per_tag[sql_tag] = hits_per_tag.get(sql_tag, 0) + 1
        for tag, hits in hits_per_tag.items():
            ratio = hits / len(self.values[key])
            if ratio < 0.8:
                continue
            table, field = tag
            print(f"{ratio} ;{self.name} ;{key} ;{table} ;{field} ")
79
+
80
def check_near_match(self):
    """Promote unmapped fields that are almost fully covered by one SQL tag.

    For each unmapped field, count per ``(sql_table, sql_field)`` tag how
    many of the field's values it covers, remembering the first tag to
    reach the highest count.  If that count is positive and at least 90%
    of ``len(self.values[field])``, the field is moved from
    ``unmapped_fields`` to ``mapped_fields`` and the tag is stored in
    ``self.mapping``.
    """
    promoted = []
    for key in self.unmapped_fields:
        hits_per_tag = {}
        best_hits, best_tag = 0, None
        for tags in self.covered_by[key].values():
            for sql_tag in tags:
                hits = hits_per_tag.get(sql_tag, 0) + 1
                hits_per_tag[sql_tag] = hits
                # Strict comparison: on ties, the tag that got there first wins.
                if hits > best_hits:
                    best_hits, best_tag = hits, sql_tag
        if best_hits > 0 and best_hits >= (0.9 * len(self.values[key])):
            promoted.append((key, best_tag))
    # Applied after the scan because the set cannot change while iterated.
    for key, tag in promoted:
        self.unmapped_fields.remove(key)
        self.mapped_fields.add(key)
        self.mapping[key] = tag
52
100
53
101
def all_fields_mapped(self):
    """Return True when ``self.unmapped_fields`` is empty."""
    return not self.unmapped_fields
@@ -64,6 +112,7 @@ def __init__(self, dir_name):
64
112
self .sql_tables = None
65
113
self .preloaded_mapping = False
66
114
os .chdir (dir_name )
115
+ # This is to output a tsv given an original mappings file (generated by sql_reader)
67
116
#if os.path.exists(f"{dir_name}/mapping.txt"):
68
117
# with open(f"{dir_name}/mapping.txt", "r") as f:
69
118
# for line in f:
@@ -79,8 +128,11 @@ def __init__(self, dir_name):
79
128
# continue
80
129
# table, field = tuple(pos.split())
81
130
# print("\t".join([fname, pre, table, field]))
131
+ for file_name in glob .glob ("ncRNA_genes_*.json" ):
132
+ with open (file_name ) as f :
133
+ json_dict = json .load (f )
134
+ self ._process_ncrna (json_dict )
82
135
for file_name in glob .glob ("*.tsv" ):
83
- #print(file_name)
84
136
table = Table (file_name )
85
137
self .unmapped_tables [file_name ] = table
86
138
self .all_tables .append (table )
@@ -152,6 +204,118 @@ def _process_tsv(self, file_name):
152
204
previous = row
153
205
#self.unmapped_tables[file_name].print_values()
154
206
207
def _process_ncrna(self, json_dict):
    """Flatten one parsed ncRNA JSON document into seven flat tables.

    Every entry of ``json_dict["data"]`` contributes one row to the main
    ``ncRNA_genes`` table and zero or more rows to the satellite tables
    (synonyms, cross references, related sequences, gene synonyms,
    publications, genome locations).  Each table is wrapped in a ``Table``,
    stored in ``self.unmapped_tables`` under its name and appended to
    ``self.all_tables``.

    The optional sections ``symbolSynonyms``, ``crossReferenceIds``,
    ``relatedSequences``, ``publications``, ``genomeLocations`` and
    ``gene["synonyms"]`` are skipped when absent.  ``url`` is accepted as a
    known key but is not copied into any table.

    Args:
        json_dict: parsed JSON document with a ``"data"`` list of entries.

    Raises:
        AssertionError: if an entry carries a key outside the known set.
        KeyError: if a mandatory key (``primaryId``, ``symbol``,
            ``sequence``, ``taxonId``, ``soTermId``, ``gene`` or one of
            the gene sub-keys) is missing.
    """
    known_keys = frozenset((
        "primaryId",
        "symbol",
        "sequence",
        "taxonId",
        "soTermId",
        "gene",
        "symbolSynonyms",
        "publications",
        "genomeLocations",
        "url",
        "crossReferenceIds",
        "relatedSequences",
    ))
    main_table_header = [
        "primaryId",
        "symbol",
        "sequence",
        "taxonId",
        "soTermId",
        "gene_geneId",
        "gene_symbol",
        "gene_locusTag",
    ]
    synonyms_table_header = ["symbol1", "symbol2"]
    cross_reference_table_header = ["symbol1", "symbol2"]
    related_sequences_table_header = ["primaryId", "sequenceId", "relationship"]
    gene_synonyms_table_header = ["symbol1", "symbol2"]
    publications_table_header = ["primaryId", "publication"]
    genome_locations_table_header = [
        "primaryId",
        "assembly",
        "gca_accession",
        "INSDC_accession",
        "chromosome",
        "strand",
        "startPosition",
        "endPosition",
    ]

    main_rows = []
    synonym_rows = []
    cross_reference_rows = []
    related_sequence_rows = []
    gene_synonym_rows = []
    publication_rows = []
    genome_location_rows = []

    for entry in json_dict["data"]:
        for key in entry:
            assert key in known_keys, f"Invalid key: {key} "

        fbid = entry["primaryId"]
        symbol = entry["symbol"]
        sequence = entry["sequence"]
        taxon_id = entry["taxonId"]
        so_term_id = entry["soTermId"]
        gene = entry["gene"]
        gene_id = gene["geneId"]
        gene_symbol = gene["symbol"]
        gene_locus_tag = gene["locusTag"]
        main_rows.append([
            fbid, symbol, sequence, taxon_id, so_term_id,
            gene_id, gene_symbol, gene_locus_tag,
        ])

        # Synonym-like pairs are stored in both orders.
        for synonym in entry.get("symbolSynonyms", ()):
            synonym_rows.append([symbol, synonym])
            synonym_rows.append([synonym, symbol])
        for cross_reference in entry.get("crossReferenceIds", ()):
            cross_reference_rows.append([symbol, cross_reference])
            cross_reference_rows.append([cross_reference, symbol])
        for related_sequence in entry.get("relatedSequences", ()):
            related_sequence_rows.append([
                fbid,
                related_sequence["sequenceId"],
                related_sequence["relationship"],
            ])
        for synonym in gene.get("synonyms", ()):
            gene_synonym_rows.append([gene_symbol, synonym])
            gene_synonym_rows.append([synonym, gene_symbol])
        for publication in entry.get("publications", ()):
            publication_rows.append([fbid, publication])
        # Treated as optional like the other list sections: an entry
        # without genome locations simply adds no location rows.
        for genome_location in entry.get("genomeLocations", ()):
            for exon in genome_location["exons"]:
                genome_location_rows.append([
                    fbid,
                    genome_location["assembly"],
                    genome_location["gca_accession"],
                    exon["INSDC_accession"],
                    exon["chromosome"],
                    exon["strand"],
                    # Positions are converted to text like every other cell.
                    str(exon["startPosition"]),
                    str(exon["endPosition"]),
                ])

    table_list = [
        ("ncRNA_genes", main_table_header, main_rows),
        ("ncRNA_genes_synonyms", synonyms_table_header, synonym_rows),
        ("ncRNA_genes_cross_references", cross_reference_table_header, cross_reference_rows),
        ("ncRNA_genes_related_sequences", related_sequences_table_header, related_sequence_rows),
        ("ncRNA_genes_gene_synonyms", gene_synonyms_table_header, gene_synonym_rows),
        ("ncRNA_genes_publications", publications_table_header, publication_rows),
        ("ncRNA_genes_genome_locations", genome_locations_table_header, genome_location_rows),
    ]
    for table_name, header, table_rows in table_list:
        table = Table(table_name)
        table.set_header(header)
        for table_row in table_rows:
            table.add_row(table_row)
        self.unmapped_tables[table_name] = table
        self.all_tables.append(table)
318
+
155
319
def set_sql_primary_key(self, sql_table, field):
    """Store ``field`` under ``sql_table`` in ``self.sql_primary_key``.

    Any entry already recorded for ``sql_table`` is overwritten.
    """
    primary_keys = self.sql_primary_key
    primary_keys[sql_table] = field
157
321
@@ -172,3 +336,26 @@ def get_relevant_sql_tables(self):
172
336
for table in self .all_tables :
173
337
answer = answer .union (table .get_relevant_sql_tables ())
174
338
return answer
339
+
340
def print_matched_tables(self):
    """Call ``print_near_match()`` on every table, mapped ones first.

    Tables in ``self.mapped_tables`` are visited before those in
    ``self.unmapped_tables``.
    """
    for registry in (self.mapped_tables, self.unmapped_tables):
        for table in registry.values():
            table.print_near_match()
345
+
346
def check_nearly_matched_tables(self):
    """Run the near-match check on unmapped tables and promote finished ones.

    ``check_near_match()`` is called on every table in
    ``self.unmapped_tables``.  Tables that then report
    ``all_fields_mapped()`` are moved to ``self.mapped_tables`` under the
    same key.
    """
    completed = []
    for name in self.unmapped_tables:
        candidate = self.unmapped_tables[name]
        candidate.check_near_match()
        if candidate.all_fields_mapped():
            completed.append(name)
    # Moved in a second pass so the dict is not resized while iterated.
    for name in completed:
        self.mapped_tables[name] = self.unmapped_tables.pop(name)
354
+
355
def get_table(self, table_name):
    """Look up a table by name, searching mapped tables first.

    Returns the entry from ``self.mapped_tables`` if present, otherwise
    the entry from ``self.unmapped_tables``, otherwise ``None``.
    """
    for registry in (self.mapped_tables, self.unmapped_tables):
        if table_name in registry:
            return registry[table_name]
    return None
0 commit comments