
Commit 1fa828f

Merge pull request singnet#182 from singnet/senna-tmp-flybase-1

Fixes column mapping from precomputed tables to SQL tables

2 parents e4a7473 + 8ffffc0

File tree

9 files changed: +318 -50 lines

das/canonical_parser.py

Lines changed: 8 additions & 2 deletions

@@ -24,6 +24,8 @@ def _file_line_count(file_name):
 HINT_FILE_SIZE = None
 TMP_DIR = '/tmp'
 #TMP_DIR = '/mnt/HD10T/nfs_share/work/tmp'
+SKIP_KEY_VALUE_FILES_GENERATION = False
+SKIP_PARSER = SKIP_KEY_VALUE_FILES_GENERATION or False
 
 class CanonicalParser:
 
@@ -220,8 +222,9 @@ def _populate_redis(self):
 
     def _process_key_value_files(self):
         logger().info(f"Populating Redis")
-        logger().info(f"Building key-value files")
-        self._build_key_value_files()
+        if not SKIP_KEY_VALUE_FILES_GENERATION:
+            logger().info(f"Building key-value files")
+            self._build_key_value_files()
         logger().info(f"Processing key-value files")
         self._populate_redis()
         logger().info(f"Redis is up to date")
@@ -301,6 +304,9 @@ def populate_indexes(self):
 
     def parse(self, path):
        logger().info(f"Parsing {path}")
+        if SKIP_PARSER:
+            logger().info(f"Skipping parser")
+            return
         logger().info(f"Computing file size")
         self.current_line_count = 1
         HINT_FILE_SIZE = _file_line_count(path)
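The two new module-level flags let a rerun reuse earlier outputs: SKIP_PARSER short-circuits parse(), and SKIP_KEY_VALUE_FILES_GENERATION, which also forces SKIP_PARSER on via the `or` expression, skips rebuilding the key-value files while still repopulating Redis. A minimal sketch of the gating, outside the real class (the input path is hypothetical):

# Sketch only: mirrors how the flags gate pipeline stages in CanonicalParser.
SKIP_KEY_VALUE_FILES_GENERATION = False
SKIP_PARSER = SKIP_KEY_VALUE_FILES_GENERATION or False  # skipping generation implies skipping the parser

def parse(path):
    if SKIP_PARSER:
        print("Skipping parser")
        return
    print(f"Parsing {path}")

def process_key_value_files():
    if not SKIP_KEY_VALUE_FILES_GENERATION:
        print("Building key-value files")
    print("Populating Redis")  # always runs, even when generation is skipped

parse("/tmp/flybase_input.txt")  # hypothetical path, for illustration only
process_key_value_files()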

environment

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 export DAS_REDIS_HOSTNAME=localhost
 export DAS_REDIS_PORT=6380
 # MongoDB variables
-export DAS_MONGODB_HOSTNAME=localhost
+export DAS_MONGODB_HOSTNAME=149.28.201.61
 export DAS_MONGODB_PORT=27018
 # Change the following values when running on a public instance
 export DAS_DATABASE_USERNAME=dbadmin

environment_das

Lines changed: 2 additions & 2 deletions

@@ -1,7 +1,7 @@
-export DAS_REDIS_HOSTNAME=149.28.192.132
+export DAS_REDIS_HOSTNAME=45.32.140.218
 export DAS_REDIS_PORT=7000
 
-export DAS_MONGODB_HOSTNAME=45.63.83.31
+export DAS_MONGODB_HOSTNAME=149.28.201.61
 export DAS_MONGODB_PORT=27018
 
 export DAS_DATABASE_USERNAME=dbadmin
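Both files repoint the services at new hosts: MongoDB moves to 149.28.201.61 in each, and Redis to 45.32.140.218 in environment_das. How the Python side consumes these variables is not shown in this diff; a plausible sketch, assuming plain os.environ lookups with the local-development values from "environment" as defaults:

import os

# Hypothetical consumer of the exported variables; not taken from the repo.
redis_host = os.environ.get("DAS_REDIS_HOSTNAME", "localhost")
redis_port = int(os.environ.get("DAS_REDIS_PORT", "6380"))
mongo_host = os.environ.get("DAS_MONGODB_HOSTNAME", "localhost")
mongo_port = int(os.environ.get("DAS_MONGODB_PORT", "27018"))

print(f"Redis at {redis_host}:{redis_port}, MongoDB at {mongo_host}:{mongo_port}")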

flybase2metta/precomputed_tables.py

Lines changed: 194 additions & 7 deletions

@@ -1,6 +1,10 @@
 import os
 import glob
 import csv
+import json
+import re
+
+SKIP_FULL_TABLE_COVERAGE_CHECK = True
 
 class Table:
 
@@ -13,6 +17,7 @@ def __init__(self, name):
         self.mapped_fields = set()
         self.unmapped_fields = set()
         self.mapping = {}
+        self.flybase_id_re = re.compile("^(\S+:)?(FB[a-zA-Z]{2}[0-9]{5,10})$")
 
     def set_header(self, header):
         self.header = [h.strip() for h in header]
@@ -23,8 +28,16 @@ def set_header(self, header):
             self.unmapped_fields.add(key)
         assert len(self.unmapped_fields) == len(self.header)
 
-    def add_row(self, row):
-        assert len(self.header) == len(row)
+    def process_row_value(self, v):
+        v = v.strip()
+        m = self.flybase_id_re.search(v)
+        if m is not None:
+            v = m.group(2)
+        return v
+
+    def add_row(self, pre_row):
+        row = [self.process_row_value(value) for value in pre_row]
+        assert len(self.header) == len(row), f"header = {self.header} row = {row}"
         self.rows.append(row)
         for key, value in zip(self.header, row):
             if value:
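The new process_row_value normalizes every cell before it is stored, stripping an optional "prefix:" from values that look like FlyBase IDs so that prefixed and bare forms compare equal during mapping. The regex in isolation (a raw string is used here for idiomatic escaping; the example values are invented):

import re

flybase_id_re = re.compile(r"^(\S+:)?(FB[a-zA-Z]{2}[0-9]{5,10})$")

for v in ["FLYBASE:FBgn0000490", "FBgn0000490", "  FBtr0070000 ", "not-an-id"]:
    v = v.strip()
    m = flybase_id_re.search(v)
    print(v, "->", m.group(2) if m else v)
# FLYBASE:FBgn0000490 -> FBgn0000490
# FBgn0000490 -> FBgn0000490
# FBtr0070000 -> FBtr0070000
# not-an-id -> not-an-id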
@@ -45,10 +58,45 @@ def check_field_value(self, sql_table, sql_field, value):
         tag = tuple([key, value])
         sql_tag = tuple([sql_table, sql_field])
         self.covered_by[key][value].add(sql_tag)
-        if all(sql_tag in s for s in self.covered_by[key].values()):
-            self.unmapped_fields.remove(key)
-            self.mapped_fields.add(key)
-            self.mapping[key] = sql_tag
+        if not SKIP_FULL_TABLE_COVERAGE_CHECK:
+            if all(sql_tag in s for s in self.covered_by[key].values()):
+                self.unmapped_fields.remove(key)
+                self.mapped_fields.add(key)
+                self.mapping[key] = sql_tag
+
+    def print_near_match(self):
+        for key in self.unmapped_fields:
+            tag_count = {}
+            for value in self.covered_by[key]:
+                for sql_tag in self.covered_by[key][value]:
+                    if sql_tag not in tag_count:
+                        tag_count[sql_tag] = 0
+                    tag_count[sql_tag] += 1
+            for tag in tag_count:
+                if (tag_count[tag] / len(self.values[key])) >= 0.8:
+                    table, field = tag
+                    print(f"{(tag_count[tag] / len(self.values[key]))};{self.name};{key};{table};{field}")
+
+    def check_near_match(self):
+        finished = []
+        for key in self.unmapped_fields:
+            tag_count = {}
+            max_count = 0
+            max_tag = None
+            for value in self.covered_by[key]:
+                for sql_tag in self.covered_by[key][value]:
+                    if sql_tag not in tag_count:
+                        tag_count[sql_tag] = 0
+                    tag_count[sql_tag] += 1
+                    if tag_count[sql_tag] > max_count:
+                        max_count = tag_count[sql_tag]
+                        max_tag = sql_tag
+            if max_count > 0 and max_count >= (0.9 * len(self.values[key])):
+                finished.append(tuple([key, max_tag]))
+        for key, tag in finished:
+            self.unmapped_fields.remove(key)
+            self.mapped_fields.add(key)
+            self.mapping[key] = tag
 
     def all_fields_mapped(self):
         return len(self.unmapped_fields) == 0
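With SKIP_FULL_TABLE_COVERAGE_CHECK = True, the strict rule (every distinct value of a column covered by the same SQL column) is disabled and check_near_match takes over: a precomputed column is mapped to the SQL (table, field) pair covering at least 90% of its distinct values, while print_near_match merely reports pairs above 80%. A toy run of the counting logic, standalone and with invented SQL table/field names, assuming each distinct value is seen once:

# covered_by[value] = set of (sql_table, sql_field) pairs containing that value;
# here 9 of 10 distinct values are covered by the same pair.
covered_by = {f"v{i}": {("feature", "uniquename")} for i in range(9)}
covered_by["v9"] = {("pub", "uniquename")}

tag_count = {}
for value in covered_by:
    for sql_tag in covered_by[value]:
        tag_count[sql_tag] = tag_count.get(sql_tag, 0) + 1

best_tag, best_count = max(tag_count.items(), key=lambda kv: kv[1])
if best_count >= 0.9 * len(covered_by):
    print(f"column maps to {best_tag}")  # column maps to ('feature', 'uniquename')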
@@ -64,6 +112,7 @@ def __init__(self, dir_name):
         self.sql_tables = None
         self.preloaded_mapping = False
         os.chdir(dir_name)
+        # This is to output a tsv given an original mappings file (generated by sql_reader)
         #if os.path.exists(f"{dir_name}/mapping.txt"):
         #    with open(f"{dir_name}/mapping.txt", "r") as f:
         #        for line in f:
@@ -79,8 +128,11 @@ def __init__(self, dir_name):
         #                continue
         #            table, field = tuple(pos.split())
         #            print("\t".join([fname, pre, table, field]))
+        for file_name in glob.glob("ncRNA_genes_*.json"):
+            with open(file_name) as f:
+                json_dict = json.load(f)
+                self._process_ncrna(json_dict)
         for file_name in glob.glob("*.tsv"):
-            #print(file_name)
             table = Table(file_name)
             self.unmapped_tables[file_name] = table
             self.all_tables.append(table)
@@ -152,6 +204,118 @@ def _process_tsv(self, file_name):
             previous = row
         #self.unmapped_tables[file_name].print_values()
 
+    def _process_ncrna(self, json_dict):
+        known_keys = [
+            "primaryId",
+            "symbol",
+            "sequence",
+            "taxonId",
+            "soTermId",
+            "gene",
+            "symbolSynonyms",
+            "publications",
+            "genomeLocations",
+            "url",
+            "crossReferenceIds",
+            "relatedSequences",
+        ]
+        main_table_header = [
+            "primaryId",
+            "symbol",
+            "sequence",
+            "taxonId",
+            "soTermId",
+            "gene_geneId",
+            "gene_symbol",
+            "gene_locusTag"
+        ]
+        main_table_rows = []
+        synonyms_table_header = ["symbol1", "symbol2"]
+        synomyms_table_rows = []
+        cross_reference_table_header = ["symbol1", "symbol2"]
+        cross_reference_table_rows = []
+        related_sequences_table_header = ["primaryId", "sequenceId", "relationship"]
+        related_sequences_table_rows = []
+        gene_synonyms_table_header = ["symbol1", "symbol2"]
+        gene_synomyms_table_rows = []
+        publications_table_header = ["primaryId", "publication"]
+        publications_table_rows = []
+        genome_locations_table_header = [
+            "primaryId",
+            "assembly",
+            "gca_accession",
+            "INSDC_accession",
+            "chromosome",
+            "strand",
+            "startPosition",
+            "endPosition"
+        ]
+        genome_locations_table_rows = []
+        for row in json_dict["data"]:
+            for key in row:
+                assert key in known_keys, f"Invalid key: {key}"
+
+            #fbid = row["primaryId"].split(":")[1]
+            fbid = row["primaryId"]
+            symbol = row["symbol"]
+            sequence = row["sequence"]
+            taxonid = row["taxonId"]
+            sotermid = row["soTermId"]
+            gene_geneid = row["gene"]["geneId"]
+            gene_symbol = row["gene"]["symbol"]
+            gene_locustag = row["gene"]["locusTag"]
+            main_table_rows.append([
+                fbid, symbol, sequence, taxonid, sotermid,
+                gene_geneid, gene_symbol, gene_locustag])
+            if "symbolSynonyms" in row:
+                for synonym in row["symbolSynonyms"]:
+                    synomyms_table_rows.append([symbol, synonym])
+                    synomyms_table_rows.append([synonym, symbol])
+            if "crossReferenceIds" in row:
+                for cross_reference in row["crossReferenceIds"]:
+                    cross_reference_table_rows.append([symbol, cross_reference])
+                    cross_reference_table_rows.append([cross_reference, symbol])
+            if "relatedSequences" in row:
+                for related_sequence in row["relatedSequences"]:
+                    related_sequences_table_rows.append([
+                        fbid,
+                        related_sequence["sequenceId"],
+                        related_sequence["relationship"]])
+            if "synonyms" in row["gene"]:
+                for synonym in row["gene"]["synonyms"]:
+                    gene_synomyms_table_rows.append([gene_symbol, synonym])
+                    gene_synomyms_table_rows.append([synonym, gene_symbol])
+            if "publications" in row:
+                for publication in row["publications"]:
+                    publications_table_rows.append([fbid, publication])
+            for genome_location in row["genomeLocations"]:
+                for exon in genome_location["exons"]:
+                    genome_locations_table_rows.append([
+                        fbid,
+                        genome_location["assembly"],
+                        genome_location["gca_accession"],
+                        exon["INSDC_accession"],
+                        exon["chromosome"],
+                        exon["strand"],
+                        str(exon["startPosition"]),
+                        str(exon["endPosition"])])
+        table_list = [
+            ("ncRNA_genes", main_table_header, main_table_rows),
+            ("ncRNA_genes_synonyms", synonyms_table_header, synomyms_table_rows),
+            ("ncRNA_genes_cross_references", cross_reference_table_header, cross_reference_table_rows),
+            ("ncRNA_genes_related_sequences", related_sequences_table_header, related_sequences_table_rows),
+            ("ncRNA_genes_gene_synonyms", gene_synonyms_table_header, gene_synomyms_table_rows),
+            ("ncRNA_genes_publications", publications_table_header, publications_table_rows),
+            ("ncRNA_genes_genome_locations", genome_locations_table_header, genome_locations_table_rows)
+        ]
+        for table_name, header, rows in table_list:
+            table = Table(table_name)
+            table.set_header(header)
+            for row in rows:
+                table.add_row(row)
+            self.unmapped_tables[table_name] = table
+            self.all_tables.append(table)
+
     def set_sql_primary_key(self, sql_table, field):
         self.sql_primary_key[sql_table] = field
 
@@ -172,3 +336,26 @@ def get_relevant_sql_tables(self):
         for table in self.all_tables:
             answer = answer.union(table.get_relevant_sql_tables())
         return answer
+
+    def print_matched_tables(self):
+        for table in self.mapped_tables.values():
+            table.print_near_match()
+        for table in self.unmapped_tables.values():
+            table.print_near_match()
+
+    def check_nearly_matched_tables(self):
+        finished = []
+        for key, table in self.unmapped_tables.items():
+            table.check_near_match()
+            if table.all_fields_mapped():
+                finished.append(key)
+        for key in finished:
+            self.mapped_tables[key] = self.unmapped_tables.pop(key)
+
+    def get_table(self, table_name):
+        if table_name in self.mapped_tables:
+            return self.mapped_tables[table_name]
+        elif table_name in self.unmapped_tables:
+            return self.unmapped_tables[table_name]
+        else:
+            return None
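The new public methods suggest a calling sequence on the table-collection class: run the coverage checks, promote near-matched tables, then look tables up by name. A hedged sketch (the enclosing class name is not visible in this diff, so `tables` below is a stand-in instance):

# Hypothetical driver; `tables` is an instance of the class defined in
# precomputed_tables.py whose name lies outside this diff's hunks.
tables.check_nearly_matched_tables()   # promote columns with >= 90% coverage
tables.print_matched_tables()          # report candidate pairs with >= 80% coverage
table = tables.get_table("ncRNA_genes")
if table is not None and table.all_fields_mapped():
    print(table.mapping)               # {column: (sql_table, sql_field), ...}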
