Merge pull request singnet#182 from singnet/senna-tmp-flybase-1
Fixes column mapping from precomputed tables to SQL tables
andre-senna authored May 24, 2023
2 parents e4a7473 + 8ffffc0 commit 1fa828f
Showing 9 changed files with 318 additions and 50 deletions.
10 changes: 8 additions & 2 deletions das/canonical_parser.py
@@ -24,6 +24,8 @@ def _file_line_count(file_name):
HINT_FILE_SIZE = None
TMP_DIR = '/tmp'
#TMP_DIR = '/mnt/HD10T/nfs_share/work/tmp'
SKIP_KEY_VALUE_FILES_GENERATION = False
SKIP_PARSER = SKIP_KEY_VALUE_FILES_GENERATION or False

class CanonicalParser:

@@ -220,8 +222,9 @@ def _process_key_value_files(self):

def _process_key_value_files(self):
logger().info(f"Populating Redis")
logger().info(f"Building key-value files")
self._build_key_value_files()
if not SKIP_KEY_VALUE_FILES_GENERATION:
logger().info(f"Building key-value files")
self._build_key_value_files()
logger().info(f"Processing key-value files")
self._populate_redis()
logger().info(f"Redis is up to date")
@@ -301,6 +304,9 @@ def populate_indexes(self):

def parse(self, path):
logger().info(f"Parsing {path}")
if SKIP_PARSER:
logger().info(f"Skipping parser")
return
logger().info(f"Computing file size")
self.current_line_count = 1
HINT_FILE_SIZE = _file_line_count(path)
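The two new flags act as crude checkpoints for re-running a long import: SKIP_PARSER is forced true whenever SKIP_KEY_VALUE_FILES_GENERATION is set (the trailing `or False` is the manual toggle), so a resumed run can jump straight to the Redis load. Below is a minimal sketch of that control flow; the print calls are hypothetical stand-ins for the real CanonicalParser stages, not code from this repository. (Note in passing that parse() assigns HINT_FILE_SIZE without a global statement, so the module-level hint at the top of the file appears never to be updated.)

    # Sketch only: stand-ins for parse -> build key-value files -> populate Redis.
    SKIP_KEY_VALUE_FILES_GENERATION = True  # resume: key-value files already on disk
    SKIP_PARSER = SKIP_KEY_VALUE_FILES_GENERATION or False  # parser is then skipped too

    def run_pipeline(path):
        if not SKIP_PARSER:
            print(f"Parsing {path}")           # expensive first stage
        if not SKIP_KEY_VALUE_FILES_GENERATION:
            print("Building key-value files")  # skipped on resume
        print("Populating Redis")              # always runs

    run_pipeline("/tmp/flybase.metta")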
2 changes: 1 addition & 1 deletion environment
@@ -2,7 +2,7 @@
export DAS_REDIS_HOSTNAME=localhost
export DAS_REDIS_PORT=6380
# MongoDB variables
-export DAS_MONGODB_HOSTNAME=localhost
+export DAS_MONGODB_HOSTNAME=149.28.201.61
export DAS_MONGODB_PORT=27018
# Change the following values when running on a public instance
export DAS_DATABASE_USERNAME=dbadmin
4 changes: 2 additions & 2 deletions environment_das
@@ -1,7 +1,7 @@
-export DAS_REDIS_HOSTNAME=149.28.192.132
+export DAS_REDIS_HOSTNAME=45.32.140.218
export DAS_REDIS_PORT=7000

-export DAS_MONGODB_HOSTNAME=45.63.83.31
+export DAS_MONGODB_HOSTNAME=149.28.201.61
export DAS_MONGODB_PORT=27018

export DAS_DATABASE_USERNAME=dbadmin
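The consuming side of these variables is not part of this diff. A sketch of how a script might read them after sourcing one of the environment files; the pymongo usage in the comment is an assumption, not the repository's actual connection code.

    import os

    # Assumes the shell has sourced environment or environment_das first.
    mongo_host = os.environ["DAS_MONGODB_HOSTNAME"]   # 149.28.201.61 after this change
    mongo_port = int(os.environ["DAS_MONGODB_PORT"])  # 27018
    redis_host = os.environ["DAS_REDIS_HOSTNAME"]
    redis_port = int(os.environ["DAS_REDIS_PORT"])

    # e.g., with pymongo installed:
    # from pymongo import MongoClient
    # client = MongoClient(mongo_host, mongo_port)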
201 changes: 194 additions & 7 deletions flybase2metta/precomputed_tables.py
@@ -1,6 +1,10 @@
import os
import glob
import csv
import json
import re

SKIP_FULL_TABLE_COVERAGE_CHECK = True

class Table:

@@ -13,6 +17,7 @@ def __init__(self, name):
self.mapped_fields = set()
self.unmapped_fields = set()
self.mapping = {}
        self.flybase_id_re = re.compile(r"^(\S+:)?(FB[a-zA-Z]{2}[0-9]{5,10})$")

def set_header(self, header):
self.header = [h.strip() for h in header]
@@ -23,8 +28,16 @@ def set_header(self, header):
self.unmapped_fields.add(key)
assert len(self.unmapped_fields) == len(self.header)

-    def add_row(self, row):
-        assert len(self.header) == len(row)
+    def process_row_value(self, v):
+        v = v.strip()
+        m = self.flybase_id_re.search(v)
+        if m is not None:
+            v = m.group(2)
+        return v
+
+    def add_row(self, pre_row):
+        row = [self.process_row_value(value) for value in pre_row]
+        assert len(self.header) == len(row), f"header = {self.header} row = {row}"
self.rows.append(row)
for key, value in zip(self.header, row):
if value:
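The new process_row_value() normalizes values such as "FlyBase:FBgn0000490" to the bare FlyBase ID, so that precomputed-table values can match the identifiers stored in the SQL tables. A standalone check of the added regex; the sample values are illustrative only.

    import re

    flybase_id_re = re.compile(r"^(\S+:)?(FB[a-zA-Z]{2}[0-9]{5,10})$")

    for v in ["FlyBase:FBgn0000490", "FBtr0070000", "  FBgn0000490 ", "not-an-id"]:
        v = v.strip()
        m = flybase_id_re.search(v)
        print(repr(v), "->", m.group(2) if m else v)
    # 'FlyBase:FBgn0000490' -> FBgn0000490
    # 'FBtr0070000' -> FBtr0070000
    # 'FBgn0000490' -> FBgn0000490
    # 'not-an-id' -> not-an-id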
@@ -45,10 +58,45 @@ def check_field_value(self, sql_table, sql_field, value):
tag = tuple([key, value])
sql_tag = tuple([sql_table, sql_field])
self.covered_by[key][value].add(sql_tag)
-        if all(sql_tag in s for s in self.covered_by[key].values()):
-            self.unmapped_fields.remove(key)
-            self.mapped_fields.add(key)
-            self.mapping[key] = sql_tag
+        if not SKIP_FULL_TABLE_COVERAGE_CHECK:
+            if all(sql_tag in s for s in self.covered_by[key].values()):
+                self.unmapped_fields.remove(key)
+                self.mapped_fields.add(key)
+                self.mapping[key] = sql_tag

def print_near_match(self):
for key in self.unmapped_fields:
tag_count = {}
for value in self.covered_by[key]:
for sql_tag in self.covered_by[key][value]:
if sql_tag not in tag_count:
tag_count[sql_tag] = 0
tag_count[sql_tag] += 1
for tag in tag_count:
if (tag_count[tag] / len(self.values[key])) >= 0.8:
table, field = tag
print(f"{(tag_count[tag] / len(self.values[key]))};{self.name};{key};{table};{field}")

def check_near_match(self):
finished = []
for key in self.unmapped_fields:
tag_count = {}
max_count = 0
max_tag = None
for value in self.covered_by[key]:
for sql_tag in self.covered_by[key][value]:
if sql_tag not in tag_count:
tag_count[sql_tag] = 0
tag_count[sql_tag] += 1
if tag_count[sql_tag] > max_count:
max_count = tag_count[sql_tag]
max_tag = sql_tag
if max_count > 0 and max_count >= (0.9 * len(self.values[key])):
finished.append(tuple([key, max_tag]))
for key, tag in finished:
self.unmapped_fields.remove(key)
self.mapped_fields.add(key)
self.mapping[key] = tag

def all_fields_mapped(self):
return len(self.unmapped_fields) == 0
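check_near_match() relaxes the exact-coverage rule that SKIP_FULL_TABLE_COVERAGE_CHECK now disables: a precomputed column is accepted as mapped to a SQL (table, field) pair once that pair covers at least 90% of the column's distinct values, while print_near_match() merely reports candidates at the 80% level. A self-contained illustration of the 90% rule, with made-up values:

    # 9 of 10 distinct values of a column were found in ("feature", "uniquename"),
    # so the column is accepted even though one value never matched -- the
    # full-coverage rule would have rejected it.
    vals = sorted(f"FBgn{i:07d}" for i in range(10))
    covered_by = {v: {("feature", "uniquename")} for v in vals[:9]}
    covered_by[vals[9]] = set()

    tag_count = {}
    for tags in covered_by.values():
        for tag in tags:
            tag_count[tag] = tag_count.get(tag, 0) + 1

    best = max(tag_count, key=tag_count.get)
    print(best, tag_count[best] >= 0.9 * len(vals))  # ('feature', 'uniquename') True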
@@ -64,6 +112,7 @@ def __init__(self, dir_name):
self.sql_tables = None
self.preloaded_mapping = False
os.chdir(dir_name)
# This is to output a tsv given an original mappings file (generated by sql_reader)
#if os.path.exists(f"{dir_name}/mapping.txt"):
# with open(f"{dir_name}/mapping.txt", "r") as f:
# for line in f:
@@ -79,8 +128,11 @@ def __init__(self, dir_name):
# continue
# table, field = tuple(pos.split())
# print("\t".join([fname, pre, table, field]))
for file_name in glob.glob("ncRNA_genes_*.json"):
with open(file_name) as f:
json_dict = json.load(f)
self._process_ncrna(json_dict)
for file_name in glob.glob("*.tsv"):
#print(file_name)
table = Table(file_name)
self.unmapped_tables[file_name] = table
self.all_tables.append(table)
@@ -152,6 +204,118 @@ def _process_tsv(self, file_name):
previous = row
#self.unmapped_tables[file_name].print_values()

def _process_ncrna(self, json_dict):
known_keys = [
"primaryId",
"symbol",
"sequence",
"taxonId",
"soTermId",
"gene",
"symbolSynonyms",
"publications",
"genomeLocations",
"url",
"crossReferenceIds",
"relatedSequences",
]
main_table_header = [
"primaryId",
"symbol",
"sequence",
"taxonId",
"soTermId",
"gene_geneId",
"gene_symbol",
"gene_locusTag"
]
main_table_rows = []
synonyms_table_header = ["symbol1", "symbol2"]
        synonyms_table_rows = []
cross_reference_table_header = ["symbol1", "symbol2"]
cross_reference_table_rows = []
related_sequences_table_header = ["primaryId", "sequenceId", "relationship"]
related_sequences_table_rows = []
gene_synonyms_table_header = ["symbol1", "symbol2"]
        gene_synonyms_table_rows = []
publications_table_header = ["primaryId", "publication"]
publications_table_rows = []
genome_locations_table_header = [
"primaryId",
"assembly",
"gca_accession",
"INSDC_accession",
"chromosome",
"strand",
"startPosition",
"endPosition"
]
genome_locations_table_rows = []
for row in json_dict["data"]:
for key in row:
assert key in known_keys, f"Invalid key: {key}"

#fbid = row["primaryId"].split(":")[1]
fbid = row["primaryId"]
symbol = row["symbol"]
sequence = row["sequence"]
taxonid = row["taxonId"]
sotermid = row["soTermId"]
gene_geneid = row["gene"]["geneId"]
gene_symbol = row["gene"]["symbol"]
gene_locustag = row["gene"]["locusTag"]
main_table_rows.append([
fbid, symbol, sequence, taxonid, sotermid,
gene_geneid, gene_symbol, gene_locustag])
if "symbolSynonyms" in row:
for synonym in row["symbolSynonyms"]:
                    synonyms_table_rows.append([symbol, synonym])
                    synonyms_table_rows.append([synonym, symbol])
if "crossReferenceIds" in row:
for cross_reference in row["crossReferenceIds"]:
cross_reference_table_rows.append([symbol, cross_reference])
cross_reference_table_rows.append([cross_reference, symbol])
if "relatedSequences" in row:
for related_sequence in row["relatedSequences"]:
related_sequences_table_rows.append([
fbid,
related_sequence["sequenceId"],
related_sequence["relationship"]])
if "synonyms" in row["gene"]:
for synonym in row["gene"]["synonyms"]:
                    gene_synonyms_table_rows.append([gene_symbol, synonym])
                    gene_synonyms_table_rows.append([synonym, gene_symbol])
if "publications" in row:
for publication in row["publications"]:
publications_table_rows.append([fbid, publication])
for genome_location in row["genomeLocations"]:
for exon in genome_location["exons"]:
genome_locations_table_rows.append([
fbid,
genome_location["assembly"],
genome_location["gca_accession"],
exon["INSDC_accession"],
exon["chromosome"],
exon["strand"],
str(exon["startPosition"]),
str(exon["endPosition"])])
table_list = [
("ncRNA_genes", main_table_header, main_table_rows),
("ncRNA_genes_synonyms", synonyms_table_header, synomyms_table_rows),
("ncRNA_genes_cross_references", cross_reference_table_header, cross_reference_table_rows),
("ncRNA_genes_related_sequences", related_sequences_table_header, related_sequences_table_rows),
("ncRNA_genes_gene_synonyms", gene_synonyms_table_header, gene_synomyms_table_rows),
("ncRNA_genes_publications", publications_table_header, publications_table_rows),
("ncRNA_genes_genome_locations", genome_locations_table_header, genome_locations_table_rows)
]
for table_name, header, rows in table_list:
table = Table(table_name)
table.set_header(header)
for row in rows:
table.add_row(row)
self.unmapped_tables[table_name] = table
self.all_tables.append(table)

def set_sql_primary_key(self, sql_table, field):
self.sql_primary_key[sql_table] = field
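
_process_ncrna() flattens each record of the FlyBase ncRNA JSON export into seven relational-style tables. A reduced example of the record shape it expects; the key names follow the code above, but the values are invented:

    json_dict = {
        "data": [{
            "primaryId": "FBtr0070000",
            "symbol": "mir-1",
            "sequence": "ACGT",
            "taxonId": "NCBITaxon:7227",
            "soTermId": "SO:0000276",
            "gene": {"geneId": "FBgn0262432", "symbol": "mir-1",
                     "locusTag": "Dmel_CR42657"},
            "symbolSynonyms": ["miR-1"],
            "genomeLocations": [{
                "assembly": "R6", "gca_accession": "GCA_000001215.4",
                "exons": [{"INSDC_accession": "AE014134.6", "chromosome": "2L",
                           "strand": "-", "startPosition": 20123,
                           "endPosition": 20200}],
            }],
        }]
    }

    # The main-table row derived from this record, mirroring the code above:
    row = json_dict["data"][0]
    main_row = [row["primaryId"], row["symbol"], row["sequence"], row["taxonId"],
                row["soTermId"], row["gene"]["geneId"], row["gene"]["symbol"],
                row["gene"]["locusTag"]]
    print(main_row)
    # ['FBtr0070000', 'mir-1', 'ACGT', 'NCBITaxon:7227', 'SO:0000276',
    #  'FBgn0262432', 'mir-1', 'Dmel_CR42657']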

@@ -172,3 +336,26 @@ def get_relevant_sql_tables(self):
for table in self.all_tables:
answer = answer.union(table.get_relevant_sql_tables())
return answer

def print_matched_tables(self):
for table in self.mapped_tables.values():
table.print_near_match()
for table in self.unmapped_tables.values():
table.print_near_match()

def check_nearly_matched_tables(self):
finished = []
for key, table in self.unmapped_tables.items():
table.check_near_match()
if table.all_fields_mapped():
finished.append(key)
for key in finished:
self.mapped_tables[key] = self.unmapped_tables.pop(key)

def get_table(self, table_name):
if table_name in self.mapped_tables:
return self.mapped_tables[table_name]
elif table_name in self.unmapped_tables:
return self.unmapped_tables[table_name]
else:
return None
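
A hypothetical driver showing how the new near-match entry points compose with the existing API. The directory name and call order are assumptions, as is the enclosing class name PrecomputedTables, which this diff does not show:

    from flybase2metta.precomputed_tables import PrecomputedTables

    tables = PrecomputedTables("/path/to/precomputed")  # loads *.tsv and ncRNA_genes_*.json

    # ... after check_field_value() has been fed values from the SQL dump ...
    tables.check_nearly_matched_tables()  # promote columns with >= 90% coverage
    tables.print_matched_tables()         # report remaining candidates at the 80% level

    t = tables.get_table("ncRNA_genes")
    if t is not None and t.all_fields_mapped():
        print(t.mapping)                  # {column: (sql_table, sql_field), ...}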
