Merge pull request singnet#182 from singnet/senna-tmp-flybase-1

Fixes column mapping from precomputed tables to SQL tables
logicmoo · May 24, 2023 · 1fa828f
2 parents e4a7473 + 8ffffc0
commit 1fa828f
Show file tree

Hide file tree

Showing 9 changed files with 318 additions and 50 deletions.
diff --git a/das/canonical_parser.py b/das/canonical_parser.py
@@ -24,6 +24,8 @@ def _file_line_count(file_name):
 HINT_FILE_SIZE = None
 TMP_DIR = '/tmp'
 #TMP_DIR = '/mnt/HD10T/nfs_share/work/tmp'
+SKIP_KEY_VALUE_FILES_GENERATION = False
+SKIP_PARSER = SKIP_KEY_VALUE_FILES_GENERATION or False
 
 class CanonicalParser:
 
@@ -220,8 +222,9 @@ def _populate_redis(self):
 
     def _process_key_value_files(self):
         logger().info(f"Populating Redis")
-        logger().info(f"Building key-value files")
-        self._build_key_value_files()
+        if not SKIP_KEY_VALUE_FILES_GENERATION:
+            logger().info(f"Building key-value files")
+            self._build_key_value_files()
         logger().info(f"Processing key-value files")
         self._populate_redis()
         logger().info(f"Redis is up to date")
@@ -301,6 +304,9 @@ def populate_indexes(self):
 
     def parse(self, path):
         logger().info(f"Parsing {path}")
+        if SKIP_PARSER:
+            logger().info(f"Skipping parser")
+            return
         logger().info(f"Computing file size")
         self.current_line_count = 1
         HINT_FILE_SIZE = _file_line_count(path)

diff --git a/environment b/environment
@@ -2,7 +2,7 @@
 export DAS_REDIS_HOSTNAME=localhost
 export DAS_REDIS_PORT=6380
 # MongoDB variables
-export DAS_MONGODB_HOSTNAME=localhost
+export DAS_MONGODB_HOSTNAME=149.28.201.61
 export DAS_MONGODB_PORT=27018
 # Change the following values when running on a public instance
 export DAS_DATABASE_USERNAME=dbadmin

diff --git a/environment_das b/environment_das
@@ -1,7 +1,7 @@
-export DAS_REDIS_HOSTNAME=149.28.192.132
+export DAS_REDIS_HOSTNAME=45.32.140.218
 export DAS_REDIS_PORT=7000
 
-export DAS_MONGODB_HOSTNAME=45.63.83.31
+export DAS_MONGODB_HOSTNAME=149.28.201.61
 export DAS_MONGODB_PORT=27018
 
 export DAS_DATABASE_USERNAME=dbadmin

diff --git a/flybase2metta/precomputed_tables.py b/flybase2metta/precomputed_tables.py
@@ -1,6 +1,10 @@
 import os
 import glob
 import csv
+import json
+import re
+
+SKIP_FULL_TABLE_COVERAGE_CHECK = True
 
 class Table:
 
@@ -13,6 +17,7 @@ def __init__(self, name):
         self.mapped_fields = set()
         self.unmapped_fields = set()
         self.mapping = {}
+        self.flybase_id_re = re.compile("^(\S+:)?(FB[a-zA-Z]{2}[0-9]{5,10})$")
 
     def set_header(self, header):
         self.header = [h.strip() for h in header]
@@ -23,8 +28,16 @@ def set_header(self, header):
             self.unmapped_fields.add(key)
         assert len(self.unmapped_fields) == len(self.header)
 
-    def add_row(self, row):
-        assert len(self.header) == len(row)
+    def process_row_value(self, v):
+        v = v.strip()
+        m = self.flybase_id_re.search(v)
+        if m is not None:
+            v = m.group(2)
+        return v
+
+    def add_row(self, pre_row):
+        row = [self.process_row_value(value) for value in pre_row]
+        assert len(self.header) == len(row), f"header = {self.header} row = {row}"
         self.rows.append(row)
         for key, value in zip(self.header, row):
             if value:
@@ -45,10 +58,45 @@ def check_field_value(self, sql_table, sql_field, value):
                 tag = tuple([key, value])
                 sql_tag = tuple([sql_table, sql_field])
                 self.covered_by[key][value].add(sql_tag)
-                if all(sql_tag in s for s in self.covered_by[key].values()):
-                    self.unmapped_fields.remove(key)
-                    self.mapped_fields.add(key)
-                    self.mapping[key] = sql_tag
+                if not SKIP_FULL_TABLE_COVERAGE_CHECK:
+                    if all(sql_tag in s for s in self.covered_by[key].values()):
+                        self.unmapped_fields.remove(key)
+                        self.mapped_fields.add(key)
+                        self.mapping[key] = sql_tag
+
+    def print_near_match(self):
+        for key in self.unmapped_fields:
+            tag_count = {}
+            for value in self.covered_by[key]:
+                for sql_tag in self.covered_by[key][value]:
+                    if sql_tag not in tag_count:
+                        tag_count[sql_tag] = 0
+                    tag_count[sql_tag] += 1
+            for tag in tag_count:
+                if (tag_count[tag] / len(self.values[key])) >= 0.8:
+                    table, field = tag
+                    print(f"{(tag_count[tag] / len(self.values[key]))};{self.name};{key};{table};{field}")
+
+    def check_near_match(self):
+        finished = []
+        for key in self.unmapped_fields:
+            tag_count = {}
+            max_count = 0
+            max_tag = None
+            for value in self.covered_by[key]:
+                for sql_tag in self.covered_by[key][value]:
+                    if sql_tag not in tag_count:
+                        tag_count[sql_tag] = 0
+                    tag_count[sql_tag] += 1
+                    if tag_count[sql_tag] > max_count:
+                        max_count = tag_count[sql_tag]
+                        max_tag = sql_tag
+            if max_count > 0 and max_count >= (0.9 * len(self.values[key])):
+                finished.append(tuple([key, max_tag]))
+        for key, tag in finished:
+            self.unmapped_fields.remove(key)
+            self.mapped_fields.add(key)
+            self.mapping[key] = tag
 
     def all_fields_mapped(self):
         return len(self.unmapped_fields) == 0
@@ -64,6 +112,7 @@ def __init__(self, dir_name):
         self.sql_tables = None
         self.preloaded_mapping = False
         os.chdir(dir_name)
+        # This is to output a tsv given an original mappings file (generated by sql_reader)
         #if os.path.exists(f"{dir_name}/mapping.txt"):
         #    with open(f"{dir_name}/mapping.txt", "r") as f:
         #        for line in f:
@@ -79,8 +128,11 @@ def __init__(self, dir_name):
         #                    continue
         #                table, field = tuple(pos.split())
         #                print("\t".join([fname, pre, table, field]))
+        for file_name in glob.glob("ncRNA_genes_*.json"):
+            with open(file_name) as f:
+                json_dict = json.load(f)
+            self._process_ncrna(json_dict)
         for file_name in glob.glob("*.tsv"):
-            #print(file_name)
             table = Table(file_name)
             self.unmapped_tables[file_name] = table
             self.all_tables.append(table)
@@ -152,6 +204,118 @@ def _process_tsv(self, file_name):
                     previous = row
         #self.unmapped_tables[file_name].print_values()
 
+    def _process_ncrna(self, json_dict):
+        known_keys = [
+            "primaryId",
+            "symbol",
+            "sequence",
+            "taxonId",
+            "soTermId",
+            "gene",
+            "symbolSynonyms",
+            "publications",
+            "genomeLocations",
+            "url",
+            "crossReferenceIds",
+            "relatedSequences",
+        ]
+        main_table_header = [
+            "primaryId",
+            "symbol",
+            "sequence",
+            "taxonId",
+            "soTermId",
+            "gene_geneId",
+            "gene_symbol",
+            "gene_locusTag"
+        ]
+        main_table_rows = []
+        synonyms_table_header = ["symbol1", "symbol2"]
+        synomyms_table_rows = []
+        cross_reference_table_header = ["symbol1", "symbol2"]
+        cross_reference_table_rows = []
+        related_sequences_table_header = ["primaryId", "sequenceId", "relationship"]
+        related_sequences_table_rows = []
+        gene_synonyms_table_header = ["symbol1", "symbol2"]
+        gene_synomyms_table_rows = []
+        publications_table_header = ["primaryId", "publication"]
+        publications_table_rows = []
+        genome_locations_table_header = [
+            "primaryId", 
+            "assembly", 
+            "gca_accession", 
+            "INSDC_accession", 
+            "chromosome", 
+            "strand", 
+            "startPosition", 
+            "endPosition"
+        ]
+        genome_locations_table_rows = []
+        for row in json_dict["data"]:
+            for key in row:
+                assert key in known_keys, f"Invalid key: {key}"
+
+            #fbid = row["primaryId"].split(":")[1]
+            fbid = row["primaryId"]
+            symbol = row["symbol"]
+            sequence = row["sequence"]
+            taxonid = row["taxonId"]
+            sotermid = row["soTermId"]
+            gene_geneid = row["gene"]["geneId"]
+            gene_symbol = row["gene"]["symbol"]
+            gene_locustag = row["gene"]["locusTag"]
+            main_table_rows.append([
+                fbid, symbol, sequence, taxonid, sotermid, 
+                gene_geneid, gene_symbol, gene_locustag])
+            if "symbolSynonyms" in row:
+                for synonym in row["symbolSynonyms"]:
+                    synomyms_table_rows.append([symbol, synonym])
+                    synomyms_table_rows.append([synonym, symbol])
+            if "crossReferenceIds" in row:
+                for cross_reference in row["crossReferenceIds"]:
+                    cross_reference_table_rows.append([symbol, cross_reference])
+                    cross_reference_table_rows.append([cross_reference, symbol])
+            if "relatedSequences" in row:
+                for related_sequence in row["relatedSequences"]:
+                    related_sequences_table_rows.append([
+                        fbid, 
+                        related_sequence["sequenceId"], 
+                        related_sequence["relationship"]])
+            if "synonyms" in row["gene"]:
+                for synonym in row["gene"]["synonyms"]:
+                    gene_synomyms_table_rows.append([gene_symbol, synonym])
+                    gene_synomyms_table_rows.append([synonym, gene_symbol])
+            if "publications" in row:
+                for publication in row["publications"]:
+                    publications_table_rows.append([fbid, publication])
+            for genome_location in row["genomeLocations"]:
+                for exon in genome_location["exons"]:
+                    genome_locations_table_rows.append([
+                        fbid,
+                        genome_location["assembly"],
+                        genome_location["gca_accession"],
+                        exon["INSDC_accession"],
+                        exon["chromosome"],
+                        exon["strand"],
+                        str(exon["startPosition"]),
+                        str(exon["endPosition"])])
+        table_list = [
+            ("ncRNA_genes", main_table_header, main_table_rows),
+            ("ncRNA_genes_synonyms", synonyms_table_header, synomyms_table_rows),
+            ("ncRNA_genes_cross_references", cross_reference_table_header, cross_reference_table_rows),
+            ("ncRNA_genes_related_sequences", related_sequences_table_header, related_sequences_table_rows),
+            ("ncRNA_genes_gene_synonyms", gene_synonyms_table_header, gene_synomyms_table_rows),
+            ("ncRNA_genes_publications", publications_table_header, publications_table_rows),
+            ("ncRNA_genes_genome_locations", genome_locations_table_header, genome_locations_table_rows)
+        ]
+        for table_name, header, rows in table_list:
+            table = Table(table_name)
+            table.set_header(header)
+            for row in rows:
+                table.add_row(row)
+            self.unmapped_tables[table_name] = table
+            self.all_tables.append(table)
+
     def set_sql_primary_key(self, sql_table, field):
         self.sql_primary_key[sql_table] = field
 
@@ -172,3 +336,26 @@ def get_relevant_sql_tables(self):
         for table in self.all_tables:
             answer = answer.union(table.get_relevant_sql_tables())
         return answer
+
+    def print_matched_tables(self):
+        for table in self.mapped_tables.values():
+            table.print_near_match()
+        for table in self.unmapped_tables.values():
+            table.print_near_match()
+
+    def check_nearly_matched_tables(self):
+        finished = []
+        for key, table in self.unmapped_tables.items():
+            table.check_near_match()
+            if table.all_fields_mapped():
+                finished.append(key)
+        for key in finished:
+            self.mapped_tables[key] = self.unmapped_tables.pop(key)
+
+    def get_table(self, table_name):
+        if table_name in self.mapped_tables:
+            return self.mapped_tables[table_name]
+        elif table_name in self.unmapped_tables:
+            return self.unmapped_tables[table_name]
+        else:
+            return None