Added flask-sqla, declarative models to the project. Added taxon tabl…

…es and instructions on how to load that data.
ETH-NEXUS · Sep 23, 2019 · 4621881 · 4621881
1 parent 0c50f0a
commit 4621881
Show file tree

Hide file tree

Showing 10 changed files with 311 additions and 103 deletions.
diff --git a/data/.gitignore b/data/.gitignore
@@ -0,0 +1,4 @@
+*.dump
+*.dmp
+*.prt
+readme.txt
diff --git a/data/README.md b/data/README.md
@@ -0,0 +1,9 @@
+# Taxon Data
+
+Download taxdump.zip/.tar.gz from ftp://ftp.ncbi.nih.gov/pub/taxonomy/ and extract it to this folder. Only nodes.dmp and
+names.dmp are currently used.
+
+If sequencer.default_settings.LOAD_TAXON_DATA is true, the `init-db` command will load this data into the taxonomy tables in the app.
+(Note that, as of this writing, these tables are only read by the `taxonomy/<tax_id>` API route, so this is an entirely optional step.)
+
+Refer to readme.txt (supplied in the zip) for more information about these files.
diff --git a/sequencer/__init__.py b/sequencer/__init__.py
@@ -13,10 +13,12 @@ def create_app(test_config=None):
     app.config.from_object('sequencer.default_settings')
     app.config.from_mapping(
         SECRET_KEY='dev',
-        DATABASE=os.path.join(app.instance_path, 'sequencer.sqlite'),
+        # DATABASE=os.path.join(app.instance_path, 'sequencer.sqlite'),
         BLAST_DB_DIR=os.path.join(app.root_path, 'blast_db'),
         # CACHE_DIR=os.path.join(app.instance_path, 'cache'),
-        # CACHE_TYPE="filesystem"
+        # CACHE_TYPE="filesystem",
+        SQLALCHEMY_DATABASE_URI="sqlite:///%s" % os.path.join(app.instance_path, 'sequencer.sqlite'),
+        SQLALCHEMY_TRACK_MODIFICATIONS=False
     )
     app.config.from_pyfile('application.cfg', silent=True)
 
@@ -43,7 +45,8 @@ def create_app(test_config=None):
 
     # inject database management
     from . import db
-    db.init_app(app)
+    db.db.init_app(app)
+    app.cli.add_command(db.init_db_command)
 
     # inject api endpoints
     from . import api

diff --git a/sequencer/api.py b/sequencer/api.py
@@ -16,7 +16,8 @@
 from sequencer import cache
 from sequencer.default_settings import MOCK_BLAST_HAS_RESULTS, USE_GIS_CACHING, BLAST_TIMEOUT
 from sequencer.support import ev3_reader
-from sequencer.db import get_db
+from sequencer.db import db
+from sequencer.models import Sequence, TaxonNames
 from sequencer.support.blaster import blast_sequence, blast_sequence_local
 from sequencer.support.ev3_reader import query_full_sequence
 
@@ -66,8 +67,13 @@ def nudge(direction, amount):
 def query_ev3():
     username = request.args.get('username', 'anonymous')
 
+    def save_to_db(uname, sequence):
+        candidate = Sequence(username=uname, sequence=sequence)
+        db.session.add(candidate)
+        db.session.commit()
+        return candidate.id
+
     def g():
-        db = get_db()
         readings = []
 
         yield "["  # delimiters are for whoever's waiting for a full sequence
@@ -81,19 +87,17 @@ def g():
             readings.append(row)
             yield json.dumps(row)
 
+        if not_first:
+            yield ','
+        yield json.dumps({'query_id': save_to_db(username, json.dumps(readings))})
         yield "]"
 
-        db.execute('insert into sequences (username, sequence) values (?, ?)', (username, json.dumps(readings),))
-        db.commit()
-
     try:
         if request.args.get('streaming') == 'true':
             return Response(stream_with_context(g()))
         else:
-            o_db = get_db()
             payload = list(query_full_sequence())
-            o_db.execute('insert into sequences (username, sequence) values (?, ?)', (username, json.dumps(payload),))
-            o_db.commit()
+            payload.append({'query_id': save_to_db(username, json.dumps(payload))})
 
             return jsonify(payload)
 
@@ -178,14 +182,50 @@ def species_img():
         })
         result = resp.json()
 
-        cached_val = [x['link'] for x in result['items']]
+        try:
+            cached_val = [x['link'] for x in result['items']]
+            cache.set(cache_key, cached_val)
+        except KeyError:
+            # if we don't have any items, don't even bother updating the cache and just return nothing
+            cached_val = []
+
+            return jsonify({'results': []})
+
         cache.set(cache_key, cached_val)
 
     return jsonify({
         'results': cached_val
     })
 
 
+# ------------------------------------------------------
+# --- taxonomic information
+# ------------------------------------------------------
+
+@bp.route('/taxonomy/<tax_id>')
+def ancestry(tax_id):
+    results = db.engine.execute("""
+    with recursive ancestry(tax_id, rank, name) as (
+        select taxons.tax_id, taxons.rank, taxon_names.name from taxons
+        inner join taxon_names on taxon_names.tax_id=taxons.tax_id
+        where taxons.tax_id=:tax_id
+    
+        union all
+    
+        select T.parent_tax_id, T.rank, TN.name
+        from taxons T, ancestry
+        inner join taxon_names TN on TN.tax_id=T.tax_id
+        where T.tax_id=ancestry.tax_id and T.tax_id != T.parent_tax_id
+    ) select * from ancestry;
+    """, {'tax_id': tax_id})
+
+    columns = [x.name for x in TaxonNames.__table__.columns]
+
+    return jsonify({
+        'ancestry': list(dict(zip(columns, x)) for x in results)
+    })
+
+
 # ------------------------------------------------------
 # --- mocked endpoints
 # ------------------------------------------------------

diff --git a/sequencer/db.py b/sequencer/db.py
@@ -1,43 +1,22 @@
-import sqlite3
-
 import click
-from flask import current_app, g
+from flask import current_app
 from flask.cli import with_appcontext
+from flask_sqlalchemy import SQLAlchemy
 
+from sequencer.default_settings import LOAD_TAXON_DATA
 
-def get_db():
-    if 'db' not in g:
-        g.db = sqlite3.connect(
-            current_app.config['DATABASE'],
-            detect_types=sqlite3.PARSE_DECLTYPES
-        )
-        g.db.row_factory = sqlite3.Row
-
-    return g.db
-
-
-def close_db(e=None):
-    db = g.pop('db', None)
-
-    if db is not None:
-        db.close()
-
-
-def init_db():
-    db = get_db()
-
-    with current_app.open_resource('schema.sql') as f:
-        db.executescript(f.read().decode('utf8'))
+db = SQLAlchemy()
 
 
 @click.command('init-db')
 @with_appcontext
 def init_db_command():
     """Clear the existing data and create new tables."""
-    init_db()
+    import sequencer.models
+    db.init_app(current_app)
+    db.create_all()
     click.echo('Initialized the database.')
 
-
-def init_app(app):
-    app.teardown_appcontext(close_db)
-    app.cli.add_command(init_db_command)
+    if LOAD_TAXON_DATA:
+        sequencer.models.load_taxons(click.echo)
+        click.echo('Loaded taxons from dump.')
diff --git a/sequencer/default_settings.py b/sequencer/default_settings.py
@@ -4,7 +4,7 @@
 
 # if true, mocks communication with the LEGO brick, producing sequences from real organisms
 # (see support.ev3_reader.SAMPLE_SEQUENCES for details)
-MOCK_COMM = False
+MOCK_COMM = True
 
 # if MOCK_COMM=True, the mock process duration is multiplied by this value (>1.0 slower, <1.0 faster)
 TIME_MOD = 0.1
@@ -21,13 +21,22 @@
 # time in seconds before the BLAST request times out, never if None
 BLAST_TIMEOUT = 120
 
+# minimum time in seconds between re-checking polls, since BLAST will occasionally return tiny values like 2 seconds
+# set to 0 to disable the minimum
+MIN_POLL_DELAY_SECS = 0
+
 # if true, mocks the NCBI blast request process and returns a canned result
 MOCK_BLAST = False
 
 # if MOCK_BLAST and MOCK_BLAST_HAS_RESULTS are true, returns a canned file with hits;
 # otherwise, returns a canned file with no hits
 MOCK_BLAST_HAS_RESULTS = True
 
+# if true, expects taxon data from the taxdump file (obtained from ftp://ftp.ncbi.nih.gov/pub/taxonomy/) to be
+# extracted to a folder named ./data, and loads taxon info from those files into Taxon and TaxonName.
+# the taxon data is currently unused, so it's recommended to leave this flag as false.
+LOAD_TAXON_DATA = False
+
 
 # -----------------------------------
 # --- Caching

diff --git a/sequencer/models.py b/sequencer/models.py
@@ -0,0 +1,148 @@
+import datetime
+
+from tqdm import tqdm
+
+from sequencer.db import db
+
+
+# --------------------------------------------------------------------------------
+# --- sequence query logging
+# --------------------------------------------------------------------------------
+
+class Sequence(db.Model):
+    """
+    Saves searched sequences along with the user who saved it.
+
+    TODO: In the future, we'll also save the BLAST results, but for now it's always null.
+    """
+
+    __tablename__ = 'sequences'
+    id = db.Column(db.Integer, primary_key=True)
+    created = db.Column(db.TIMESTAMP, default=datetime.datetime.now())
+    username = db.Column(db.Text)
+    sequence = db.Column(db.Text, nullable=False)
+    results = db.Column(db.Text)
+
+    def __init__(self, username=None, sequence=None):
+        self.username = username
+        self.sequence = sequence
+
+    def __repr__(self):
+        return '<Sequence %s>' % self.sequence
+
+
+# --------------------------------------------------------------------------------
+# --- taxonomic info models
+# --------------------------------------------------------------------------------
+
+class Taxon(db.Model):
+    """
+    Table of taxonomic data, from ftp://ftp.ncbi.nih.gov/pub/taxonomy/ (specifically the nodes.dmp file). Note that this
+    table includes all levels of the taxonomic tree; the organizational level is stored in the 'rank' field.
+
+    The tree is stored as an adjacency list: each row points at its parent through parent_tax_id, which is a
+    foreign key back into this same table.
+
+    The attribute names double as the column names handed to the loader (see load_taxons), so renaming or
+    reordering them requires updating that column list as well.
+
+    TODO: In the future, we'll use this to allow ordering/filtering of organism results by their taxonomic relationships,
+    e.g. prioritizing "fuzzy things", filtering out viruses/bacteria, etc.
+    """
+    __tablename__ = 'taxons'
+    tax_id = db.Column(db.BigInteger, primary_key=True)  # node id in GenBank taxonomy database
+    parent_tax_id = db.Column(db.BigInteger, db.ForeignKey('taxons.tax_id'))  # parent node id in GenBank taxonomy database
+    rank = db.Column(db.Text)  # rank of this node (superkingdom, kingdom, ...)
+    embl_code = db.Column(db.Text)  # locus-name prefix; not unique
+    division_id = db.Column(db.Text)  # see division.dmp file
+    inherited_div_flag = db.Column(db.Boolean)  # 1 if node inherits division from parent
+    genetic_code_id = db.Column(db.Text)  # see gencode.dmp file
+    # NOTE: the double underscore below is part of the declared column name; the loader's column list uses the same spelling
+    inherited_GC__flag = db.Column(db.Boolean)  # 1 if node inherits genetic code from parent
+    mitochondrial_genetic_code_id = db.Column(db.BigInteger)  # see gencode.dmp file
+    inherited_MGC_flag = db.Column(db.Boolean)  # 1 if node inherits mitochondrial gencode from parent
+    GenBank_hidden_flag = db.Column(db.Boolean)  # 1 if name is suppressed in GenBank entry lineage
+    hidden_subtree_root_flag = db.Column(db.Boolean)  # 1 if this subtree has no sequence data yet
+    comments = db.Column(db.Text)  # free-text comments and citations
+
+
+class TaxonNames(db.Model):
+    """
+    Like the Taxon model, contains information about organisms based on a tax_id returned from NCBI's services.
+
+    This table stores scientific and, in some cases, common names of organisms for a given tax_id.
+
+    NOTE(review): tax_id is the primary key, so the table can hold only one name per taxon. load_taxons keeps
+    only rows whose name_class is 'scientific name' -- presumably to satisfy that constraint; confirm before
+    loading other name classes.
+    """
+    __tablename__ = 'taxon_names'
+    tax_id = db.Column(db.BigInteger, primary_key=True)  # the id of node associated with this name
+    name = db.Column(db.Text)  # name itself
+    unique_name = db.Column(db.Text)  # the unique variant of this name if name not unique
+    name_class = db.Column(db.Text)  # (synonym, common name, ...)
+
+
+# ---
+# --- helper routines for loading taxon data
+# ---
+
+def _load_table(table_obj, source_file, columns, filter_expr=None, transform_expr=None, recs_before_commit=5000):
+    """
+    Loads a columnar file's rows as a series of table_obj instances.
+    :param table_obj: the model instance to load
+    :param source_file: the name of the column-based file
+    :param columns: the columns to load from the file into the model (field names and column names must match)
+    :param filter_expr: a function applied to each row that, if returns falsey, will exclude that row
+    :param transform_expr: a function applied to each row that can transform values in each cell of the row
+    :param recs_before_commit: number of records to process before issuing a commit
+    :return: None
+    """
+
+    # first, clear out any existing contents
+    db.session.query(table_obj).delete()
+
+    with open(source_file) as fp:
+        commit_counter = recs_before_commit
+        for row in tqdm(fp.readlines()):
+            cells = dict(zip(columns, (x.strip() for x in row.split("|"))))
+
+            # discards rows that don't pass the filter expression
+            if filter_expr is not None and not filter_expr(cells):
+                continue
+
+            # transforms column values for this row in-place
+            if transform_expr is not None:
+                transform_expr(cells)
+
+            db.session.add(table_obj(**cells))
+
+            if commit_counter > 0:
+                commit_counter -= 1
+            else:
+                commit_counter = recs_before_commit
+                db.session.commit()
+
+        db.session.commit()
+
+
+def load_taxons(echoer):
+    echoer("Loading taxon names...")
+    _load_table(TaxonNames, './data/names.dmp', [
+        'tax_id',
+        'name',
+        'unique_name',
+        'name_class'
+    ], filter_expr=lambda x: x['name_class'] == 'scientific name')
+
+    def transform_bools(cells):
+        for k in cells:
+            if k.endswith('_flag'):
+                cells[k] = bool(int(cells[k]))
+        return cells
+
+    echoer("Loading taxons...")
+    _load_table(Taxon, './data/nodes.dmp', [
+        'tax_id',
+        'parent_tax_id',
+        'rank',
+        'embl_code',
+        'division_id',
+        'inherited_div_flag',
+        'genetic_code_id',
+        'inherited_GC__flag',
+        'mitochondrial_genetic_code_id',
+        'inherited_MGC_flag',
+        'GenBank_hidden_flag',
+        'hidden_subtree_root_flag',
+        'comments',
+    ], transform_expr=transform_bools)
diff --git a/sequencer/schema.sql b/sequencer/schema.sql