Skip to content
This repository was archived by the owner on Jan 10, 2023. It is now read-only.

Commit

Permalink
Added flask-slqa, declarative models to the project. Added taxon tabl…
Browse files Browse the repository at this point in the history
…es and instructions on how to load that data.
  • Loading branch information
falquaddoomi committed Sep 23, 2019
1 parent 0c50f0a commit 4621881
Show file tree
Hide file tree
Showing 10 changed files with 311 additions and 103 deletions.
4 changes: 4 additions & 0 deletions data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*.dump
*.dmp
*.prt
readme.txt
9 changes: 9 additions & 0 deletions data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Taxon Data

Download taxdump.zip/.tar.gz from ftp://ftp.ncbi.nih.gov/pub/taxonomy/ and extract it to this folder. Only nodes.dmp and
names.dmp are currently used.

If sequencer.default_settings.LOAD_TAXON_DATA is true, this data will be loaded into taxonomy tables in the app.
(Note that, as of this writing, these tables are unused, so this is an entirely optional step.)

Refer to readme.txt (supplied in the zip) for more information about these files.
9 changes: 6 additions & 3 deletions sequencer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@ def create_app(test_config=None):
app.config.from_object('sequencer.default_settings')
app.config.from_mapping(
SECRET_KEY='dev',
DATABASE=os.path.join(app.instance_path, 'sequencer.sqlite'),
# DATABASE=os.path.join(app.instance_path, 'sequencer.sqlite'),
BLAST_DB_DIR=os.path.join(app.root_path, 'blast_db'),
# CACHE_DIR=os.path.join(app.instance_path, 'cache'),
# CACHE_TYPE="filesystem"
# CACHE_TYPE="filesystem",
SQLALCHEMY_DATABASE_URI="sqlite:///%s" % os.path.join(app.instance_path, 'sequencer.sqlite'),
SQLALCHEMY_TRACK_MODIFICATIONS=False
)
app.config.from_pyfile('application.cfg', silent=True)

Expand All @@ -43,7 +45,8 @@ def create_app(test_config=None):

# inject database management
from . import db
db.init_app(app)
db.db.init_app(app)
app.cli.add_command(db.init_db_command)

# inject api endpoints
from . import api
Expand Down
58 changes: 49 additions & 9 deletions sequencer/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
from sequencer import cache
from sequencer.default_settings import MOCK_BLAST_HAS_RESULTS, USE_GIS_CACHING, BLAST_TIMEOUT
from sequencer.support import ev3_reader
from sequencer.db import get_db
from sequencer.db import db
from sequencer.models import Sequence, TaxonNames
from sequencer.support.blaster import blast_sequence, blast_sequence_local
from sequencer.support.ev3_reader import query_full_sequence

Expand Down Expand Up @@ -66,8 +67,13 @@ def nudge(direction, amount):
def query_ev3():
username = request.args.get('username', 'anonymous')

def save_to_db(uname, sequence):
candidate = Sequence(username=uname, sequence=sequence)
db.session.add(candidate)
db.session.commit()
return candidate.id

def g():
db = get_db()
readings = []

yield "[" # delimiters are for whoever's waiting for a full sequence
Expand All @@ -81,19 +87,17 @@ def g():
readings.append(row)
yield json.dumps(row)

if not_first:
yield ','
yield json.dumps({'query_id': save_to_db(username, json.dumps(readings))})
yield "]"

db.execute('insert into sequences (username, sequence) values (?, ?)', (username, json.dumps(readings),))
db.commit()

try:
if request.args.get('streaming') == 'true':
return Response(stream_with_context(g()))
else:
o_db = get_db()
payload = list(query_full_sequence())
o_db.execute('insert into sequences (username, sequence) values (?, ?)', (username, json.dumps(payload),))
o_db.commit()
payload.append({'query_id': save_to_db(username, json.dumps(payload))})

return jsonify(payload)

Expand Down Expand Up @@ -178,14 +182,50 @@ def species_img():
})
result = resp.json()

cached_val = [x['link'] for x in result['items']]
try:
cached_val = [x['link'] for x in result['items']]
cache.set(cache_key, cached_val)
except KeyError:
# if we don't have any items, don't even bother udpating the cache and just return nothing
cached_val = []

return jsonify({'results': []})

cache.set(cache_key, cached_val)

return jsonify({
'results': cached_val
})


# ------------------------------------------------------
# --- taxonomic information
# ------------------------------------------------------

@bp.route('/taxonomy/<tax_id>')
def ancestry(tax_id):
    """
    Returns the taxonomic lineage of a taxon, starting at the requested node and walking up through its parents.

    The walk follows taxons.parent_tax_id and stops once it reaches a node that is its own parent.

    :param tax_id: the tax_id (taken from the URL) of the taxon at which the lineage starts
    :return: a JSON object of the form {'ancestry': [{'tax_id': ..., 'rank': ..., 'name': ...}, ...]}; the list is
        empty if no taxon (or no name for it) matches tax_id
    """

    # the CTE carries parent_tax_id along so that each recursive step can join on the *parent* row; that way the
    # tax_id, rank and name in every output row all describe the same taxon.
    results = db.engine.execute("""
        with recursive ancestry(tax_id, parent_tax_id, rank, name) as (
            select T.tax_id, T.parent_tax_id, T.rank, TN.name
            from taxons T
            inner join taxon_names TN on TN.tax_id=T.tax_id
            where T.tax_id=:tax_id
            union all
            select T.tax_id, T.parent_tax_id, T.rank, TN.name
            from ancestry A
            inner join taxons T on T.tax_id=A.parent_tax_id
            inner join taxon_names TN on TN.tax_id=T.tax_id
            where A.tax_id != A.parent_tax_id
        ) select tax_id, rank, name from ancestry;
    """, {'tax_id': tax_id})

    # keys must mirror the final select list above, not the columns of any one model
    columns = ('tax_id', 'rank', 'name')

    return jsonify({
        'ancestry': [dict(zip(columns, row)) for row in results]
    })


# ------------------------------------------------------
# --- mocked endpoints
# ------------------------------------------------------
Expand Down
41 changes: 10 additions & 31 deletions sequencer/db.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,22 @@
import sqlite3

import click
from flask import current_app, g
from flask import current_app
from flask.cli import with_appcontext
from flask_sqlalchemy import SQLAlchemy

from sequencer.default_settings import LOAD_TAXON_DATA

def get_db():
    """
    Returns the sqlite3 connection stored on flask's `g`, opening and storing one first if `g` has none yet.

    The connection is opened against the path in the app's DATABASE config value and yields sqlite3.Row rows.
    """
    if 'db' in g:
        return g.db

    connection = sqlite3.connect(
        current_app.config['DATABASE'],
        detect_types=sqlite3.PARSE_DECLTYPES
    )
    connection.row_factory = sqlite3.Row
    g.db = connection

    return g.db


def close_db(e=None):
    """
    Removes the connection stored on flask's `g` (if any) and closes it.

    :param e: unused; accepted so the function can be registered as a teardown callback
    """
    connection = g.pop('db', None)
    if connection is None:
        # nothing was opened, so there's nothing to close
        return

    connection.close()


def init_db():
    """
    Runs the statements in the app's schema.sql resource against the database connection.
    """
    connection = get_db()

    with current_app.open_resource('schema.sql') as schema_file:
        script = schema_file.read().decode('utf8')
        connection.executescript(script)
db = SQLAlchemy()


@click.command('init-db')
@with_appcontext
def init_db_command():
"""Clear the existing data and create new tables."""
init_db()
import sequencer.models
db.init_app(current_app)
db.create_all()
click.echo('Initialized the database.')


def init_app(app):
app.teardown_appcontext(close_db)
app.cli.add_command(init_db_command)
if LOAD_TAXON_DATA:
sequencer.models.load_taxons(click.echo)
click.echo('Loaded taxons from dump.')
11 changes: 10 additions & 1 deletion sequencer/default_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

# if true, mocks communication with the LEGO brick, producing sequences from real organisms
# (see support.ev3_reader.SAMPLE_SEQUENCES for details)
MOCK_COMM = False
MOCK_COMM = True

# if MOCK_COMM=True, the mock process duration is multiplied by this value (>1.0 slower, <1.0 faster)
TIME_MOD = 0.1
Expand All @@ -21,13 +21,22 @@
# time in seconds before the BLAST request times out, never if None
BLAST_TIMEOUT = 120

# minimum time in seconds between re-checking polls, since BLAST will occasionally return tiny values like 2 seconds
# set to 0 to disable the minimum
MIN_POLL_DELAY_SECS = 0

# if true, mocks the NCBI blast request process and returns a canned result
MOCK_BLAST = False

# if MOCK_BLAST and MOCK_BLAST_HAS_RESULTS are true, returns a canned file with hits;
# otherwise, returns a canned file with no hits
MOCK_BLAST_HAS_RESULTS = True

# if true, expects taxon data from the taxdump file (obtained from ftp://ftp.ncbi.nih.gov/pub/taxonomy/) to be
# extracted to a folder named ./data, and loads taxon info from those files into Taxon and TaxonName.
# the taxon data is currently unused, so it's recommended to leave this flag as false.
LOAD_TAXON_DATA = False


# -----------------------------------
# --- Caching
Expand Down
148 changes: 148 additions & 0 deletions sequencer/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import datetime

from tqdm import tqdm

from sequencer.db import db


# --------------------------------------------------------------------------------
# --- sequence query logging
# --------------------------------------------------------------------------------

class Sequence(db.Model):
    """
    Saves searched sequences along with the user who saved it.
    TODO: In the future, we'll also save the BLAST results, but for now it's always null.
    """

    __tablename__ = 'sequences'

    id = db.Column(db.Integer, primary_key=True)
    # the default must be the callable itself, not the result of calling it: a call here would be evaluated once
    # at import time, stamping every row with the moment the process started rather than the moment of insertion.
    created = db.Column(db.TIMESTAMP, default=datetime.datetime.now)
    username = db.Column(db.Text)
    sequence = db.Column(db.Text, nullable=False)
    # reserved for BLAST results (see the TODO above); the constructor never sets it
    results = db.Column(db.Text)

    def __init__(self, username=None, sequence=None):
        """
        :param username: name of the user who ran the query, if known
        :param sequence: the queried sequence as text; the column is non-nullable, so it must be set before commit
        """
        self.username = username
        self.sequence = sequence

    def __repr__(self):
        return '<Sequence %s>' % self.sequence


# --------------------------------------------------------------------------------
# --- taxonomic info models
# --------------------------------------------------------------------------------

class Taxon(db.Model):
    """
    One node of the taxonomic tree, as listed in nodes.dmp from ftp://ftp.ncbi.nih.gov/pub/taxonomy/. Nodes from
    every level of the tree live in this one table; 'rank' says which level a given node sits at.
    TODO: In the future, we'll use this to allow ordering/filtering of organism results by their taxonomic
    relationships, e.g. prioritizing "fuzzy things", filtering out viruses/bacteria, etc.
    """

    __tablename__ = 'taxons'

    # node id in GenBank taxonomy database
    tax_id = db.Column(db.BigInteger, primary_key=True)
    # parent node id in GenBank taxonomy database
    parent_tax_id = db.Column(db.BigInteger, db.ForeignKey('taxons.tax_id'))
    # rank of this node (superkingdom, kingdom, ...)
    rank = db.Column(db.Text)
    # locus-name prefix; not unique
    embl_code = db.Column(db.Text)
    # see division.dmp file
    division_id = db.Column(db.Text)
    # 1 if node inherits division from parent
    inherited_div_flag = db.Column(db.Boolean)
    # see gencode.dmp file
    genetic_code_id = db.Column(db.Text)
    # 1 if node inherits genetic code from parent
    inherited_GC__flag = db.Column(db.Boolean)
    # see gencode.dmp file
    mitochondrial_genetic_code_id = db.Column(db.BigInteger)
    # 1 if node inherits mitochondrial gencode from parent
    inherited_MGC_flag = db.Column(db.Boolean)
    # 1 if name is suppressed in GenBank entry lineage
    GenBank_hidden_flag = db.Column(db.Boolean)
    # 1 if this subtree has no sequence data yet
    hidden_subtree_root_flag = db.Column(db.Boolean)
    # free-text comments and citations
    comments = db.Column(db.Text)


class TaxonNames(db.Model):
    """
    Names of organisms keyed by the tax_id returned from NCBI's services; the companion table to Taxon.
    Holds the scientific and, in some cases, common name for a given tax_id.
    """

    __tablename__ = 'taxon_names'

    # the id of node associated with this name
    tax_id = db.Column(db.BigInteger, primary_key=True)
    # name itself
    name = db.Column(db.Text)
    # the unique variant of this name if name not unique
    unique_name = db.Column(db.Text)
    # (synonym, common name, ...)
    name_class = db.Column(db.Text)


# ---
# --- helper routines for loading taxon data
# ---

def _load_table(table_obj, source_file, columns, filter_expr=None, transform_expr=None, recs_before_commit=5000):
    """
    Replaces the contents of table_obj's table with rows parsed from a '|'-delimited columnar file.

    Each line is split on '|', every cell is stripped of surrounding whitespace, and the cells are paired
    positionally with `columns`; cells beyond the last named column are ignored.

    :param table_obj: the model class to instantiate for each row
    :param source_file: the name of the column-based file
    :param columns: the columns to load from the file into the model (field names and column names must match)
    :param filter_expr: a function applied to each row's dict that, if it returns falsey, will exclude that row
    :param transform_expr: a function applied to each row's dict that can transform its values in-place
    :param recs_before_commit: number of added records after which a commit is issued
    :return: None
    """

    # first, clear out any existing contents (this delete is committed together with the first batch of rows)
    db.session.query(table_obj).delete()

    pending = 0  # rows added to the session since the last commit

    with open(source_file) as fp:
        # iterate the file lazily rather than via readlines() so the whole dump never sits in memory at once;
        # tqdm then reports a running count and rate instead of a percentage
        for row in tqdm(fp):
            # a blank line has no cells to map; skip it rather than hand a near-empty dict to the callbacks
            if not row.strip():
                continue

            cells = dict(zip(columns, (x.strip() for x in row.split("|"))))

            # discards rows that don't pass the filter expression
            if filter_expr is not None and not filter_expr(cells):
                continue

            # transforms column values for this row in-place
            if transform_expr is not None:
                transform_expr(cells)

            db.session.add(table_obj(**cells))
            pending += 1

            # commit once exactly recs_before_commit rows have accumulated
            if pending >= recs_before_commit:
                db.session.commit()
                pending = 0

    # flush whatever is left over from the final, partial batch
    db.session.commit()


def load_taxons(echoer):
    """
    Populates the TaxonNames and Taxon tables from ./data/names.dmp and ./data/nodes.dmp, in that order.

    Only names whose name_class is 'scientific name' are kept; every '*_flag' column of a node is converted
    from its integer text form to a bool before the row is stored.

    :param echoer: a function called with a progress message before each table is loaded
    :return: None
    """
    name_columns = ['tax_id', 'name', 'unique_name', 'name_class']
    node_columns = [
        'tax_id',
        'parent_tax_id',
        'rank',
        'embl_code',
        'division_id',
        'inherited_div_flag',
        'genetic_code_id',
        'inherited_GC__flag',
        'mitochondrial_genetic_code_id',
        'inherited_MGC_flag',
        'GenBank_hidden_flag',
        'hidden_subtree_root_flag',
        'comments',
    ]

    def is_scientific_name(cells):
        return cells['name_class'] == 'scientific name'

    def flags_to_bools(cells):
        # snapshot the flag keys first, then rewrite their values in the same dict
        flag_keys = [key for key in cells if key.endswith('_flag')]
        for key in flag_keys:
            cells[key] = bool(int(cells[key]))
        return cells

    echoer("Loading taxon names...")
    _load_table(TaxonNames, './data/names.dmp', name_columns, filter_expr=is_scientific_name)

    echoer("Loading taxons...")
    _load_table(Taxon, './data/nodes.dmp', node_columns, transform_expr=flags_to_bools)
9 changes: 0 additions & 9 deletions sequencer/schema.sql

This file was deleted.

Loading

0 comments on commit 4621881

Please sign in to comment.