diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..ab83004 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,4 @@ +*.dump +*.dmp +*.prt +readme.txt diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..35a4546 --- /dev/null +++ b/data/README.md @@ -0,0 +1,9 @@ +# Taxon Data + +Download taxdump.zip/.tar.gz from ftp://ftp.ncbi.nih.gov/pub/taxonomy/ and extract it to this folder. Only nodes.dmp and +names.dmp are currently used. + +If sequencer.default_settings.LOAD_TAXON_DATA is true, this data will be loaded into taxonomy tables in the app. +(Note that, as of this writing, these tables are unused, so this is an entirely optional step.) + +Refer to readme.txt (supplied in the zip) for more information about these files. diff --git a/sequencer/__init__.py b/sequencer/__init__.py index 8aaf441..60e39fc 100644 --- a/sequencer/__init__.py +++ b/sequencer/__init__.py @@ -13,10 +13,12 @@ def create_app(test_config=None): app.config.from_object('sequencer.default_settings') app.config.from_mapping( SECRET_KEY='dev', - DATABASE=os.path.join(app.instance_path, 'sequencer.sqlite'), + # DATABASE=os.path.join(app.instance_path, 'sequencer.sqlite'), BLAST_DB_DIR=os.path.join(app.root_path, 'blast_db'), # CACHE_DIR=os.path.join(app.instance_path, 'cache'), - # CACHE_TYPE="filesystem" + # CACHE_TYPE="filesystem", + SQLALCHEMY_DATABASE_URI="sqlite:///%s" % os.path.join(app.instance_path, 'sequencer.sqlite'), + SQLALCHEMY_TRACK_MODIFICATIONS=False ) app.config.from_pyfile('application.cfg', silent=True) @@ -43,7 +45,8 @@ def create_app(test_config=None): # inject database management from . import db - db.init_app(app) + db.db.init_app(app) + app.cli.add_command(db.init_db_command) # inject api endpoints from . import api diff --git a/sequencer/api.py b/sequencer/api.py index 9478f9d..dd223c3 100644 --- a/sequencer/api.py +++ b/sequencer/api.py @@ -16,7 +16,8 @@ from sequencer import cache from sequencer.default_settings import MOCK_BLAST_HAS_RESULTS, USE_GIS_CACHING, BLAST_TIMEOUT from sequencer.support import ev3_reader -from sequencer.db import get_db +from sequencer.db import db +from sequencer.models import Sequence, TaxonNames from sequencer.support.blaster import blast_sequence, blast_sequence_local from sequencer.support.ev3_reader import query_full_sequence @@ -66,8 +67,13 @@ def nudge(direction, amount): def query_ev3(): username = request.args.get('username', 'anonymous') + def save_to_db(uname, sequence): + candidate = Sequence(username=uname, sequence=sequence) + db.session.add(candidate) + db.session.commit() + return candidate.id + def g(): - db = get_db() readings = [] yield "[" # delimiters are for whoever's waiting for a full sequence @@ -81,19 +87,17 @@ def g(): readings.append(row) yield json.dumps(row) + if not_first: + yield ',' + yield json.dumps({'query_id': save_to_db(username, json.dumps(readings))}) yield "]" - db.execute('insert into sequences (username, sequence) values (?, ?)', (username, json.dumps(readings),)) - db.commit() - try: if request.args.get('streaming') == 'true': return Response(stream_with_context(g())) else: - o_db = get_db() payload = list(query_full_sequence()) - o_db.execute('insert into sequences (username, sequence) values (?, ?)', (username, json.dumps(payload),)) - o_db.commit() + payload.append({'query_id': save_to_db(username, json.dumps(payload))}) return jsonify(payload) @@ -178,7 +182,15 @@ def species_img(): }) result = resp.json() - cached_val = [x['link'] for x in result['items']] + try: + 
cached_val = [x['link'] for x in result['items']]
+        cache.set(cache_key, cached_val)
+    except KeyError:
+        # if we don't have any items, don't even bother updating the cache and just return nothing
+        cached_val = []
+
+        return jsonify({'results': []})
+
     cache.set(cache_key, cached_val)
 
     return jsonify({
@@ -186,6 +198,34 @@ def species_img():
     })
 
 
+# ------------------------------------------------------
+# --- taxonomic information
+# ------------------------------------------------------
+
+@bp.route('/taxonomy/<tax_id>')
+def ancestry(tax_id):
+    results = db.engine.execute("""
+    with recursive ancestry(tax_id, rank, name) as (
+        select taxons.tax_id, taxons.rank, taxon_names.name from taxons
+        inner join taxon_names on taxon_names.tax_id=taxons.tax_id
+        where taxons.tax_id=:tax_id
+
+        union all
+
+        select T.parent_tax_id, T.rank, TN.name
+        from taxons T, ancestry
+        inner join taxon_names TN on TN.tax_id=T.tax_id
+        where T.tax_id=ancestry.tax_id and T.tax_id != T.parent_tax_id
+    ) select * from ancestry;
+    """, {'tax_id': tax_id})
+
+    columns = [x.name for x in TaxonNames.__table__.columns]
+
+    return jsonify({
+        'ancestry': list(dict(zip(columns, x)) for x in results)
+    })
+
+
 # ------------------------------------------------------
 # --- mocked endpoints
 # ------------------------------------------------------
diff --git a/sequencer/db.py b/sequencer/db.py
index ef6e138..0162095 100644
--- a/sequencer/db.py
+++ b/sequencer/db.py
@@ -1,43 +1,22 @@
-import sqlite3
-
 import click
-from flask import current_app, g
+from flask import current_app
 from flask.cli import with_appcontext
+from flask_sqlalchemy import SQLAlchemy
+from sequencer.default_settings import LOAD_TAXON_DATA
 
 
-def get_db():
-    if 'db' not in g:
-        g.db = sqlite3.connect(
-            current_app.config['DATABASE'],
-            detect_types=sqlite3.PARSE_DECLTYPES
-        )
-        g.db.row_factory = sqlite3.Row
-
-    return g.db
-
-
-def close_db(e=None):
-    db = g.pop('db', None)
-
-    if db is not None:
-        db.close()
-
-
-def init_db():
-    db = get_db()
-
-    with current_app.open_resource('schema.sql') as f:
-        db.executescript(f.read().decode('utf8'))
+db = SQLAlchemy()
 
 
 @click.command('init-db')
 @with_appcontext
 def init_db_command():
     """Clear the existing data and create new tables."""
-    init_db()
+    import sequencer.models
+    db.init_app(current_app)
+    db.create_all()
     click.echo('Initialized the database.')
 
-
-def init_app(app):
-    app.teardown_appcontext(close_db)
-    app.cli.add_command(init_db_command)
+    if LOAD_TAXON_DATA:
+        sequencer.models.load_taxons(click.echo)
+        click.echo('Loaded taxons from dump.')
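A minimal usage sketch of the new SQLAlchemy layer (not part of the patch): it assumes an in-memory SQLite URI and an invented 'demo' record, and uses the Sequence model defined in sequencer/models.py further down in this diff.

from flask import Flask

from sequencer.db import db
from sequencer.models import Sequence

app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite://'   # throwaway in-memory database for the sketch
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db.init_app(app)                                      # same binding create_app() now performs

with app.app_context():
    db.create_all()                                   # replaces the old schema.sql bootstrap
    db.session.add(Sequence(username='demo', sequence='[]'))
    db.session.commit()
    print(Sequence.query.count())                     # -> 1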
diff --git a/sequencer/default_settings.py b/sequencer/default_settings.py
index ce724af..810e1bd 100644
--- a/sequencer/default_settings.py
+++ b/sequencer/default_settings.py
@@ -4,7 +4,7 @@
 # if true, mocks communication with the LEGO brick, producing sequences from real organisms
 # (see support.ev3_reader.SAMPLE_SEQUENCES for details)
-MOCK_COMM = False
+MOCK_COMM = True
 
 # if MOCK_COMM=True, the mock process duration is multiplied by this value (>1.0 slower, <1.0 faster)
 TIME_MOD = 0.1
 
@@ -21,6 +21,10 @@
 # time in seconds before the BLAST request times out, never if None
 BLAST_TIMEOUT = 120
 
+# minimum time in seconds between re-checking polls, since BLAST will occasionally return tiny values like 2 seconds
+# set to 0 to disable the minimum
+MIN_POLL_DELAY_SECS = 0
+
 # if true, mocks the NCBI blast request process and returns a canned result
 MOCK_BLAST = False
 
@@ -28,6 +32,11 @@
 # otherwise, returns a canned file with no hits
 MOCK_BLAST_HAS_RESULTS = True
 
+# if true, expects taxon data from the taxdump file (obtained from ftp://ftp.ncbi.nih.gov/pub/taxonomy/) to be
+# extracted to a folder named ./data, and loads taxon info from those files into Taxon and TaxonNames.
+# the taxon data is currently unused, so it's recommended to leave this flag as false.
+LOAD_TAXON_DATA = False
+
 
 # -----------------------------------
 # --- Caching
diff --git a/sequencer/models.py b/sequencer/models.py
new file mode 100644
index 0000000..21f2abe
--- /dev/null
+++ b/sequencer/models.py
@@ -0,0 +1,148 @@
+import datetime
+
+from tqdm import tqdm
+
+from sequencer.db import db
+
+
+# --------------------------------------------------------------------------------
+# --- sequence query logging
+# --------------------------------------------------------------------------------
+
+class Sequence(db.Model):
+    """
+    Saves searched sequences along with the user who submitted them.
+
+    TODO: In the future, we'll also save the BLAST results, but for now it's always null.
+    """
+
+    __tablename__ = 'sequences'
+    id = db.Column(db.Integer, primary_key=True)
+    created = db.Column(db.TIMESTAMP, default=datetime.datetime.now)
+    username = db.Column(db.Text)
+    sequence = db.Column(db.Text, nullable=False)
+    results = db.Column(db.Text)
+
+    def __init__(self, username=None, sequence=None):
+        self.username = username
+        self.sequence = sequence
+
+    def __repr__(self):
+        return '<Sequence %r>' % self.sequence
+
+
+# --------------------------------------------------------------------------------
+# --- taxonomic info models
+# --------------------------------------------------------------------------------
+
+class Taxon(db.Model):
+    """
+    Table of taxonomic data, from ftp://ftp.ncbi.nih.gov/pub/taxonomy/ (specifically the nodes.dmp file). Note that this
+    table includes all levels of the taxonomic tree; the organizational level is stored in the 'rank' field.
+
+    TODO: In the future, we'll use this to allow ordering/filtering of organism results by their taxonomic relationships,
+    e.g. prioritizing "fuzzy things", filtering out viruses/bacteria, etc.
+    """
+    __tablename__ = 'taxons'
+    tax_id = db.Column(db.BigInteger, primary_key=True)  # node id in GenBank taxonomy database
+    parent_tax_id = db.Column(db.BigInteger, db.ForeignKey('taxons.tax_id'))  # parent node id in GenBank taxonomy database
+    rank = db.Column(db.Text)  # rank of this node (superkingdom, kingdom, ...)
+    embl_code = db.Column(db.Text)  # locus-name prefix; not unique
+    division_id = db.Column(db.Text)  # see division.dmp file
+    inherited_div_flag = db.Column(db.Boolean)  # 1 if node inherits division from parent
+    genetic_code_id = db.Column(db.Text)  # see gencode.dmp file
+    inherited_GC_flag = db.Column(db.Boolean)  # 1 if node inherits genetic code from parent
+    mitochondrial_genetic_code_id = db.Column(db.BigInteger)  # see gencode.dmp file
+    inherited_MGC_flag = db.Column(db.Boolean)  # 1 if node inherits mitochondrial gencode from parent
+    GenBank_hidden_flag = db.Column(db.Boolean)  # 1 if name is suppressed in GenBank entry lineage
+    hidden_subtree_root_flag = db.Column(db.Boolean)  # 1 if this subtree has no sequence data yet
+    comments = db.Column(db.Text)  # free-text comments and citations
+
+
+class TaxonNames(db.Model):
+    """
+    Like the Taxon model, contains information about organisms based on a tax_id returned from NCBI's services.
+
+    This table stores scientific and, in some cases, common names of organisms for a given tax_id.
+    """
+    __tablename__ = 'taxon_names'
+    tax_id = db.Column(db.BigInteger, primary_key=True)  # the id of node associated with this name
+    name = db.Column(db.Text)  # name itself
+    unique_name = db.Column(db.Text)  # the unique variant of this name if name not unique
+    name_class = db.Column(db.Text)  # (synonym, common name, ...)
+
+
+# ---
+# --- helper routines for loading taxon data
+# ---
+
+def _load_table(table_obj, source_file, columns, filter_expr=None, transform_expr=None, recs_before_commit=5000):
+    """
+    Loads a columnar file's rows as a series of table_obj instances.
+    :param table_obj: the model class to load
+    :param source_file: the name of the column-based file
+    :param columns: the columns to load from the file into the model (field names and column names must match)
+    :param filter_expr: a function applied to each row; rows for which it returns a falsey value are excluded
+    :param transform_expr: a function applied to each row that can transform values in each cell of the row
+    :param recs_before_commit: number of records to process before issuing a commit
+    :return: None
+    """
+
+    # first, clear out any existing contents
+    db.session.query(table_obj).delete()
+
+    with open(source_file) as fp:
+        commit_counter = recs_before_commit
+        for row in tqdm(fp.readlines()):
+            cells = dict(zip(columns, (x.strip() for x in row.split("|"))))
+
+            # discards rows that don't pass the filter expression
+            if filter_expr is not None and not filter_expr(cells):
+                continue
+
+            # transforms column values for this row in-place
+            if transform_expr is not None:
+                transform_expr(cells)
+
+            db.session.add(table_obj(**cells))
+
+            if commit_counter > 0:
+                commit_counter -= 1
+            else:
+                commit_counter = recs_before_commit
+                db.session.commit()
+
+    db.session.commit()
+
+
+def load_taxons(echoer):
+    echoer("Loading taxon names...")
+    _load_table(TaxonNames, './data/names.dmp', [
+        'tax_id',
+        'name',
+        'unique_name',
+        'name_class'
+    ], filter_expr=lambda x: x['name_class'] == 'scientific name')
+
+    def transform_bools(cells):
+        for k in cells:
+            if k.endswith('_flag'):
+                cells[k] = bool(int(cells[k]))
+        return cells
+
+    echoer("Loading taxons...")
+    _load_table(Taxon, './data/nodes.dmp', [
+        'tax_id',
+        'parent_tax_id',
+        'rank',
+        'embl_code',
+        'division_id',
+        'inherited_div_flag',
+        'genetic_code_id',
+        'inherited_GC_flag',
+        'mitochondrial_genetic_code_id',
+        'inherited_MGC_flag',
+        'GenBank_hidden_flag',
+        'hidden_subtree_root_flag',
+        'comments',
+    ], transform_expr=transform_bools)
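For reference, a small sketch (not part of the patch) of how _load_table interprets a single names.dmp row: fields are '|'-delimited with surrounding tabs, and load_taxons keeps only the 'scientific name' rows. The sample row below follows the taxdump format but is hard-coded here for illustration.

columns = ['tax_id', 'name', 'unique_name', 'name_class']
row = "9606\t|\tHomo sapiens\t|\t\t|\tscientific name\t|\n"  # typical names.dmp line

# same split/strip/zip that _load_table performs per row
cells = dict(zip(columns, (x.strip() for x in row.split("|"))))

# same check as the filter_expr passed in by load_taxons
if cells['name_class'] == 'scientific name':
    print(cells)  # {'tax_id': '9606', 'name': 'Homo sapiens', 'unique_name': '', 'name_class': 'scientific name'}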
+ """ + __tablename__ = 'taxon_names' + tax_id = db.Column(db.BigInteger, primary_key=True) # the id of node associated with this name + name = db.Column(db.Text) # name itself + unique_name = db.Column(db.Text) # the unique variant of this name if name not unique + name_class = db.Column(db.Text) # (synonym, common name, ...) + + +# --- +# --- helper routines for loading taxon data +# --- + +def _load_table(table_obj, source_file, columns, filter_expr=None, transform_expr=None, recs_before_commit=5000): + """ + Loads a columnar file's rows as a series of table_obj instances. + :param table_obj: the model instance to load + :param source_file: the name of the column-based file + :param columns: the columns to load from the file into the model (field names and column names must match) + :param filter_expr: a function applied to each row that, if returns falsey, will exclude that row + :param transform_expr: a function applied to each row that can transform values in each cell of the row + :param recs_before_commit: number of records to process before issuing a commit + :return: None + """ + + # first, clear out any existing contents + db.session.query(table_obj).delete() + + with open(source_file) as fp: + commit_counter = recs_before_commit + for row in tqdm(fp.readlines()): + cells = dict(zip(columns, (x.strip() for x in row.split("|")))) + + # discards rows that don't pass the filter expression + if filter_expr is not None and not filter_expr(cells): + continue + + # transforms column values for this row in-place + if transform_expr is not None: + transform_expr(cells) + + db.session.add(table_obj(**cells)) + + if commit_counter > 0: + commit_counter -= 1 + else: + commit_counter = recs_before_commit + db.session.commit() + + db.session.commit() + + +def load_taxons(echoer): + echoer("Loading taxon names...") + _load_table(TaxonNames, './data/names.dmp', [ + 'tax_id', + 'name', + 'unique_name', + 'name_class' + ], filter_expr=lambda x: x['name_class'] == 'scientific name') + + def transform_bools(cells): + for k in cells: + if k.endswith('_flag'): + cells[k] = bool(int(cells[k])) + return cells + + echoer("Loading taxons...") + _load_table(Taxon, './data/nodes.dmp', [ + 'tax_id', + 'parent_tax_id', + 'rank', + 'embl_code', + 'division_id', + 'inherited_div_flag', + 'genetic_code_id', + 'inherited_GC__flag', + 'mitochondrial_genetic_code_id', + 'inherited_MGC_flag', + 'GenBank_hidden_flag', + 'hidden_subtree_root_flag', + 'comments', + ], transform_expr=transform_bools) diff --git a/sequencer/schema.sql b/sequencer/schema.sql deleted file mode 100644 index 2892bf4..0000000 --- a/sequencer/schema.sql +++ /dev/null @@ -1,9 +0,0 @@ -DROP TABLE IF EXISTS sequences; - -CREATE TABLE sequences ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, - username TEXT, - sequence TEXT NOT NULL, - results TEXT -); diff --git a/sequencer/support/blaster.py b/sequencer/support/blaster.py index d3b69ea..510fdc3 100644 --- a/sequencer/support/blaster.py +++ b/sequencer/support/blaster.py @@ -1,10 +1,10 @@ import json +import os import re from io import StringIO from time import sleep, time import requests -import pyblast from sequencer.cache import cache @@ -57,17 +57,20 @@ # # =========================================================================== -from ..default_settings import MOCK_BLAST, USE_BLAST_CACHING +from ..default_settings import MOCK_BLAST, USE_BLAST_CACHING, MIN_POLL_DELAY_SECS BLAST_URL = ( "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi" if not 
diff --git a/sequencer/support/blaster.py b/sequencer/support/blaster.py
index d3b69ea..510fdc3 100644
--- a/sequencer/support/blaster.py
+++ b/sequencer/support/blaster.py
@@ -1,10 +1,10 @@
 import json
+import os
 import re
 from io import StringIO
 from time import sleep, time
 
 import requests
-import pyblast
 
 from sequencer.cache import cache
 
@@ -57,17 +57,20 @@
 #
 # ===========================================================================
 
-from ..default_settings import MOCK_BLAST, USE_BLAST_CACHING
+from ..default_settings import MOCK_BLAST, USE_BLAST_CACHING, MIN_POLL_DELAY_SECS
 
 BLAST_URL = (
     "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi" if not MOCK_BLAST else "http://localhost:5000/api/mock_blast"
 )
 
+LOG_STEPS = True
+LOG_TEMPLATE = os.path.join('logs', 'seq_%(seq)s_%(id)s-%(step)s.html')
+
 
 def blast_sequence_local(sequence, blast_db_dir):
     fp = StringIO(sequence)
-    pyblast.blastn(fp, db=blast_db_dir)
+    # pyblast.blastn(fp, db=blast_db_dir)
 
 
 def blast_sequence(sequence, database="nr", program='megablast', timeout=None):
@@ -90,6 +93,16 @@ def blast_sequence(sequence, database="nr", program='megablast', timeout=None):
             yield {'results': cached_val}
             return
 
+    # if LOG_STEPS is enabled, save each step of the BLAST query to disk for posterity
+    step_idx = 1
+
+    def log_step(this_resp, step):
+        nonlocal step_idx
+        if LOG_STEPS:
+            with open(LOG_TEMPLATE % {'seq': sequence, 'id': step_idx, 'step': step}, 'wb') as fp:
+                fp.write(this_resp.content)
+            step_idx += 1
+
     # ------------------------------------------------------
     # --- step 1. send initial request
     # ------------------------------------------------------
@@ -111,11 +124,11 @@ def blast_sequence(sequence, database="nr", program='megablast', timeout=None):
         'QUERY': sequence,
         'WORD_SIZE': '7',
 
-        'EXPECT': '10000',
+        'EXPECT': '1000',
         'HITLIST_SIZE': '100',
         'MATCH_SCORES': '1,-3',
         'NUCL_REWARD': '1',
-        'NUCL_PENALTY': '-2',
+        'NUCL_PENALTY': '-3',
 
         'GAPCOSTS': '5 2',
         'FILTER': 'F'
@@ -126,6 +139,7 @@ def blast_sequence(sequence, database="nr", program='megablast', timeout=None):
     # params['MEGABLAST'] = 'on'
 
     resp = requests.post(BLAST_URL, data=params)
+    log_step(resp, 'init')
 
     # parse out result id, estimated time to completion
     result_id_match = re.search(r'^ {4}RID = (.*$)', resp.text, flags=re.MULTILINE)
@@ -141,7 +155,10 @@ def blast_sequence(sequence, database="nr", program='megablast', timeout=None):
     # --- step 2. wait estimated time until we should check for response
     # ------------------------------------------------------
 
-    yield {'status': "Waiting %d seconds for results for %s to be ready..." % (estimated_completion_secs, result_id)}
+    yield {
+        'status': "Waiting %d seconds for results for %s to be ready..."
% (estimated_completion_secs, result_id), + 'job_id': result_id + } sleep(estimated_completion_secs) yield {'status': "...done waiting, checking now."} @@ -153,8 +170,11 @@ def blast_sequence(sequence, database="nr", program='megablast', timeout=None): total_wait = estimated_completion_secs # establish timeout if it was specified end_time = time() + timeout if timeout else None + status_idx = 0 while True: + status_idx += 1 + # if end_time and time() > end_time: # yield {"status": "Timeout of %d seconds reached while waiting for results" % timeout} # return None @@ -165,6 +185,7 @@ def blast_sequence(sequence, database="nr", program='megablast', timeout=None): "FORMAT_OBJECT": "SearchInfo", "RID": result_id }) + log_step(resp, 'status_%d' % status_idx) # parse out the status status_match = re.search(r'\s+Status=([A-Z]+)', resp.text, flags=re.MULTILINE) @@ -176,8 +197,8 @@ def blast_sequence(sequence, database="nr", program='megablast', timeout=None): if status == 'WAITING': # parse out the estimated wait time ("updated in 12 seconds") - parsed_wait_time = result_id_match = re.search(r'updated in (.+) seconds', resp.text, flags=re.MULTILINE) - wait_time = int(parsed_wait_time.group(1)) if parsed_wait_time else 5 + parsed_wait_time = re.search(r'updated in (.+) seconds', resp.text, flags=re.MULTILINE) + wait_time = max(int(parsed_wait_time.group(1)) if parsed_wait_time else 5, MIN_POLL_DELAY_SECS) if total_wait + wait_time > timeout: yield {"status": "Timeout of %d seconds reached while waiting for results" % timeout} @@ -187,11 +208,14 @@ def blast_sequence(sequence, database="nr", program='megablast', timeout=None): total_wait += wait_time sleep(wait_time) continue + elif status == 'UNKNOWN': raise Exception("Search for %s expired, terminating" % result_id) + elif status == 'READY': yield {'status': "Completed! 
Fetching results..."} break + else: yield {'status': "No hits found"} return None @@ -206,6 +230,7 @@ def blast_sequence(sequence, database="nr", program='megablast', timeout=None): "FORMAT_TYPE": "JSON2_S", "RID": result_id }) + log_step(resp, 'result') # populate the cache with the results and return the result parsed_result = json.loads(resp.text) diff --git a/sequencer/support/ev3_reader.py b/sequencer/support/ev3_reader.py index 5ed3870..3da085d 100644 --- a/sequencer/support/ev3_reader.py +++ b/sequencer/support/ev3_reader.py @@ -82,51 +82,51 @@ def query_sequencer(): m.run_to_rel_pos(position_sp=BRICK_DEG * NUM_BRICKS, speed_sp=900, stop_action="hold") -# SAMPLE_SEQUENCES = [ -# # >ENA|BAA20512|BAA20512.1 Cyprinus carpio (common carp) alpha-globin -# """ -# ATGAGTCTCTCTGATAAGGACAAGGCTGCTGTGAAAGCCCTATGGGCTAAGATCAGCCCC -# AAAGCCGATGATATCGGCGCTGAAGCTCTCGGCAGAATGCTGACCGTCTACCCTCAGACC -# AAGACCTACTTCGCTCACTGGGATGACCTGAGCCCTGGGTCCGGTCCTGTGAAGAAGCAT -# GGCAAGGTTATCATGGGTGCAGTGGCCGATGCCGTTTCAAAAATAGACGACCTTGTGGGA -# GGTCTGGCCTCCCTGAGCGAACTTCATGCTTCCAAGCTGCGTGTTGACCCGGCCAACTTC -# AAGATCCTCGCACACAATGTCATCGTGGTCATCGGCATGCTCTTCCCTGGAGACTTCCCC -# CCAGAGGTTCACATGTCAGTTGACAAGTTTTTCCAGAACTTGGCTCTGGCTCTCTCTGAG -# AAGTACCGCTAA""", -# # >ENA|CAA23748|CAA23748.1 Homo sapiens (human) alpha globin -# """ -# ATGGTGCTGTCTCCTGCCGACAAGACCAACGTCAAGGCCGCCTGGGGTAAGGTCGGCGCG -# CACGCTGGCGAGTATGGTGCGGAGGCCCTGGAGAGGATGTTCCTGTCCTTCCCCACCACC -# AAGACCTACTTCCCGCACTTCGACCTGAGCCACGGCTCTGCCCAAGTTAAGGGCCACGGC -# AAGAAGGTGGCCGACGCGCTGACCAACGCCGTGGCGCACGTGGACGACATGCCCAACGCG -# CTGTCCGCCCTGAGCGACCTGCACGCGCACAAGCTTCGGGTGGACCCGGTCAACTTCAAG -# CTCCTAAGCCACTGCCTGCTGGTGACCCTGGCCGCCCACCTCCCCGCCGAGTTCACCCCT -# GCGGTGCACGCTTCCCTGGACAAGTTCCTGGCTTCTGTGAGCACCGTGCTGACCTCCAAA -# TACCGTTAA""", -# # >ENA|CAA24095|CAA24095.1 Mus musculus (house mouse) alpha-globin -# """ATGGTGCTCTCTGGGGAAGACAAAAGCAACATCAAGGCTGCCTGGGGGAAGATTGGTGGC -# CATGGTGCTGAATATGGAGCTGAAGCCCTGGAAAGGATGTTTGCTAGCTTCCCCACCACC -# AAGACCTACTTTCCTCACTTTGATGTAAGCCACGGCTCTGCCCAGGTCAAGGGTCACGGC -# AAGAAGGTCGCCGATGCGCTGGCCAGTGCTGCAGGCCACCTCGATGACCTGCCCGGTGCC -# TTGTCTGCTCTGAGCGACCTGCATGCCCACAAGCTGCGTGTGGATCCCGTCAACTTCAAG -# CTCCTGAGCCACTGCCTGCTGGTGACCTTGGCTAGCCACCACCCTGCCGATTTCACCCCC -# GCGGTACATGCCTCTCTGGACAAATTCCTTGCCTCTGTGAGCACCGTGCTGACCTCCAAG -# TACCGTTAA""", -# # >ENA|CAA28435|CAA28435.1 Capra hircus (goat) alpha-globin -# """ATGTCTCTGACCAGGACTGAGAGGACCATCATCCTGTCCCTGTGGAGCAAGATCTCCACA -# CAGGCAGACGTCATTGGCACCGAGACCCTGGAGAGGCTCTTCTCCTGCTACCCGCAGGCC -# AAGACCTACTTCCCGCACTTCGACCTGCACTCGGGCTCCGCGCAGCTGCGCGCGCACGGC -# TCCAAGGTGGTGGCCGCCGTGGGCGACGCGGTCAAGAGCATCGACAACGTGACGAGCGCG -# CTGTCCAAGCTGAGCGAGCTGCACGCCTACGTGCTGCGCGTGGACCCGGTCAACTTCAAG -# TTCCTGTCCCACTGCCTGCTGGTCACGTTGGCCTCGCACTTCCCCGCCGACTTCACGGCC -# GACGCGCACGCCGCCTGGGACAAGTTCCTGTCCATCGTGTCCGGCGTCCTGACGGAGAAG -# TACCGCTGA""" -# ] - SAMPLE_SEQUENCES = [ - "ATGAGTCTCTCTGATAAGGACA" + # >ENA|BAA20512|BAA20512.1 Cyprinus carpio (common carp) alpha-globin + """ + ATGAGTCTCTCTGATAAGGACAAGGCTGCTGTGAAAGCCCTATGGGCTAAGATCAGCCCC + AAAGCCGATGATATCGGCGCTGAAGCTCTCGGCAGAATGCTGACCGTCTACCCTCAGACC + AAGACCTACTTCGCTCACTGGGATGACCTGAGCCCTGGGTCCGGTCCTGTGAAGAAGCAT + GGCAAGGTTATCATGGGTGCAGTGGCCGATGCCGTTTCAAAAATAGACGACCTTGTGGGA + GGTCTGGCCTCCCTGAGCGAACTTCATGCTTCCAAGCTGCGTGTTGACCCGGCCAACTTC + AAGATCCTCGCACACAATGTCATCGTGGTCATCGGCATGCTCTTCCCTGGAGACTTCCCC + CCAGAGGTTCACATGTCAGTTGACAAGTTTTTCCAGAACTTGGCTCTGGCTCTCTCTGAG + AAGTACCGCTAA""", + # >ENA|CAA23748|CAA23748.1 Homo sapiens (human) alpha globin + """ + 
ATGGTGCTGTCTCCTGCCGACAAGACCAACGTCAAGGCCGCCTGGGGTAAGGTCGGCGCG + CACGCTGGCGAGTATGGTGCGGAGGCCCTGGAGAGGATGTTCCTGTCCTTCCCCACCACC + AAGACCTACTTCCCGCACTTCGACCTGAGCCACGGCTCTGCCCAAGTTAAGGGCCACGGC + AAGAAGGTGGCCGACGCGCTGACCAACGCCGTGGCGCACGTGGACGACATGCCCAACGCG + CTGTCCGCCCTGAGCGACCTGCACGCGCACAAGCTTCGGGTGGACCCGGTCAACTTCAAG + CTCCTAAGCCACTGCCTGCTGGTGACCCTGGCCGCCCACCTCCCCGCCGAGTTCACCCCT + GCGGTGCACGCTTCCCTGGACAAGTTCCTGGCTTCTGTGAGCACCGTGCTGACCTCCAAA + TACCGTTAA""", + # >ENA|CAA24095|CAA24095.1 Mus musculus (house mouse) alpha-globin + """ATGGTGCTCTCTGGGGAAGACAAAAGCAACATCAAGGCTGCCTGGGGGAAGATTGGTGGC + CATGGTGCTGAATATGGAGCTGAAGCCCTGGAAAGGATGTTTGCTAGCTTCCCCACCACC + AAGACCTACTTTCCTCACTTTGATGTAAGCCACGGCTCTGCCCAGGTCAAGGGTCACGGC + AAGAAGGTCGCCGATGCGCTGGCCAGTGCTGCAGGCCACCTCGATGACCTGCCCGGTGCC + TTGTCTGCTCTGAGCGACCTGCATGCCCACAAGCTGCGTGTGGATCCCGTCAACTTCAAG + CTCCTGAGCCACTGCCTGCTGGTGACCTTGGCTAGCCACCACCCTGCCGATTTCACCCCC + GCGGTACATGCCTCTCTGGACAAATTCCTTGCCTCTGTGAGCACCGTGCTGACCTCCAAG + TACCGTTAA""", + # >ENA|CAA28435|CAA28435.1 Capra hircus (goat) alpha-globin + """ATGTCTCTGACCAGGACTGAGAGGACCATCATCCTGTCCCTGTGGAGCAAGATCTCCACA + CAGGCAGACGTCATTGGCACCGAGACCCTGGAGAGGCTCTTCTCCTGCTACCCGCAGGCC + AAGACCTACTTCCCGCACTTCGACCTGCACTCGGGCTCCGCGCAGCTGCGCGCGCACGGC + TCCAAGGTGGTGGCCGCCGTGGGCGACGCGGTCAAGAGCATCGACAACGTGACGAGCGCG + CTGTCCAAGCTGAGCGAGCTGCACGCCTACGTGCTGCGCGTGGACCCGGTCAACTTCAAG + TTCCTGTCCCACTGCCTGCTGGTCACGTTGGCCTCGCACTTCCCCGCCGACTTCACGGCC + GACGCGCACGCCGCCTGGGACAAGTTCCTGTCCATCGTGTCCGGCGTCCTGACGGAGAAG + TACCGCTGA""" ] +# SAMPLE_SEQUENCES = [ +# "ATGAGTCTCTCTGATAAGGACA" +# ] + BASE_TO_COL = { 'A': 'green', 'C': 'blue',