diff --git a/.travis.yml b/.travis.yml index 9bf07a7..0b82495 100644 --- a/.travis.yml +++ b/.travis.yml @@ -36,12 +36,15 @@ install: - pip install . - pip install coveralls script: - # human releases + # older human Ensembl releases - pyensembl install --release 75 --species human - pyensembl install --release 77 --species human - pyensembl install --release 81 --species human - # mouse releases + # latest human Ensembl release + - pyensembl install --release 83 --species human + # mouse tests written for mouse Ensembl #81 - pyensembl install --release 81 --species mouse + # now actually run the tests, generate a coverage report and run linter - nosetests test --with-coverage --cover-package=varcode && ./lint.sh after_success: coveralls diff --git a/requirements.txt b/requirements.txt index 7484a77..f424854 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy>=1.7, <2.0 pandas>=0.15 -pyensembl>=0.8.2 +pyensembl>=0.8.8 biopython>=1.64 pyvcf>=0.6.7 memoized_property>=1.0.2 diff --git a/setup.py b/setup.py index d345489..49b142b 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ setup( name='varcode', packages=find_packages(), - version="0.4.1", + version="0.4.2", description="Variant annotation in Python", long_description=readme, url="https://github.com/hammerlab/varcode", @@ -58,7 +58,7 @@ install_requires=[ 'numpy>=1.7, <2.0', 'pandas>=0.15', - 'pyensembl>=0.8.2', + 'pyensembl>=0.8.8', 'biopython>=1.64', 'pyvcf>=0.6.7', 'memoized_property>=1.0.2', diff --git a/test/benchmark_vcf_load.py b/test/benchmark_vcf_load.py index 00e2ab6..af1476d 100644 --- a/test/benchmark_vcf_load.py +++ b/test/benchmark_vcf_load.py @@ -13,13 +13,26 @@ import varcode parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument("path", help="Path or URL to VCF") -parser.add_argument("--profile", action="store_true", default=False, + +parser.add_argument( + "path", help="Path or URL to VCF") + +parser.add_argument( + "--profile", action="store_true", + default=False, help="Run in a profiler.") -parser.add_argument("--no-info-field", - dest="info_field", action="store_false", default=True) -parser.add_argument("--pyvcf", - help="use pyvcf implementation", action="store_true", default=False) + +parser.add_argument( + "--no-info-field", + dest="info_field", + action="store_false", + default=True) + +parser.add_argument( + "--pyvcf", + help="use pyvcf implementation", + action="store_true", + default=False) def run(): args = parser.parse_args() @@ -29,7 +42,7 @@ def run(): extra_args["include_info"] = False start = time.time() - + if args.pyvcf: result = varcode.load_vcf( args.path, diff --git a/test/data.py b/test/data.py index c3fa812..0335921 100644 --- a/test/data.py +++ b/test/data.py @@ -51,4 +51,4 @@ def data_path(name): snp_rs4244285, snp_rs1537415, snp_rs3892097, -]) \ No newline at end of file +]) diff --git a/test/test_dbnsfp_validation.py b/test/test_dbnsfp_validation.py index 8808e06..9ee3e32 100644 --- a/test/test_dbnsfp_validation.py +++ b/test/test_dbnsfp_validation.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pyensembl import EnsemblRelease +from pyensembl import ensembl_grch37 from varcode import ( ExonicSpliceSite, Substitution, @@ -23,8 +23,6 @@ from . import data_path -ensembl = EnsemblRelease(75) - def validate_transcript_mutation( ensembl_transcript_id, chrom, @@ -33,7 +31,7 @@ def validate_transcript_mutation( dna_alt, aa_pos, aa_alt): - variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl) + variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl_grch37) effects = variant.effects() transcript_id_dict = { effect.transcript.id: effect @@ -58,14 +56,14 @@ def validate_transcript_mutation( assert ( effect_aa_pos + 1 == aa_pos and effect_aa_alt == aa_alt), \ - "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % ( - aa_alt, - aa_pos, - chrom, - dna_position, - dna_ref, - dna_alt, - effect) + "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % ( + aa_alt, + aa_pos, + chrom, + dna_position, + dna_ref, + dna_alt, + effect) def test_dbnsfp_validation_set(): # check that amino acid substitution gives diff --git a/test/test_effect_classes.py b/test/test_effect_classes.py index 178c978..63ff8a6 100644 --- a/test/test_effect_classes.py +++ b/test/test_effect_classes.py @@ -24,7 +24,7 @@ # transcript effects # IncompleteTranscript, - NoncodingTranscript, + # NoncodingTranscript, TODO: write a noncoding transcript test FivePrimeUTR, ThreePrimeUTR, Intronic, @@ -41,10 +41,14 @@ ExonicSpliceSite, # TODO: SpliceDonor, SpliceReceptor ) -from pyensembl import ensembl_grch37, ensembl_grch38 +from pyensembl import ensembl_grch37, cached_release from .common import expect_effect +# tried using more recent releases but found that many of them +# are very specific to Ensembl data between releases 77-81 +ensembl_grch38 = cached_release(81) + def test_incomplete(): # transcript EGFR-009 (ENST00000450046 in Ensembl 78) # has an incomplete 3' end @@ -153,9 +157,9 @@ def test_exon_loss(): "17", 43082404, ref="".join([ - "CTTTTTCTGATGTGCTTTGTTCTGGATTTCGCAGGTCCTCAAGGGCAGAAGAGTCACTTATGATG", - "GAAGGGTAGCTGTTAGAAGGCTGGCTCCCATGCTGTTCTAACACAGCTTCAGTAATTAGATTAGT", - "TAAAGTGATGTGGTGTTTTCTGGCAAACTTGTACACGAGCAT" + "CTTTTTCTGATGTGCTTTGTTCTGGATTTCGCAGGTCCTCAAGGGCAGAAGAGTCACTTATGATG", + "GAAGGGTAGCTGTTAGAAGGCTGGCTCCCATGCTGTTCTAACACAGCTTCAGTAATTAGATTAGT", + "TAAAGTGATGTGGTGTTTTCTGGCAAACTTGTACACGAGCAT" ]), alt="", ensembl=ensembl_grch38) diff --git a/test/test_timings.py b/test/test_timings.py index 5044aa1..92c3cb9 100644 --- a/test/test_timings.py +++ b/test/test_timings.py @@ -22,7 +22,6 @@ def _time_variant_annotation(variant_collection): effects = variant_collection.effects() end_t = time.time() assert len(effects.groupby_variant()) == len(variant_collection) - elapsed_t = end_t - start_t return elapsed_t @@ -30,7 +29,7 @@ def _time_variant_annotation(variant_collection): def test_effect_timing( n_variants=100, random_seed=0, - n_warmup_variants=20): + n_warmup_variants=5): warmup_collection = random_variants( n_warmup_variants, random_seed=None) diff --git a/test/test_variant.py b/test/test_variant.py index a78227a..ab9033d 100644 --- a/test/test_variant.py +++ b/test/test_variant.py @@ -20,7 +20,9 @@ import cPickle as pickle except ImportError: import pickle -from pyensembl import ensembl77 + +from pyensembl import ensembl_grch38 + from varcode import Variant from nose.tools import eq_ @@ -106,7 +108,7 @@ def test_deletion_no_suffix(): def test_serialization(): variants = [ Variant( - 1, start=10, ref="AA", alt="AAT", ensembl=ensembl77), + 1, start=10, ref="AA", alt="AAT", ensembl=ensembl_grch38), Variant(10, start=15, ref="A", alt="G"), Variant(20, start=150, ref="", alt="G"), ] diff --git a/test/test_variant_collection.py b/test/test_variant_collection.py index 0b01eec..004a1e8 100644 --- a/test/test_variant_collection.py +++ b/test/test_variant_collection.py @@ -65,12 +65,13 @@ def test_gene_counts(): # eq_(coding_gene_counts, expected_counts) def test_serialization(): - original = VariantCollection([ + original = VariantCollection( + [ Variant( 1, start=10, ref="AA", alt="AAT", ensembl=77), Variant(10, start=15, ref="A", alt="G"), Variant(20, start=150, ref="", alt="G"), - ]) + ]) original.metadata[original[0]] = {"a": "b"} original.metadata[original[2]] = {"bar": 2} diff --git a/test/test_vcf.py b/test/test_vcf.py index 96ca45c..a0043f4 100644 --- a/test/test_vcf.py +++ b/test/test_vcf.py @@ -78,8 +78,7 @@ def test_pandas_and_pyvcf_implementations_equivalent(): {'path': data_path("multiallelic.vcf")}, {'path': data_path("mutect-example.vcf")}, {'path': data_path("strelka-example.vcf")}, - {'path': data_path("mutect-example-headerless.vcf"), - 'genome': cached_release(75)}, + {'path': data_path("mutect-example-headerless.vcf"), 'genome': cached_release(75)}, ] if RUN_TESTS_REQUIRING_INTERNET: paths.append({'path': VCF_EXTERNAL_URL}) diff --git a/varcode/effect_collection.py b/varcode/effect_collection.py index b1f3ad7..ae3ee2e 100644 --- a/varcode/effect_collection.py +++ b/varcode/effect_collection.py @@ -19,7 +19,7 @@ from .collection import Collection from .common import memoize -from .effects import MutationEffect, NonsilentCodingMutation +from .effects import MutationEffect from .effect_ordering import ( effect_priority, effect_sort_key, @@ -143,8 +143,7 @@ def drop_silent_and_noncoding(self): """ Create a new EffectCollection containing only non-silent coding effects """ - return self.filter( - lambda effect: isinstance(effect, NonsilentCodingMutation)) + return self.filter(lambda effect: effect.modifies_protein_sequence) def detailed_string(self): """ diff --git a/varcode/effect_ordering.py b/varcode/effect_ordering.py index 4d0f175..bf4d3a2 100644 --- a/varcode/effect_ordering.py +++ b/varcode/effect_ordering.py @@ -12,13 +12,38 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .effects import * +from .effects import ( + Failure, + IncompleteTranscript, + Intergenic, + Intragenic, + NoncodingTranscript, + Intronic, + ThreePrimeUTR, + FivePrimeUTR, + Silent, + Substitution, + Insertion, + Deletion, + ComplexSubstitution, + AlternateStartCodon, + IntronicSpliceSite, + ExonicSpliceSite, + StopLoss, + SpliceDonor, + SpliceAcceptor, + PrematureStop, + FrameShiftTruncation, + StartLoss, + FrameShift, + ExonLoss, +) transcript_effect_priority_list = [ Failure, + IncompleteTranscript, Intergenic, Intragenic, - IncompleteTranscript, NoncodingTranscript, Intronic, ThreePrimeUTR, diff --git a/varcode/effects.py b/varcode/effects.py index e22982f..04d5007 100644 --- a/varcode/effects.py +++ b/varcode/effects.py @@ -261,6 +261,14 @@ def mutant_protein_sequence(self): """ return self.alternate_effect.mutant_protein_sequence + @memoized_property + def modifies_protein_sequence(self): + return self.alternate_effect.modifies_protein_sequence + + @memoized_property + def modifies_coding_sequence(self): + return self.alternate_effect.modifies_coding_sequence + class CodingMutation(Exonic): """ Base class for all mutations which result in a modified coding sequence. @@ -410,9 +418,9 @@ def short_description(self): self.aa_mutation_start_offset) else: return "p.%s%d%s" % ( - self.aa_ref, - self.aa_mutation_start_offset + 1, - self.aa_alt) + self.aa_ref, + self.aa_mutation_start_offset + 1, + self.aa_alt) @memoized_property def mutant_protein_sequence(self): diff --git a/varcode/util.py b/varcode/util.py index a16994d..9940299 100644 --- a/varcode/util.py +++ b/varcode/util.py @@ -14,7 +14,6 @@ from __future__ import print_function, division, absolute_import import random -import logging from Bio.Seq import reverse_complement from pyensembl import EnsemblRelease @@ -48,45 +47,46 @@ def random_variants( variants = [] - while len(variants) < count: - transcript_id = rng.choice(transcript_ids) - transcript = ensembl.transcript_by_id(transcript_id) + # we should finish way before this loop is over but just in case + # something is wrong with PyEnsembl we want to avoid an infinite loop + for _ in range(count * 100): + if len(variants) < count: + transcript_id = rng.choice(transcript_ids) + transcript = ensembl.transcript_by_id(transcript_id) - if not transcript.complete: - continue + if not transcript.complete: + continue - exon = rng.choice(transcript.exons) - base1_genomic_position = rng.randint(exon.start, exon.end) - transcript_offset = transcript.spliced_offset(base1_genomic_position) - - try: + exon = rng.choice(transcript.exons) + base1_genomic_position = rng.randint(exon.start, exon.end) + transcript_offset = transcript.spliced_offset(base1_genomic_position) seq = transcript.sequence - except ValueError as e: - logging.warn(e) - # can't get sequence for non-coding transcripts - continue - ref = str(seq[transcript_offset]) - if transcript.on_backward_strand: - ref = reverse_complement(ref) + ref = str(seq[transcript_offset]) + if transcript.on_backward_strand: + ref = reverse_complement(ref) - alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref] + alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref] - if insertions: - nucleotide_pairs = [ - x + y - for x in STANDARD_NUCLEOTIDES - for y in STANDARD_NUCLEOTIDES - ] - alt_nucleotides.extend(nucleotide_pairs) - if deletions: - alt_nucleotides.append("") - alt = rng.choice(alt_nucleotides) - variant = Variant( - transcript.contig, - base1_genomic_position, - ref=ref, - alt=alt, - ensembl=ensembl) - variants.append(variant) - return VariantCollection(variants) + if insertions: + nucleotide_pairs = [ + x + y + for x in STANDARD_NUCLEOTIDES + for y in STANDARD_NUCLEOTIDES + ] + alt_nucleotides.extend(nucleotide_pairs) + if deletions: + alt_nucleotides.append("") + alt = rng.choice(alt_nucleotides) + variant = Variant( + transcript.contig, + base1_genomic_position, + ref=ref, + alt=alt, + ensembl=ensembl) + variants.append(variant) + else: + return VariantCollection(variants) + raise ValueError( + ("Unable to generate %d random variants, " + "there may be a problem with PyEnsembl") % count)