Skip to content

Commit

Permalink
Merge pull request #137 from hammerlab/exonic-splice-site-can-be-coding
Browse files Browse the repository at this point in the history
Modest change to filtering of coding mutations to include ExonicSpliceSite
  • Loading branch information
iskandr committed Feb 25, 2016
2 parents 9c9b648 + cdc58f0 commit 7d1845c
Show file tree
Hide file tree
Showing 15 changed files with 134 additions and 83 deletions.
7 changes: 5 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,15 @@ install:
- pip install .
- pip install coveralls
script:
# human releases
# older human Ensembl releases
- pyensembl install --release 75 --species human
- pyensembl install --release 77 --species human
- pyensembl install --release 81 --species human
# mouse releases
# latest human Ensembl release
- pyensembl install --release 83 --species human
# mouse tests were written for mouse Ensembl release 81
- pyensembl install --release 81 --species mouse
# now actually run the tests, generate a coverage report and run linter
- nosetests test --with-coverage --cover-package=varcode && ./lint.sh
after_success:
coveralls
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
numpy>=1.7, <2.0
pandas>=0.15
pyensembl>=0.8.2
pyensembl>=0.8.8
biopython>=1.64
pyvcf>=0.6.7
memoized_property>=1.0.2
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
setup(
name='varcode',
packages=find_packages(),
version="0.4.1",
version="0.4.2",
description="Variant annotation in Python",
long_description=readme,
url="https://github.com/hammerlab/varcode",
Expand All @@ -58,7 +58,7 @@
install_requires=[
'numpy>=1.7, <2.0',
'pandas>=0.15',
'pyensembl>=0.8.2',
'pyensembl>=0.8.8',
'biopython>=1.64',
'pyvcf>=0.6.7',
'memoized_property>=1.0.2',
Expand Down
27 changes: 20 additions & 7 deletions test/benchmark_vcf_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,26 @@
import varcode

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("path", help="Path or URL to VCF")
parser.add_argument("--profile", action="store_true", default=False,

parser.add_argument(
"path", help="Path or URL to VCF")

parser.add_argument(
"--profile", action="store_true",
default=False,
help="Run in a profiler.")
parser.add_argument("--no-info-field",
dest="info_field", action="store_false", default=True)
parser.add_argument("--pyvcf",
help="use pyvcf implementation", action="store_true", default=False)

parser.add_argument(
"--no-info-field",
dest="info_field",
action="store_false",
default=True)

parser.add_argument(
"--pyvcf",
help="use pyvcf implementation",
action="store_true",
default=False)

def run():
args = parser.parse_args()
Expand All @@ -29,7 +42,7 @@ def run():
extra_args["include_info"] = False

start = time.time()

if args.pyvcf:
result = varcode.load_vcf(
args.path,
Expand Down
2 changes: 1 addition & 1 deletion test/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,4 @@ def data_path(name):
snp_rs4244285,
snp_rs1537415,
snp_rs3892097,
])
])
22 changes: 10 additions & 12 deletions test/test_dbnsfp_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from pyensembl import EnsemblRelease
from pyensembl import ensembl_grch37
from varcode import (
ExonicSpliceSite,
Substitution,
Expand All @@ -23,8 +23,6 @@

from . import data_path

ensembl = EnsemblRelease(75)

def validate_transcript_mutation(
ensembl_transcript_id,
chrom,
Expand All @@ -33,7 +31,7 @@ def validate_transcript_mutation(
dna_alt,
aa_pos,
aa_alt):
variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl)
variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl_grch37)
effects = variant.effects()
transcript_id_dict = {
effect.transcript.id: effect
Expand All @@ -58,14 +56,14 @@ def validate_transcript_mutation(
assert (
effect_aa_pos + 1 == aa_pos and
effect_aa_alt == aa_alt), \
"Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
aa_alt,
aa_pos,
chrom,
dna_position,
dna_ref,
dna_alt,
effect)
"Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
aa_alt,
aa_pos,
chrom,
dna_position,
dna_ref,
dna_alt,
effect)

def test_dbnsfp_validation_set():
# check that amino acid substitution gives
Expand Down
14 changes: 9 additions & 5 deletions test/test_effect_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# transcript effects
#
IncompleteTranscript,
NoncodingTranscript,
# NoncodingTranscript, TODO: write a noncoding transcript test
FivePrimeUTR,
ThreePrimeUTR,
Intronic,
Expand All @@ -41,10 +41,14 @@
ExonicSpliceSite,
# TODO: SpliceDonor, SpliceAcceptor
)
from pyensembl import ensembl_grch37, ensembl_grch38
from pyensembl import ensembl_grch37, cached_release

from .common import expect_effect

# Tried using more recent releases but found that many of these tests
# are very specific to Ensembl data between releases 77-81.
ensembl_grch38 = cached_release(81)

def test_incomplete():
# transcript EGFR-009 (ENST00000450046 in Ensembl 78)
# has an incomplete 3' end
Expand Down Expand Up @@ -153,9 +157,9 @@ def test_exon_loss():
"17",
43082404,
ref="".join([
"CTTTTTCTGATGTGCTTTGTTCTGGATTTCGCAGGTCCTCAAGGGCAGAAGAGTCACTTATGATG",
"GAAGGGTAGCTGTTAGAAGGCTGGCTCCCATGCTGTTCTAACACAGCTTCAGTAATTAGATTAGT",
"TAAAGTGATGTGGTGTTTTCTGGCAAACTTGTACACGAGCAT"
"CTTTTTCTGATGTGCTTTGTTCTGGATTTCGCAGGTCCTCAAGGGCAGAAGAGTCACTTATGATG",
"GAAGGGTAGCTGTTAGAAGGCTGGCTCCCATGCTGTTCTAACACAGCTTCAGTAATTAGATTAGT",
"TAAAGTGATGTGGTGTTTTCTGGCAAACTTGTACACGAGCAT"
]),
alt="",
ensembl=ensembl_grch38)
Expand Down
3 changes: 1 addition & 2 deletions test/test_timings.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,14 @@ def _time_variant_annotation(variant_collection):
effects = variant_collection.effects()
end_t = time.time()
assert len(effects.groupby_variant()) == len(variant_collection)

elapsed_t = end_t - start_t
return elapsed_t


def test_effect_timing(
n_variants=100,
random_seed=0,
n_warmup_variants=20):
n_warmup_variants=5):
warmup_collection = random_variants(
n_warmup_variants,
random_seed=None)
Expand Down
6 changes: 4 additions & 2 deletions test/test_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
import cPickle as pickle
except ImportError:
import pickle
from pyensembl import ensembl77

from pyensembl import ensembl_grch38

from varcode import Variant
from nose.tools import eq_

Expand Down Expand Up @@ -106,7 +108,7 @@ def test_deletion_no_suffix():
def test_serialization():
variants = [
Variant(
1, start=10, ref="AA", alt="AAT", ensembl=ensembl77),
1, start=10, ref="AA", alt="AAT", ensembl=ensembl_grch38),
Variant(10, start=15, ref="A", alt="G"),
Variant(20, start=150, ref="", alt="G"),
]
Expand Down
5 changes: 3 additions & 2 deletions test/test_variant_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,13 @@ def test_gene_counts():
# eq_(coding_gene_counts, expected_counts)

def test_serialization():
original = VariantCollection([
original = VariantCollection(
[
Variant(
1, start=10, ref="AA", alt="AAT", ensembl=77),
Variant(10, start=15, ref="A", alt="G"),
Variant(20, start=150, ref="", alt="G"),
])
])
original.metadata[original[0]] = {"a": "b"}
original.metadata[original[2]] = {"bar": 2}

Expand Down
3 changes: 1 addition & 2 deletions test/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,7 @@ def test_pandas_and_pyvcf_implementations_equivalent():
{'path': data_path("multiallelic.vcf")},
{'path': data_path("mutect-example.vcf")},
{'path': data_path("strelka-example.vcf")},
{'path': data_path("mutect-example-headerless.vcf"),
'genome': cached_release(75)},
{'path': data_path("mutect-example-headerless.vcf"), 'genome': cached_release(75)},
]
if RUN_TESTS_REQUIRING_INTERNET:
paths.append({'path': VCF_EXTERNAL_URL})
Expand Down
5 changes: 2 additions & 3 deletions varcode/effect_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from .collection import Collection
from .common import memoize
from .effects import MutationEffect, NonsilentCodingMutation
from .effects import MutationEffect
from .effect_ordering import (
effect_priority,
effect_sort_key,
Expand Down Expand Up @@ -143,8 +143,7 @@ def drop_silent_and_noncoding(self):
"""
Create a new EffectCollection containing only non-silent coding effects
"""
return self.filter(
lambda effect: isinstance(effect, NonsilentCodingMutation))
return self.filter(lambda effect: effect.modifies_protein_sequence)

def detailed_string(self):
"""
Expand Down
29 changes: 27 additions & 2 deletions varcode/effect_ordering.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,38 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from .effects import *
from .effects import (
Failure,
IncompleteTranscript,
Intergenic,
Intragenic,
NoncodingTranscript,
Intronic,
ThreePrimeUTR,
FivePrimeUTR,
Silent,
Substitution,
Insertion,
Deletion,
ComplexSubstitution,
AlternateStartCodon,
IntronicSpliceSite,
ExonicSpliceSite,
StopLoss,
SpliceDonor,
SpliceAcceptor,
PrematureStop,
FrameShiftTruncation,
StartLoss,
FrameShift,
ExonLoss,
)

transcript_effect_priority_list = [
Failure,
IncompleteTranscript,
Intergenic,
Intragenic,
IncompleteTranscript,
NoncodingTranscript,
Intronic,
ThreePrimeUTR,
Expand Down
14 changes: 11 additions & 3 deletions varcode/effects.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,14 @@ def mutant_protein_sequence(self):
"""
return self.alternate_effect.mutant_protein_sequence

@memoized_property
def modifies_protein_sequence(self):
return self.alternate_effect.modifies_protein_sequence

@memoized_property
def modifies_coding_sequence(self):
return self.alternate_effect.modifies_coding_sequence

class CodingMutation(Exonic):
"""
Base class for all mutations which result in a modified coding sequence.
Expand Down Expand Up @@ -410,9 +418,9 @@ def short_description(self):
self.aa_mutation_start_offset)
else:
return "p.%s%d%s" % (
self.aa_ref,
self.aa_mutation_start_offset + 1,
self.aa_alt)
self.aa_ref,
self.aa_mutation_start_offset + 1,
self.aa_alt)

@memoized_property
def mutant_protein_sequence(self):
Expand Down
Loading

0 comments on commit 7d1845c

Please sign in to comment.