From bfb6d0512a8de349616178e76bbf9ab473f31d8a Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Tue, 14 Nov 2023 15:15:28 +0800 Subject: [PATCH 1/3] fix (PVG): when determining downstream node after cleaving, the downstream node of a bridge out node with multiple inbond node should not be skipped if all of its inbond nodes are in the current bubble --- moPepGen/svgraph/PVGNode.py | 4 +++- moPepGen/svgraph/PeptideVariantGraph.py | 3 +++ test/integration/test_call_variant_peptides.py | 17 +++++++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/moPepGen/svgraph/PVGNode.py b/moPepGen/svgraph/PVGNode.py index c712c639..f922b3f6 100644 --- a/moPepGen/svgraph/PVGNode.py +++ b/moPepGen/svgraph/PVGNode.py @@ -188,7 +188,9 @@ def _get_nth_rf_index(self, i:int) -> int: for v in self.variants: if not (v.variant.is_fusion() \ or v.variant.is_circ_rna() \ - or (v.variant.is_alternative_splicing() and not v.variant.is_deletion())): + or (v.variant.is_alternative_splicing() and not v.variant.is_deletion()) \ + or v.downstream_cleavage_altering \ + or v.upstream_cleavage_altering): locations.append(v.location) locations.sort() diff --git a/moPepGen/svgraph/PeptideVariantGraph.py b/moPepGen/svgraph/PeptideVariantGraph.py index c2b8b516..f4e28cd7 100644 --- a/moPepGen/svgraph/PeptideVariantGraph.py +++ b/moPepGen/svgraph/PeptideVariantGraph.py @@ -364,6 +364,7 @@ def move_downstreams(self, nodes:Iterable[PVGNode], reading_frame_index:int if node.get_last_rf_index() != reading_frame_index \ and len(node.get_out_nodes()) == 1 \ and not node.has_exclusive_outbond_node() \ + and not all(x in nodes for x in node.get_out_nodes()[0].get_in_nodes()) \ and not len(node.get_out_nodes()[0].get_out_nodes()) == 0 \ and not node.get_out_nodes()[0].get_out_nodes()[0].seq.seq == '*': continue @@ -672,6 +673,8 @@ def create_cleavage_graph(self) -> None: if cur.is_already_cleaved() and first_site == -1: continue + cur_seq = str(cur.seq.seq) + if len(cur.in_nodes) == 1: downstreams, inbridges = self.fit_into_cleavages_single_upstream(cur) else: diff --git a/test/integration/test_call_variant_peptides.py b/test/integration/test_call_variant_peptides.py index 4246412d..9097af12 100644 --- a/test/integration/test_call_variant_peptides.py +++ b/test/integration/test_call_variant_peptides.py @@ -1210,3 +1210,20 @@ def test_call_variant_peptide_case82(self): expected = test_dir/'brute_force.txt' reference = test_dir self.default_test_case(gvf, reference, expected) + + def test_call_variant_peptide_case83(self): + """ Issue in graph digestion. When determining the downstream nodes + for next iteration after cleaving a bubble, the outbond node of a new + created node (by merging or cleaving) is usually skipped if the new + node contains frameshifting variants. However, if the outbond node + contains multiple inbond nodes, and all of them are created in the + current bubble, it should still be processed and identified as a + downstream node, otherwise it will remain as uncleaved and resulting + potential invalid characters (e.g., *). """ + test_dir = self.data_dir/'fuzz/52' + gvf = [ + test_dir/'fake_variants.gvf' + ] + expected = test_dir/'brute_force.txt' + reference = test_dir + self.default_test_case(gvf, reference, expected) From 7da24d22df660f6919af15e17bafee834a9c1d3f Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Tue, 14 Nov 2023 15:23:28 +0800 Subject: [PATCH 2/3] fix (test): test files added --- test/files/fuzz/52/annotation.gtf | 12 ++++ test/files/fuzz/52/brute_force.txt | 61 +++++++++++++++++++ test/files/fuzz/52/fake_variants.gvf | 29 +++++++++ test/files/fuzz/52/genome.fasta | 17 ++++++ test/files/fuzz/52/proteome.fasta | 2 + .../integration/test_call_variant_peptides.py | 16 ++--- 6 files changed, 129 insertions(+), 8 deletions(-) create mode 100644 test/files/fuzz/52/annotation.gtf create mode 100644 test/files/fuzz/52/brute_force.txt create mode 100644 test/files/fuzz/52/fake_variants.gvf create mode 100644 test/files/fuzz/52/genome.fasta create mode 100644 test/files/fuzz/52/proteome.fasta diff --git a/test/files/fuzz/52/annotation.gtf b/test/files/fuzz/52/annotation.gtf new file mode 100644 index 00000000..99d94aad --- /dev/null +++ b/test/files/fuzz/52/annotation.gtf @@ -0,0 +1,12 @@ +chr1 . gene 1 929 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF; +chr1 . transcript 1 929 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF; is_protein_coding true; +chr1 . selenocysteine 126 128 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF; +chr1 . selenocysteine 132 134 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF; +chr1 . selenocysteine 186 188 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF; +chr1 . exon 1 219 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF; +chr1 . CDS 114 219 . - 1 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF; +chr1 . CDS 507 545 . - 1 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF; +chr1 . exon 507 545 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF; +chr1 . CDS 917 929 . - 2 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF; +chr1 . exon 917 929 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF; +chr1 . UTR 1 113 . - . gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF; diff --git a/test/files/fuzz/52/brute_force.txt b/test/files/fuzz/52/brute_force.txt new file mode 100644 index 00000000..b30261cd --- /dev/null +++ b/test/files/fuzz/52/brute_force.txt @@ -0,0 +1,61 @@ +AYRRDVDCR +CAGPGGNATK +CAGPGGNATKS +DVDCRGIPLSYYIPRPSR +GIPLSYYIPRPSR +GSISLKVCK +GSISLKVCKMR +GSISLKVCKMSWPR +GSISLMR +GSISLMRWPR +GSISLMRWPRGK +GSISLMSWPR +GSISLMSWPRGK +GSISLMSWPRGKCY +GSMSLKVCK +GSMSLKVCKMR +GSMSLKVCKMSWPR +GSMSLMR +GSMSLMRWPR +GSMSLMRWPRGK +GSMSLMSWPR +GSMSLMSWPRGK +GSMSLMSWPRGKCY +MRWPRGK +MSWPRGK +MSWPRGKCY +RDVDCRGIPLSYYIPRPSR +RRTFPUPVTGLLSHYGCYWSTVA +RRTFPUPVTGLLSHYGCYWSTVAR +RRTFPUPVTGLLSHYGCYWSTVAUA +RRTFPUPVTGLQSHYGCYWSTVA +RRTFPUPVTGLQSHYGCYWSTVAR +RRTFPUPVTGLQSHYGCYWSTVAUA +RRTFPUPVTMVVTGLQWPVVA +RRTFPUPVTMVVTGLQWPVVAUA +RRTFPUPVTMVVTGLQWPVVAUAUR +RTFPUPVTGLLSHYGCYWSTVA +RTFPUPVTGLLSHYGCYWSTVAR +RTFPUPVTGLLSHYGCYWSTVAUA +RTFPUPVTGLQSHYGCYWSTVA +RTFPUPVTGLQSHYGCYWSTVAR +RTFPUPVTGLQSHYGCYWSTVAUA +RTFPUPVTMVVTGLQWPVVA +RTFPUPVTMVVTGLQWPVVAUA +RTFPUPVTMVVTGLQWPVVAUAUR +SAKCAGPGGNATK +SAKCAGPGGNATKS +TFPUPVTGLLSHYGCYWSTVA +TFPUPVTGLLSHYGCYWSTVAR +TFPUPVTGLLSHYGCYWSTVAUA +TFPUPVTGLLSHYGCYWSTVAUAUR +TFPUPVTGLQSHYGCYWSTVA +TFPUPVTGLQSHYGCYWSTVAR +TFPUPVTGLQSHYGCYWSTVAUA +TFPUPVTMVVTGLQWPVVA +TFPUPVTMVVTGLQWPVVAUA +TFPUPVTMVVTGLQWPVVAUAUR +VCKMRWPR +VCKMSWPR +VCKMSWPRGK +WPRGKCY diff --git a/test/files/fuzz/52/fake_variants.gvf b/test/files/fuzz/52/fake_variants.gvf new file mode 100644 index 00000000..b754f426 --- /dev/null +++ b/test/files/fuzz/52/fake_variants.gvf @@ -0,0 +1,29 @@ +##fileformat=VCFv4.2 +##mopepgen_version=1.2.1 +##parser=parseVEP +##reference_index= +##genome_fasta= +##annotation_gtf= +##source= +##CHROM= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +FAKEG00000896 753 FAKEG00000896-752-GGGACTACAGTCACATT-G GGGACTACAGTCACATT G . . TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-176:161;GENE_SYMBOL= +FAKEG00000896 761 FAKEG00000896-760-AG-TT AG TT . . TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-168:169;GENE_SYMBOL= +FAKEG00000896 793 FAKEG00000896-792-G-GCCCGTGGTAG G GCCCGTGGTAG . . TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-136:137;GENE_SYMBOL= +FAKEG00000896 797 FAKEG00000896-796-G-A G A . . TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-132:133;GENE_SYMBOL= +FAKEG00000896 803 FAKEG00000896-802-GAAGGTCTGCAAA-G GAAGGTCTGCAAA G . . TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-126:115;GENE_SYMBOL= +FAKEG00000896 819 FAKEG00000896-818-A-C A C . . TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-110:111;GENE_SYMBOL= diff --git a/test/files/fuzz/52/genome.fasta b/test/files/fuzz/52/genome.fasta new file mode 100644 index 00000000..60230a0d --- /dev/null +++ b/test/files/fuzz/52/genome.fasta @@ -0,0 +1,17 @@ +>chr1 +TTAGTCGTTATCTGGACGGCCGAGGGATGTAATATGATAAGGGGATCCCCCTACAATCTA +CATCTCGCCTGTACGCTTTAGGACTTAGTAGCATTTCCCCCTGGGCCAGCTCATTTTGCA +GACCTTCAGGCTCATGCCACTGTAGACCAGTAACAACCATAATGTGACTGTAGTCCCGTA +ACGGGTCACGGGAAAGTCCGACGCTTTTTATCCCTCCACTCCCCAGTAAAGGTAGCTTGA +TAAGAACATAGAGGAGGATCAAACCTTAGGGGAGTATTAGGCGCTTCGAGCATAGGGGTC +GAAGCCGTGTTGGGTTAAGAGCTAACGGACATCGTCATTGCGTTCGCGACCCGATGGCGC +CACGCATATTATTTGCCCGCAAGGACCTCATTGTCTACAACCGTCGCCCCCGGCCTCGGT +GTAATCTCGCTTAAGCACTACGCTATTAAGGGACGTATGGCGGGCAATTGTATCGCTGAG +TGGAGGCTCGACCAACTGTAAGTACGGTCGCGTGCGTCTGCACCGTGTCAAACGTGCCTC +CGTTCGGAAGCATTGAACAACCTTACTGATATGAGGTAATCGCCGATTAAGTGGGCTTGA +CGTACACCGGCGCGCATTCCGAAGTAAGCGGGTTAGATCAATATCCCGATCATGATCGGC +CGTTCTTTAGGACCCGGTGGAGACTGGCCTGGTTCGCAAATATGTCGTCTTCGACTCGCC +CCATACTGATGCGATGTAACAAGTGCCTATGACACTCTATAGGTGCCCACCATATGGCAC +TTCCAGTTCAGGAGGTGTGAAAGACTTCCTAGTCAGGCCAAAGCTACTCCATAATAGTGC +GTTTTAAGTAAACCATAAACTTGATGGTGTAGCTACAATTAGACTTGCGAACTCCCTTAG +TTGGGACGGTGCCTCCAAGTCCATCATAT diff --git a/test/files/fuzz/52/proteome.fasta b/test/files/fuzz/52/proteome.fasta new file mode 100644 index 00000000..43ba434e --- /dev/null +++ b/test/files/fuzz/52/proteome.fasta @@ -0,0 +1,2 @@ +>FAKEP00000896|FAKET00000896|FAKEG00000896|XXX +MMDLNGGTFDTVQTHATWRDKKRRTFPUPVTGLQSHYGCYWSTVAUAURSAK diff --git a/test/integration/test_call_variant_peptides.py b/test/integration/test_call_variant_peptides.py index 9097af12..fc8ba727 100644 --- a/test/integration/test_call_variant_peptides.py +++ b/test/integration/test_call_variant_peptides.py @@ -1212,14 +1212,14 @@ def test_call_variant_peptide_case82(self): self.default_test_case(gvf, reference, expected) def test_call_variant_peptide_case83(self): - """ Issue in graph digestion. When determining the downstream nodes - for next iteration after cleaving a bubble, the outbond node of a new - created node (by merging or cleaving) is usually skipped if the new - node contains frameshifting variants. However, if the outbond node - contains multiple inbond nodes, and all of them are created in the - current bubble, it should still be processed and identified as a - downstream node, otherwise it will remain as uncleaved and resulting - potential invalid characters (e.g., *). """ + """ Issue in graph digestion. When determining the downstream nodes for + the next iteration after cleaving a bubble, the outbound node of a newly + created node (by merging or cleaving) is usually skipped if the new node + contains frameshifting variants. However, if the outbound node contains + multiple inbound nodes, and all of them are created in the current bubble, + it should still be processed and identified as a downstream node, otherwise, + it will remain as uncleaved and result in potential invalid characters + (e.g., *). """ test_dir = self.data_dir/'fuzz/52' gvf = [ test_dir/'fake_variants.gvf' From 24c373ad8b99179adeda7d0820836e29287a8ad2 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Tue, 14 Nov 2023 20:09:52 +0800 Subject: [PATCH 3/3] fix (PVG): unused variable remvoed --- moPepGen/svgraph/PeptideVariantGraph.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/moPepGen/svgraph/PeptideVariantGraph.py b/moPepGen/svgraph/PeptideVariantGraph.py index f4e28cd7..bb86874b 100644 --- a/moPepGen/svgraph/PeptideVariantGraph.py +++ b/moPepGen/svgraph/PeptideVariantGraph.py @@ -673,8 +673,6 @@ def create_cleavage_graph(self) -> None: if cur.is_already_cleaved() and first_site == -1: continue - cur_seq = str(cur.seq.seq) - if len(cur.in_nodes) == 1: downstreams, inbridges = self.fit_into_cleavages_single_upstream(cur) else: