From bfb6d0512a8de349616178e76bbf9ab473f31d8a Mon Sep 17 00:00:00 2001
From: zhuchcn <chenghaozhu@mednet.ucla.edu>
Date: Tue, 14 Nov 2023 15:15:28 +0800
Subject: [PATCH 1/3] fix (PVG): when determining downstream node after
 cleaving, the downstream node of a bridge out node with multiple inbond node
 should not be skipped if all of its inbond nodes are in the current bubble

---
 moPepGen/svgraph/PVGNode.py                    |  4 +++-
 moPepGen/svgraph/PeptideVariantGraph.py        |  3 +++
 test/integration/test_call_variant_peptides.py | 17 +++++++++++++++++
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/moPepGen/svgraph/PVGNode.py b/moPepGen/svgraph/PVGNode.py
index c712c639..f922b3f6 100644
--- a/moPepGen/svgraph/PVGNode.py
+++ b/moPepGen/svgraph/PVGNode.py
@@ -188,7 +188,9 @@ def _get_nth_rf_index(self, i:int) -> int:
         for v in self.variants:
             if not (v.variant.is_fusion() \
                     or v.variant.is_circ_rna() \
-                    or (v.variant.is_alternative_splicing() and not v.variant.is_deletion())):
+                    or (v.variant.is_alternative_splicing() and not v.variant.is_deletion()) \
+                    or v.downstream_cleavage_altering \
+                    or v.upstream_cleavage_altering):
                 locations.append(v.location)
 
         locations.sort()
diff --git a/moPepGen/svgraph/PeptideVariantGraph.py b/moPepGen/svgraph/PeptideVariantGraph.py
index c2b8b516..f4e28cd7 100644
--- a/moPepGen/svgraph/PeptideVariantGraph.py
+++ b/moPepGen/svgraph/PeptideVariantGraph.py
@@ -364,6 +364,7 @@ def move_downstreams(self, nodes:Iterable[PVGNode], reading_frame_index:int
             if node.get_last_rf_index() != reading_frame_index \
                     and len(node.get_out_nodes()) == 1 \
                     and not node.has_exclusive_outbond_node() \
+                    and not all(x in nodes for x in node.get_out_nodes()[0].get_in_nodes()) \
                     and not len(node.get_out_nodes()[0].get_out_nodes()) == 0 \
                     and not node.get_out_nodes()[0].get_out_nodes()[0].seq.seq == '*':
                 continue
@@ -672,6 +673,8 @@ def create_cleavage_graph(self) -> None:
             if cur.is_already_cleaved() and first_site == -1:
                 continue
 
+            cur_seq = str(cur.seq.seq)
+
             if len(cur.in_nodes) == 1:
                 downstreams, inbridges = self.fit_into_cleavages_single_upstream(cur)
             else:
diff --git a/test/integration/test_call_variant_peptides.py b/test/integration/test_call_variant_peptides.py
index 4246412d..9097af12 100644
--- a/test/integration/test_call_variant_peptides.py
+++ b/test/integration/test_call_variant_peptides.py
@@ -1210,3 +1210,20 @@ def test_call_variant_peptide_case82(self):
         expected = test_dir/'brute_force.txt'
         reference = test_dir
         self.default_test_case(gvf, reference, expected)
+
+    def test_call_variant_peptide_case83(self):
+        """ Issue in graph digestion. When determining the downstream nodes
+        for next iteration after cleaving a bubble, the outbond node of a new
+        created node (by merging or cleaving) is usually skipped if the new
+        node contains frameshifting variants. However, if the outbond node
+        contains multiple inbond nodes, and all of them are created in the
+        current bubble, it should still be processed and identified as a
+        downstream node, otherwise it will remain as uncleaved and resulting
+        potential invalid characters (e.g., *). """
+        test_dir = self.data_dir/'fuzz/52'
+        gvf = [
+            test_dir/'fake_variants.gvf'
+        ]
+        expected = test_dir/'brute_force.txt'
+        reference = test_dir
+        self.default_test_case(gvf, reference, expected)

From 7da24d22df660f6919af15e17bafee834a9c1d3f Mon Sep 17 00:00:00 2001
From: zhuchcn <chenghaozhu@mednet.ucla.edu>
Date: Tue, 14 Nov 2023 15:23:28 +0800
Subject: [PATCH 2/3] fix (test): test files added

---
 test/files/fuzz/52/annotation.gtf             | 12 ++++
 test/files/fuzz/52/brute_force.txt            | 61 +++++++++++++++++++
 test/files/fuzz/52/fake_variants.gvf          | 29 +++++++++
 test/files/fuzz/52/genome.fasta               | 17 ++++++
 test/files/fuzz/52/proteome.fasta             |  2 +
 .../integration/test_call_variant_peptides.py | 16 ++---
 6 files changed, 129 insertions(+), 8 deletions(-)
 create mode 100644 test/files/fuzz/52/annotation.gtf
 create mode 100644 test/files/fuzz/52/brute_force.txt
 create mode 100644 test/files/fuzz/52/fake_variants.gvf
 create mode 100644 test/files/fuzz/52/genome.fasta
 create mode 100644 test/files/fuzz/52/proteome.fasta

diff --git a/test/files/fuzz/52/annotation.gtf b/test/files/fuzz/52/annotation.gtf
new file mode 100644
index 00000000..99d94aad
--- /dev/null
+++ b/test/files/fuzz/52/annotation.gtf
@@ -0,0 +1,12 @@
+chr1	.	gene	1	929	.	-	.	 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
+chr1	.	transcript	1	929	.	-	.	 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF; is_protein_coding true;
+chr1	.	selenocysteine	126	128	.	-	.	 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
+chr1	.	selenocysteine	132	134	.	-	.	 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
+chr1	.	selenocysteine	186	188	.	-	.	 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
+chr1	.	exon	1	219	.	-	.	 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
+chr1	.	CDS	114	219	.	-	1	 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
+chr1	.	CDS	507	545	.	-	1	 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
+chr1	.	exon	507	545	.	-	.	 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
+chr1	.	CDS	917	929	.	-	2	 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
+chr1	.	exon	917	929	.	-	.	 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
+chr1	.	UTR	1	113	.	-	.	 gene_id FAKEG00000896; transcript_id FAKET00000896; protein_id FAKEP00000896; tag cds_start_NF;
diff --git a/test/files/fuzz/52/brute_force.txt b/test/files/fuzz/52/brute_force.txt
new file mode 100644
index 00000000..b30261cd
--- /dev/null
+++ b/test/files/fuzz/52/brute_force.txt
@@ -0,0 +1,61 @@
+AYRRDVDCR
+CAGPGGNATK
+CAGPGGNATKS
+DVDCRGIPLSYYIPRPSR
+GIPLSYYIPRPSR
+GSISLKVCK
+GSISLKVCKMR
+GSISLKVCKMSWPR
+GSISLMR
+GSISLMRWPR
+GSISLMRWPRGK
+GSISLMSWPR
+GSISLMSWPRGK
+GSISLMSWPRGKCY
+GSMSLKVCK
+GSMSLKVCKMR
+GSMSLKVCKMSWPR
+GSMSLMR
+GSMSLMRWPR
+GSMSLMRWPRGK
+GSMSLMSWPR
+GSMSLMSWPRGK
+GSMSLMSWPRGKCY
+MRWPRGK
+MSWPRGK
+MSWPRGKCY
+RDVDCRGIPLSYYIPRPSR
+RRTFPUPVTGLLSHYGCYWSTVA
+RRTFPUPVTGLLSHYGCYWSTVAR
+RRTFPUPVTGLLSHYGCYWSTVAUA
+RRTFPUPVTGLQSHYGCYWSTVA
+RRTFPUPVTGLQSHYGCYWSTVAR
+RRTFPUPVTGLQSHYGCYWSTVAUA
+RRTFPUPVTMVVTGLQWPVVA
+RRTFPUPVTMVVTGLQWPVVAUA
+RRTFPUPVTMVVTGLQWPVVAUAUR
+RTFPUPVTGLLSHYGCYWSTVA
+RTFPUPVTGLLSHYGCYWSTVAR
+RTFPUPVTGLLSHYGCYWSTVAUA
+RTFPUPVTGLQSHYGCYWSTVA
+RTFPUPVTGLQSHYGCYWSTVAR
+RTFPUPVTGLQSHYGCYWSTVAUA
+RTFPUPVTMVVTGLQWPVVA
+RTFPUPVTMVVTGLQWPVVAUA
+RTFPUPVTMVVTGLQWPVVAUAUR
+SAKCAGPGGNATK
+SAKCAGPGGNATKS
+TFPUPVTGLLSHYGCYWSTVA
+TFPUPVTGLLSHYGCYWSTVAR
+TFPUPVTGLLSHYGCYWSTVAUA
+TFPUPVTGLLSHYGCYWSTVAUAUR
+TFPUPVTGLQSHYGCYWSTVA
+TFPUPVTGLQSHYGCYWSTVAR
+TFPUPVTGLQSHYGCYWSTVAUA
+TFPUPVTMVVTGLQWPVVA
+TFPUPVTMVVTGLQWPVVAUA
+TFPUPVTMVVTGLQWPVVAUAUR
+VCKMRWPR
+VCKMSWPR
+VCKMSWPRGK
+WPRGKCY
diff --git a/test/files/fuzz/52/fake_variants.gvf b/test/files/fuzz/52/fake_variants.gvf
new file mode 100644
index 00000000..b754f426
--- /dev/null
+++ b/test/files/fuzz/52/fake_variants.gvf
@@ -0,0 +1,29 @@
+##fileformat=VCFv4.2
+##mopepgen_version=1.2.1
+##parser=parseVEP
+##reference_index=
+##genome_fasta=
+##annotation_gtf=
+##source=
+##CHROM=<Description="Gene ID">
+##INFO=<ID=TRANSCRIPT_ID,Number=1,Type=String,Description="Transcript ID">
+##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
+##INFO=<ID=GENOMIC_POSITION,Number=1,Type=String,Description="Genomic Position">
+##INFO=<ID=START,Number=1,Type=Integer,Description="Start Position">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End Position">
+##INFO=<ID=DONOR_START,Number=1,Type=Integer,Description="Donor Start Position">
+##INFO=<ID=DONOR_END,Number=1,Type=Integer,Description="Donor End Position">
+##INFO=<ID=COORDINATE,Number=1,Type=String,Description="Coordinate for Insertion or Substitution">
+##INFO=<ID=ACCEPTER_GENE_ID,Number=1,Type=String,Description="3' Accepter Transcript's Gene ID">
+##INFO=<ID=ACCEPTER_TRANSCRIPT_ID,Number=1,Type=String,Description="3' Accepter Transcript's Transcript ID">
+##INFO=<ID=ACCEPTER_POSITION,Number=1,Type=Integer,Description="Position of the break point of the 3' accepter transcript">
+##INFO=<ID=OFFSET,Number=+,Type=Integer,Description="Offsets of fragments (exons or introns)">
+##INFO=<ID=LENGTH,Number=+,Type=Integer,Description="Lengths of fragments (exons or introns)">
+##INFO=<ID=INTRON,Number=+,Type=Integer,Description="Indices of fragments that are introns">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+FAKEG00000896	753	FAKEG00000896-752-GGGACTACAGTCACATT-G	GGGACTACAGTCACATT	G	.	.	TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-176:161;GENE_SYMBOL=
+FAKEG00000896	761	FAKEG00000896-760-AG-TT	AG	TT	.	.	TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-168:169;GENE_SYMBOL=
+FAKEG00000896	793	FAKEG00000896-792-G-GCCCGTGGTAG	G	GCCCGTGGTAG	.	.	TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-136:137;GENE_SYMBOL=
+FAKEG00000896	797	FAKEG00000896-796-G-A	G	A	.	.	TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-132:133;GENE_SYMBOL=
+FAKEG00000896	803	FAKEG00000896-802-GAAGGTCTGCAAA-G	GAAGGTCTGCAAA	G	.	.	TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-126:115;GENE_SYMBOL=
+FAKEG00000896	819	FAKEG00000896-818-A-C	A	C	.	.	TRANSCRIPT_ID=FAKET00000896;GENOMIC_POSITION=chrF-110:111;GENE_SYMBOL=
diff --git a/test/files/fuzz/52/genome.fasta b/test/files/fuzz/52/genome.fasta
new file mode 100644
index 00000000..60230a0d
--- /dev/null
+++ b/test/files/fuzz/52/genome.fasta
@@ -0,0 +1,17 @@
+>chr1
+TTAGTCGTTATCTGGACGGCCGAGGGATGTAATATGATAAGGGGATCCCCCTACAATCTA
+CATCTCGCCTGTACGCTTTAGGACTTAGTAGCATTTCCCCCTGGGCCAGCTCATTTTGCA
+GACCTTCAGGCTCATGCCACTGTAGACCAGTAACAACCATAATGTGACTGTAGTCCCGTA
+ACGGGTCACGGGAAAGTCCGACGCTTTTTATCCCTCCACTCCCCAGTAAAGGTAGCTTGA
+TAAGAACATAGAGGAGGATCAAACCTTAGGGGAGTATTAGGCGCTTCGAGCATAGGGGTC
+GAAGCCGTGTTGGGTTAAGAGCTAACGGACATCGTCATTGCGTTCGCGACCCGATGGCGC
+CACGCATATTATTTGCCCGCAAGGACCTCATTGTCTACAACCGTCGCCCCCGGCCTCGGT
+GTAATCTCGCTTAAGCACTACGCTATTAAGGGACGTATGGCGGGCAATTGTATCGCTGAG
+TGGAGGCTCGACCAACTGTAAGTACGGTCGCGTGCGTCTGCACCGTGTCAAACGTGCCTC
+CGTTCGGAAGCATTGAACAACCTTACTGATATGAGGTAATCGCCGATTAAGTGGGCTTGA
+CGTACACCGGCGCGCATTCCGAAGTAAGCGGGTTAGATCAATATCCCGATCATGATCGGC
+CGTTCTTTAGGACCCGGTGGAGACTGGCCTGGTTCGCAAATATGTCGTCTTCGACTCGCC
+CCATACTGATGCGATGTAACAAGTGCCTATGACACTCTATAGGTGCCCACCATATGGCAC
+TTCCAGTTCAGGAGGTGTGAAAGACTTCCTAGTCAGGCCAAAGCTACTCCATAATAGTGC
+GTTTTAAGTAAACCATAAACTTGATGGTGTAGCTACAATTAGACTTGCGAACTCCCTTAG
+TTGGGACGGTGCCTCCAAGTCCATCATAT
diff --git a/test/files/fuzz/52/proteome.fasta b/test/files/fuzz/52/proteome.fasta
new file mode 100644
index 00000000..43ba434e
--- /dev/null
+++ b/test/files/fuzz/52/proteome.fasta
@@ -0,0 +1,2 @@
+>FAKEP00000896|FAKET00000896|FAKEG00000896|XXX
+MMDLNGGTFDTVQTHATWRDKKRRTFPUPVTGLQSHYGCYWSTVAUAURSAK
diff --git a/test/integration/test_call_variant_peptides.py b/test/integration/test_call_variant_peptides.py
index 9097af12..fc8ba727 100644
--- a/test/integration/test_call_variant_peptides.py
+++ b/test/integration/test_call_variant_peptides.py
@@ -1212,14 +1212,14 @@ def test_call_variant_peptide_case82(self):
         self.default_test_case(gvf, reference, expected)
 
     def test_call_variant_peptide_case83(self):
-        """ Issue in graph digestion. When determining the downstream nodes
-        for next iteration after cleaving a bubble, the outbond node of a new
-        created node (by merging or cleaving) is usually skipped if the new
-        node contains frameshifting variants. However, if the outbond node
-        contains multiple inbond nodes, and all of them are created in the
-        current bubble, it should still be processed and identified as a
-        downstream node, otherwise it will remain as uncleaved and resulting
-        potential invalid characters (e.g., *). """
+        """ Issue in graph digestion. When determining the downstream nodes for
+        the next iteration after cleaving a bubble, the outbound node of a newly
+        created node (by merging or cleaving) is usually skipped if the new node
+        contains frameshifting variants. However, if the outbound node contains
+        multiple inbound nodes, and all of them are created in the current bubble,
+        it should still be processed and identified as a downstream node, otherwise,
+        it will remain as uncleaved and result in potential invalid characters
+        (e.g., *). """
         test_dir = self.data_dir/'fuzz/52'
         gvf = [
             test_dir/'fake_variants.gvf'

From 24c373ad8b99179adeda7d0820836e29287a8ad2 Mon Sep 17 00:00:00 2001
From: zhuchcn <chenghaozhu@mednet.ucla.edu>
Date: Tue, 14 Nov 2023 20:09:52 +0800
Subject: [PATCH 3/3] fix (PVG): unused variable remvoed

---
 moPepGen/svgraph/PeptideVariantGraph.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/moPepGen/svgraph/PeptideVariantGraph.py b/moPepGen/svgraph/PeptideVariantGraph.py
index f4e28cd7..bb86874b 100644
--- a/moPepGen/svgraph/PeptideVariantGraph.py
+++ b/moPepGen/svgraph/PeptideVariantGraph.py
@@ -673,8 +673,6 @@ def create_cleavage_graph(self) -> None:
             if cur.is_already_cleaved() and first_site == -1:
                 continue
 
-            cur_seq = str(cur.seq.seq)
-
             if len(cur.in_nodes) == 1:
                 downstreams, inbridges = self.fit_into_cleavages_single_upstream(cur)
             else: