Merge pull request #879 from uclahs-cds/czhu-fix-circ-rna

Use backsplicing site for circRNA ID
uclahs-cds · Aug 15, 2024 · 5d8b0f7 · 5d8b0f7
2 parents 2194fb6 + e2a6266
commit 5d8b0f7
Show file tree

Hide file tree

Showing 7 changed files with 208 additions and 181 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,10 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 - Fixed a bug in `VariantPeptideIdentifier` where the ORF ID was added before the variant IDs.
 
+### Changed
+
+- Changed variant IDs for circRNAs to use backsplicing site instead of listing all exons/introns. #878
+
 ## [1.4.2] - 2024-06-23
 
 - Fixed `splitFasta` that NovelORF peptides coding transcripts not recognized correctly.

diff --git a/docs/file-format.md b/docs/file-format.md
@@ -192,12 +192,9 @@ Circular RNAs are derived from back-spliced exons and introns. They exist as ind
 ##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
 ##POS=<Description="Gene coordinate of circRNA start">
 #CHROM    POS  ID                            REF  ALT  QUAL  FILTER  INFO
-ENSG0001  413  CIRC-ENST0001-E2-E3-E4        .    .    .     .       OFFSET=0,211,398;LENGTH=72,85,63;INTRON=;TRANSCRIPT=ENST0001;GENE_SYMBOL=SYMB1
-ENSG0002  112  CIRC-ENST0001-E3-E4           .    .    .     .       OFFSET=0,175LENGTH=72,85;INTRON=;TRANSCRIPT=ENST0001;GENE_SYMBOL=SYMB2
-ENSG0002  112  CIRC-ENST0001-E3-I3-E4        .    .    .     .       OFFSET=0,73,175;LENGTH=72,103,85;INTRON=;TRANSCRIPT=ENST0001,ENST0012;GENE_SYMBOL=SYMB2
-ENSG0003  77   CIRC-ENST0003-E2-E3-E4        .    .    .     .       OFFSET=0,181,424;LENGTH=100,175,85;INTRON=;TRANSCRIPT=ENST0003;GENE_SYMBOL=SYMB3
-ENSG0003  77   CIRC-ENST0003-E2-I2-E3-I3-E4  .    .    .     .       OFFSET=0,101,181,357,424;LENGTH=100,80,175,67,85;INTRON=;TRANSCRIPT=ENST0003;GENE_SYMBOL=SYMB3
-ENSG0004  789  CI-ENST0004-I3                .    .    .     .       OFFSET=0;LENGTH=112;INTRON=1;TRNASCRIPT=ENST0004;GENE_SYMBOL=SYMB4
+ENSG00000128408.9	0	CIRC-ENST00000614167.2-0:464	.	.	.	.	OFFSET=0,323;LENGTH=323,82;INTRON=;TRANSCRIPT_ID=ENST00000614167.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:0:464
+ENSG00000099949.21	0	CIRC-ENST00000642151.1-0:197	.	.	.	.	OFFSET=0,98;LENGTH=78,42;INTRON=;TRANSCRIPT_ID=ENST00000642151.1;GENE_SYMBOL=LZTR1;GENOMIC_POSITION=chr22:4980:5177
+ENSG00000099949.21	78	CIRC-ENST00000642151.1-78:98	.	.	.	.	OFFSET=0;LENGTH=20;INTRON=0;TRANSCRIPT_ID=ENST00000642151.1;GENE_SYMBOL=LZTR1;GENOMIC_POSITION=chr22:5058:5078
 ```
 
 circRNAs are not variants that are added to the transcript variant graph, thus the `REF` and `ALT` columns should be kept empty as ".". The `INFO` column must contain the following fields.
@@ -208,7 +205,7 @@ circRNAs are not variants that are added to the transcript variant graph, thus t
 + **`TRANSCRIPT_ID`** The transcript ID of a transcript that is able to generate this circRNA (e.g. contains all exons and introns of the circRNA).
 + **`GENE_SYMBOL`** The name of the gene.
 
-The ID of circRNAs consists of two components. They all start with \<transcript_id>-circRNA or \<transcript_id>-ciRNA where `transcript_id` is the value from the `CHROM` column. Following that is the information for each fragment, indicating whether it is an exon (E) or intron (I) and the index of the fragment. For example, `ENSG0001-circRNA-E2-I2-E3` is made up of the second exon, the second intron, and the third exon of the gene ENSG0001.
+circRNA IDs follow the format CIRC-\<transcript_id>-\<upstream>:\<downstream>, where \<transcript_id> is taken from the INFO column, and \<upstream> and \<downstream> represent the upstream and downstream gene coordinates of the backsplicing site. For example, `CIRC-ENST0001.1-78:135` refers to a circRNA derived from transcript ENST0001.1 with backsplicing occurring between gene positions 78 and 135.
 
 ## 2 Variant Peptide FASTA
 
@@ -232,8 +229,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 For circRNA, the FASTA headers follow this style: `<circRNA-ID>|<variant_id_1>|...|<variant_id_k>|<index>`
 
 ```
->ENSG0001-circRNA-E3-E4|1
+>CIRC-ENST0001-15:68|1
 XXXXXXXXXXXXX
->ENSG0001-circRNA-E3-E4|SNV-110-C-A|2
+>CIRC-ENST0001-153:285|SNV-110-C-A|2
 XXXXXXXXXXXXX
 ```
diff --git a/moPepGen/circ/CircRNA.py b/moPepGen/circ/CircRNA.py
@@ -21,7 +21,7 @@ class CircRNAModel():
     """
     def __init__(self, transcript_id:str, fragments:List[SeqFeature],
             intron:List[int], _id:str, gene_id:str, gene_name:str,
-            genomic_location:str=''):
+            genomic_location:str='', backsplicing_site:FeatureLocation=None):
         """ Constructor """
         self.gene_id = gene_id
         self.fragments = fragments
@@ -31,6 +31,7 @@ def __init__(self, transcript_id:str, fragments:List[SeqFeature],
         self.gene_name = gene_name
         self.gene_locations = []
         self.genomic_position = genomic_location
+        self.backsplicing_site = backsplicing_site
 
     def get_gene_coordinates(self, gene:GeneAnnotationModel) -> None:
         """ Get the coordinates of the gene """

diff --git a/moPepGen/parser/CIRCexplorerParser.py b/moPepGen/parser/CIRCexplorerParser.py
@@ -50,14 +50,12 @@ def convert_to_circ_rna(self, anno:gtf.GenomicAnnotation,
 
         if self.circ_type == 'circRNA':
             fragment_type = 'exon'
-            circ_id = 'CIRC'
         elif self.circ_type == 'ciRNA':
             fragment_type = 'intron'
-            circ_id = 'CI'
         else:
             raise ValueError(f'circRNA type unsupported: {self.circ_type}')
 
-        fragment_ids:List[Tuple(fragment, str, int)] = []
+        fragment_ids:List[Tuple[SeqFeature, str, int]] = []
 
         for i, exon_size in enumerate(self.exon_sizes):
             exon_offset = self.exon_offsets[i]
@@ -95,9 +93,19 @@ def convert_to_circ_rna(self, anno:gtf.GenomicAnnotation,
             fragments.append(fragment)
 
         fragment_ids.sort(key=lambda x: x[0])
-        circ_id += f'-{tx_id}-' + '-'.join([f"{t}{i+1}" for _,t,i in fragment_ids])
 
-        genomic_location = f"{self.chrom}:{self.start}"
+        genomic_location = f"{self.chrom}:{self.start}:{self.end}"
+        start_gene = anno.coordinate_genomic_to_gene(self.start, gene_id)
+        end_gene = anno.coordinate_genomic_to_gene(self.end - 1, gene_id)
+        if strand == -1:
+            start_gene, end_gene = end_gene, start_gene
+        end_gene += 1
+        backsplicing_site = FeatureLocation(
+            seqname=tx_model.transcript.chrom,
+            start=start_gene,
+            end=end_gene
+        )
+        circ_id = f"CIRC-{tx_id}-{start_gene}:{end_gene}"
 
         return CircRNAModel(
             transcript_id=tx_id,
@@ -106,7 +114,8 @@ def convert_to_circ_rna(self, anno:gtf.GenomicAnnotation,
             _id=circ_id,
             gene_id=gene_id,
             gene_name=tx_model.transcript.gene_name,
-            genomic_location=genomic_location
+            genomic_location=genomic_location,
+            backsplicing_site=backsplicing_site
         )
 
     def is_valid(self, min_read_number:int) -> bool:

diff --git a/test/files/circRNA/circ_rna.gvf b/test/files/circRNA/circ_rna.gvf
@@ -1,11 +1,11 @@
 ##fileformat=VCFv4.2
-##mopepgen_version=0.3.1
+##mopepgen_version=1.4.3
 ##parser=parseCIRCexplorer
-##reference_index=/hot/users/czhu/private-moPepGen/private-moPepGen-py/test/files/index
+##reference_index=/home/chenghaozhu/projects/package-moPepGen/test/files/index
 ##genome_fasta=
 ##annotation_gtf=
 ##source=circRNA
-##CHROM=<Description='Gene ID'>
+##CHROM=<Description="Gene ID">
 ##INFO=<ID=TRANSCRIPT_ID,Number=1,Type=String,Description="Transcript ID">
 ##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
 ##INFO=<ID=GENOMIC_POSITION,Number=1,Type=String,Description="Genomic Position">
@@ -14,6 +14,6 @@
 ##INFO=<ID=INTRON,Number=+,Type=Integer,Description="Indices of fragments that are introns">
 ##POS=<Description="Gene coordinate of circRNA start">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-ENSG00000128408.9	323	CIRC-ENST00000614167.2-E2-E1	.	.	.	.	OFFSET=0,-323;LENGTH=82,323;INTRON=;TRANSCRIPT_ID=ENST00000614167.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:0
-ENSG00000099949.21	0	CIRC-ENST00000642151.1-E1-E2	.	.	.	.	OFFSET=0,98;LENGTH=78,42;INTRON=;TRANSCRIPT_ID=ENST00000642151.1;GENE_SYMBOL=LZTR1;GENOMIC_POSITION=chr22:4980
-ENSG00000099949.21	78	CI-ENST00000642151.1-I1	.	.	.	.	OFFSET=0;LENGTH=20;INTRON=0;TRANSCRIPT_ID=ENST00000642151.1;GENE_SYMBOL=LZTR1;GENOMIC_POSITION=chr22:5058
+ENSG00000128408.9	0	CIRC-ENST00000614167.2-0:464	.	.	.	.	OFFSET=0,323;LENGTH=323,82;INTRON=;TRANSCRIPT_ID=ENST00000614167.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:0:464
+ENSG00000099949.21	0	CIRC-ENST00000642151.1-0:197	.	.	.	.	OFFSET=0,98;LENGTH=78,42;INTRON=;TRANSCRIPT_ID=ENST00000642151.1;GENE_SYMBOL=LZTR1;GENOMIC_POSITION=chr22:4980:5177
+ENSG00000099949.21	78	CIRC-ENST00000642151.1-78:98	.	.	.	.	OFFSET=0;LENGTH=20;INTRON=0;TRANSCRIPT_ID=ENST00000642151.1;GENE_SYMBOL=LZTR1;GENOMIC_POSITION=chr22:5058:5078