Merge pull request #52 from EBI-Metagenomics/feature/conversion-prep

Feature/conversion prep
EBI-Metagenomics · Dec 20, 2024 · a6c0d9a · a6c0d9a
2 parents 2be673a + 3e880e1
commit a6c0d9a
Show file tree

Hide file tree

Showing 7 changed files with 385 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -13,6 +13,7 @@
 - [ Usage ](#usage)
 - [ Test ](#test)
 - [ Outputs ](#out)
+- [Preparing annotations for ENA or GenBank submission](#submission)
 - [ Mobilome annotation ](#mobilome)
 - [ Credits ](#credit)
 - [ Contributions and Support ](#contribute)
@@ -236,6 +237,8 @@ Reference databases
                                                database for version 4.0 on this ftp location:
                                                ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/tool-dbs/dbcan/dbcan_4.0.tar.gz
   --dbcan_db_version                 [string]  The dbCAN reference database version. [default: 4.1.3_V12]
+  --pseudofinder_db                  [string]  Pseudofinder reference database. Mettannotator uses SwissProt as the database for Pseudofinder.
+  --pseudofinder_db_version          [string]  SwissProt version. [default: 2024_06]
 
 Generic options
   --multiqc_methods_description      [string]  Custom MultiQC yaml file containing HTML including a methods description.
@@ -461,6 +464,33 @@ The output folders of each individual tool contain select output files of the th
 
 Note: if the pipeline completed without errors but some of the tool-specific output folders are empty, those particular tools did not generate any annotations to output.
 
+<a name="submission"></a>
+
+## Preparing annotations for ENA or GenBank submission
+
+`mettannotator` produces a final annotation file in GFF3 format. To submit the annotations to data archives, it is first necessary to convert the GFF3 file into the required format, using third-party tools available. `mettannotator` outputs a specially formatted GFF3 file, named `<prefix>_submission.gff` to be used with converters.
+
+### ENA
+
+ENA accepts annotations in the EMBL flat-file format.
+Please use [EMBLmyGFF3](https://github.com/NBISweden/EMBLmyGFF3) to perform the conversion; the repository includes detailed instructions. The two files required for conversion are:
+
+- the genome FASTA file
+- `<mettannotator_results_folder>/<prefix>/functional_annotation/merged_gff/<prefix>_submission.gff`
+
+Please note that it is necessary to register the project and locus tags in ENA prior to conversion. Follow links in the [EMBLmyGFF3](https://github.com/NBISweden/EMBLmyGFF3) repository for more details.
+
+### GenBank
+
+To convert annotations for GenBank submission, please use [table2asn](https://www.ncbi.nlm.nih.gov/genbank/table2asn/).
+Three files are required:
+
+- the genome FASTA file
+- `<mettannotator_results_folder>/<prefix>/functional_annotation/merged_gff/<prefix>_submission.gff`
+- Submission template file (can be generated [here](https://submit.ncbi.nlm.nih.gov/genbank/template/submission/))
+
+More instructions on running `table2asn` are available via [GenBank](https://www.ncbi.nlm.nih.gov/genbank/genomes_gff/).
+
 <a name="mobilome"></a>
 
 ## Mobilome annotation

diff --git a/bin/add_locus_tag_to_trna.py b/bin/add_locus_tag_to_trna.py
@@ -19,7 +19,7 @@
 
 
 def main(input_file, output_file):
-    with open(input_file, "r") as file_in, open(output_file, "w") as file_out:
+    with open(input_file) as file_in, open(output_file, "w") as file_out:
         for line in file_in:
             if line.startswith("#") or not line.strip():
                 # Write header or empty lines as is

diff --git a/bin/annotate_gff.py b/bin/annotate_gff.py
@@ -680,7 +680,7 @@ def get_ncrnas(ncrnas_file):
                 counts += 1
                 contig = cols[3]
                 locus = f"{contig}_ncRNA{counts}"
-                product = cols[-1]
+                product = " ".join(cols[28:])
                 model = cols[2]
                 if model == "RF00005":
                     # Skip tRNAs, we add them from tRNAscan-SE
@@ -692,19 +692,22 @@ def get_ncrnas(ncrnas_file):
                 else:
                     start = int(cols[10])
                     end = int(cols[9])
+                rna_feature_name, ncrna_class = prepare_rna_gff_fields(cols)
                 annot = [
                     "ID=" + locus,
                     "inference=Rfam:14.9",
                     "locus_tag=" + locus,
                     "product=" + product,
                     "rfam=" + model,
                 ]
+                if ncrna_class:
+                    annot.append(f"ncRNA_class={ncrna_class}")
                 annot = ";".join(annot)
                 newline = "\t".join(
                     [
                         contig,
                         "INFERNAL:1.1.4",
-                        "ncRNA",
+                        rna_feature_name,
                         str(start),
                         str(end),
                         ".",
@@ -719,6 +722,162 @@ def get_ncrnas(ncrnas_file):
     return ncrnas
 
 
+def prepare_rna_gff_fields(cols):
+    rna_feature_name = "ncRNA"
+    if cols[1] in ["LSU_rRNA_bacteria", "SSU_rRNA_bacteria", "5S_rRNA"]:
+        rna_feature_name = "rRNA"
+    ncrna_class = ""
+    rna_types = {
+        "antisense_RNA": [
+            "RF00039",
+            "RF00042",
+            "RF00057",
+            "RF00106",
+            "RF00107",
+            "RF00236",
+            "RF00238",
+            "RF00240",
+            "RF00242",
+            "RF00262",
+            "RF00388",
+            "RF00489",
+            "RF01695",
+            "RF01794",
+            "RF01797",
+            "RF01809",
+            "RF01813",
+            "RF02194",
+            "RF02235",
+            "RF02236",
+            "RF02237",
+            "RF02238",
+            "RF02239",
+            "RF02519",
+            "RF02550",
+            "RF02558",
+            "RF02559",
+            "RF02560",
+            "RF02563",
+            "RF02592",
+            "RF02662",
+            "RF02674",
+            "RF02735",
+            "RF02743",
+            "RF02792",
+            "RF02793",
+            "RF02812",
+            "RF02818",
+            "RF02819",
+            "RF02820",
+            "RF02839",
+            "RF02843",
+            "RF02844",
+            "RF02846",
+            "RF02850",
+            "RF02851",
+            "RF02855",
+            "RF02873",
+            "RF02874",
+            "RF02875",
+            "RF02876",
+            "RF02891",
+            "RF02892",
+            "RF02903",
+            "RF02908",
+        ],
+        "autocatalytically_spliced_intron": ["RF01807"],
+        "ribozyme": [
+            "RF00621",
+            "RF01787",
+            "RF01788",
+            "RF01865",
+            "RF02678",
+            "RF02679",
+            "RF02681",
+            "RF02682",
+            "RF02684",
+            "RF03154",
+            "RF03160",
+            "RF04188",
+        ],
+        "hammerhead_ribozyme": [
+            "RF00008",
+            "RF00163",
+            "RF02275",
+            "RF02276",
+            "RF02277",
+            "RF03152",
+        ],
+        "RNase_P_RNA": [
+            "RF00009",
+            "RF00010",
+            "RF00011",
+            "RF00373",
+            "RF01577",
+            "RF02357",
+        ],
+        "RNase_MRP_RNA": ["RF00030", "RF02472"],
+        "telomerase_RNA": ["RF00024", "RF00025", "RF01050", "RF02462"],
+        "scaRNA": [
+            "RF00231",
+            "RF00283",
+            "RF00286",
+            "RF00422",
+            "RF00423",
+            "RF00424",
+            "RF00426",
+            "RF00427",
+            "RF00478",
+            "RF00492",
+            "RF00553",
+            "RF00564",
+            "RF00565",
+            "RF00582",
+            "RF00601",
+            "RF00602",
+            "RF01268",
+            "RF01295",
+            "RF02665",
+            "RF02666",
+            "RF02667",
+            "RF02668",
+            "RF02669",
+            "RF02670",
+            "RF02718",
+            "RF02719",
+            "RF02720",
+            "RF02721",
+            "RF02722",
+        ],
+        "snRNA": ["RF01802"],
+        "SRP_RNA": [
+            "RF00017",
+            "RF00169",
+            "RF01502",
+            "RF01570",
+            "RF01854",
+            "RF01855",
+            "RF01856",
+            "RF01857",
+            "RF04183",
+        ],
+        "vault_RNA": ["RF00006"],
+        "Y_RNA": ["RF00019", "RF02553", "RF01053", "RF02565"],
+    }
+
+    if rna_feature_name == "ncRNA":
+        for rna_type, rfams in rna_types.items():
+            if cols[2] in rfams:
+                ncrna_class = rna_type
+                break
+        if not ncrna_class:
+            if "microRNA" in cols[-1]:
+                ncrna_class = "pre_miRNA"
+            else:
+                ncrna_class = "other"
+    return rna_feature_name, ncrna_class
+
+
 def get_trnas(trnas_file):
     trnas = {}
     with open(trnas_file) as f:

diff --git a/bin/circos_plot.py b/bin/circos_plot.py
@@ -114,7 +114,7 @@ def main(
                 if "defense_finder_type" in feature.qualifiers:
                     antiphage_track.genomic_features([feature], fc="orchid")
 
-            elif feature.type in ["tRNA", "ncRNA"]:
+            elif feature.type in ["tRNA", "ncRNA", "rRNA"]:
                 rna_track.genomic_features([feature], fc="darkmagenta")
             elif mobilome and feature.type in [
                 "mobility_island",