diff --git a/README.md b/README.md
index 066292d..da773c3 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@
- [ Usage ](#usage)
- [ Test ](#test)
- [ Outputs ](#out)
+- [Preparing annotations for ENA or GenBank submission](#submission)
- [ Mobilome annotation ](#mobilome)
- [ Credits ](#credit)
- [ Contributions and Support ](#contribute)
@@ -236,6 +237,8 @@ Reference databases
database for version 4.0 on this ftp location:
ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/tool-dbs/dbcan/dbcan_4.0.tar.gz
--dbcan_db_version [string] The dbCAN reference database version. [default: 4.1.3_V12]
+ --pseudofinder_db [string] Pseudofinder reference database. Mettannotator uses SwissProt as the database for Pseudofinder.
+ --pseudofinder_db_version [string] SwissProt version. [default: 2024_06]
Generic options
--multiqc_methods_description [string] Custom MultiQC yaml file containing HTML including a methods description.
@@ -461,6 +464,33 @@ The output folders of each individual tool contain select output files of the th
Note: if the pipeline completed without errors but some of the tool-specific output folders are empty, those particular tools did not generate any annotations to output.
+
+
+## Preparing annotations for ENA or GenBank submission
+
+`mettannotator` produces a final annotation file in GFF3 format. To submit the annotations to data archives, it is first necessary to convert the GFF3 file into the required format, using third-party tools available. `mettannotator` outputs a specially formatted GFF3 file, named `_submission.gff` to be used with converters.
+
+### ENA
+
+ENA accepts annotations in the EMBL flat-file format.
+Please use [EMBLmyGFF3](https://github.com/NBISweden/EMBLmyGFF3) to perform the conversion; the repository includes detailed instructions. The two files required for conversion are:
+
+- the genome FASTA file
+- `//functional_annotation/merged_gff/_submission.gff`
+
+Please note that it is necessary to register the project and locus tags in ENA prior to conversion. Follow links in the [EMBLmyGFF3](https://github.com/NBISweden/EMBLmyGFF3) repository for more details.
+
+### GenBank
+
+To convert annotations for GenBank submission, please use [table2asn](https://www.ncbi.nlm.nih.gov/genbank/table2asn/).
+Three files are required:
+
+- the genome FASTA file
+- `//functional_annotation/merged_gff/_submission.gff`
+- Submission template file (can be generated [here](https://submit.ncbi.nlm.nih.gov/genbank/template/submission/))
+
+More instructions on running `table2asn` are available via [GenBank](https://www.ncbi.nlm.nih.gov/genbank/genomes_gff/).
+
## Mobilome annotation
diff --git a/bin/add_locus_tag_to_trna.py b/bin/add_locus_tag_to_trna.py
index 629b90e..d1889bc 100755
--- a/bin/add_locus_tag_to_trna.py
+++ b/bin/add_locus_tag_to_trna.py
@@ -19,7 +19,7 @@
def main(input_file, output_file):
- with open(input_file, "r") as file_in, open(output_file, "w") as file_out:
+ with open(input_file) as file_in, open(output_file, "w") as file_out:
for line in file_in:
if line.startswith("#") or not line.strip():
# Write header or empty lines as is
diff --git a/bin/annotate_gff.py b/bin/annotate_gff.py
index 8026fc3..3ca7826 100755
--- a/bin/annotate_gff.py
+++ b/bin/annotate_gff.py
@@ -680,7 +680,7 @@ def get_ncrnas(ncrnas_file):
counts += 1
contig = cols[3]
locus = f"{contig}_ncRNA{counts}"
- product = cols[-1]
+ product = " ".join(cols[28:])
model = cols[2]
if model == "RF00005":
# Skip tRNAs, we add them from tRNAscan-SE
@@ -692,6 +692,7 @@ def get_ncrnas(ncrnas_file):
else:
start = int(cols[10])
end = int(cols[9])
+ rna_feature_name, ncrna_class = prepare_rna_gff_fields(cols)
annot = [
"ID=" + locus,
"inference=Rfam:14.9",
@@ -699,12 +700,14 @@ def get_ncrnas(ncrnas_file):
"product=" + product,
"rfam=" + model,
]
+ if ncrna_class:
+ annot.append(f"ncRNA_class={ncrna_class}")
annot = ";".join(annot)
newline = "\t".join(
[
contig,
"INFERNAL:1.1.4",
- "ncRNA",
+ rna_feature_name,
str(start),
str(end),
".",
@@ -719,6 +722,162 @@ def get_ncrnas(ncrnas_file):
return ncrnas
+def prepare_rna_gff_fields(cols):
+ rna_feature_name = "ncRNA"
+ if cols[1] in ["LSU_rRNA_bacteria", "SSU_rRNA_bacteria", "5S_rRNA"]:
+ rna_feature_name = "rRNA"
+ ncrna_class = ""
+ rna_types = {
+ "antisense_RNA": [
+ "RF00039",
+ "RF00042",
+ "RF00057",
+ "RF00106",
+ "RF00107",
+ "RF00236",
+ "RF00238",
+ "RF00240",
+ "RF00242",
+ "RF00262",
+ "RF00388",
+ "RF00489",
+ "RF01695",
+ "RF01794",
+ "RF01797",
+ "RF01809",
+ "RF01813",
+ "RF02194",
+ "RF02235",
+ "RF02236",
+ "RF02237",
+ "RF02238",
+ "RF02239",
+ "RF02519",
+ "RF02550",
+ "RF02558",
+ "RF02559",
+ "RF02560",
+ "RF02563",
+ "RF02592",
+ "RF02662",
+ "RF02674",
+ "RF02735",
+ "RF02743",
+ "RF02792",
+ "RF02793",
+ "RF02812",
+ "RF02818",
+ "RF02819",
+ "RF02820",
+ "RF02839",
+ "RF02843",
+ "RF02844",
+ "RF02846",
+ "RF02850",
+ "RF02851",
+ "RF02855",
+ "RF02873",
+ "RF02874",
+ "RF02875",
+ "RF02876",
+ "RF02891",
+ "RF02892",
+ "RF02903",
+ "RF02908",
+ ],
+ "autocatalytically_spliced_intron": ["RF01807"],
+ "ribozyme": [
+ "RF00621",
+ "RF01787",
+ "RF01788",
+ "RF01865",
+ "RF02678",
+ "RF02679",
+ "RF02681",
+ "RF02682",
+ "RF02684",
+ "RF03154",
+ "RF03160",
+ "RF04188",
+ ],
+ "hammerhead_ribozyme": [
+ "RF00008",
+ "RF00163",
+ "RF02275",
+ "RF02276",
+ "RF02277",
+ "RF03152",
+ ],
+ "RNase_P_RNA": [
+ "RF00009",
+ "RF00010",
+ "RF00011",
+ "RF00373",
+ "RF01577",
+ "RF02357",
+ ],
+ "RNase_MRP_RNA": ["RF00030", "RF02472"],
+ "telomerase_RNA": ["RF00024", "RF00025", "RF01050", "RF02462"],
+ "scaRNA": [
+ "RF00231",
+ "RF00283",
+ "RF00286",
+ "RF00422",
+ "RF00423",
+ "RF00424",
+ "RF00426",
+ "RF00427",
+ "RF00478",
+ "RF00492",
+ "RF00553",
+ "RF00564",
+ "RF00565",
+ "RF00582",
+ "RF00601",
+ "RF00602",
+ "RF01268",
+ "RF01295",
+ "RF02665",
+ "RF02666",
+ "RF02667",
+ "RF02668",
+ "RF02669",
+ "RF02670",
+ "RF02718",
+ "RF02719",
+ "RF02720",
+ "RF02721",
+ "RF02722",
+ ],
+ "snRNA": ["RF01802"],
+ "SRP_RNA": [
+ "RF00017",
+ "RF00169",
+ "RF01502",
+ "RF01570",
+ "RF01854",
+ "RF01855",
+ "RF01856",
+ "RF01857",
+ "RF04183",
+ ],
+ "vault_RNA": ["RF00006"],
+ "Y_RNA": ["RF00019", "RF02553", "RF01053", "RF02565"],
+ }
+
+ if rna_feature_name == "ncRNA":
+ for rna_type, rfams in rna_types.items():
+ if cols[2] in rfams:
+ ncrna_class = rna_type
+ break
+ if not ncrna_class:
+ if "microRNA" in cols[-1]:
+ ncrna_class = "pre_miRNA"
+ else:
+ ncrna_class = "other"
+ return rna_feature_name, ncrna_class
+
+
def get_trnas(trnas_file):
trnas = {}
with open(trnas_file) as f:
diff --git a/bin/circos_plot.py b/bin/circos_plot.py
index 7f1509e..01c5c41 100755
--- a/bin/circos_plot.py
+++ b/bin/circos_plot.py
@@ -114,7 +114,7 @@ def main(
if "defense_finder_type" in feature.qualifiers:
antiphage_track.genomic_features([feature], fc="orchid")
- elif feature.type in ["tRNA", "ncRNA"]:
+ elif feature.type in ["tRNA", "ncRNA", "rRNA"]:
rna_track.genomic_features([feature], fc="darkmagenta")
elif mobilome and feature.type in [
"mobility_island",
diff --git a/bin/prepare_gff_for_conversion.py b/bin/prepare_gff_for_conversion.py
new file mode 100755
index 0000000..588023c
--- /dev/null
+++ b/bin/prepare_gff_for_conversion.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+
+# Copyright 2023-2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import re
+
+
+def main(infile, outfile):
+ with open(infile) as file_in, open(outfile, "w") as file_out:
+ fasta_flag = False
+ for line in file_in:
+ if line.startswith("##FASTA"):
+ fasta_flag = True
+ file_out.write(line)
+ elif line.startswith("#"):
+ file_out.write(line)
+ elif fasta_flag:
+ file_out.write(line)
+ else:
+ new_line = modify_line(line)
+ file_out.write(new_line + "\n")
+
+
+def modify_line(line):
+ columns = line.strip().split("\t")
+ if columns[2] in ["region", "tRNA"]:
+ return line.strip()
+
+ # Parse the 9th field into key-value pairs
+ attributes_dict = dict(
+ re.split(r"(? 0:
+ text_to_add = ",".join(formatted_list_to_move)
+ if "Dbxref" in attributes_dict:
+ attributes_dict["Dbxref"] = attributes_dict["Dbxref"] + "," + text_to_add
+ else:
+ attributes_dict["Dbxref"] = text_to_add
+ attributes_dict["Dbxref"] = attributes_dict["Dbxref"].rstrip(",").lstrip(",")
+ return attributes_dict
+
+
+def collect_go_terms(attributes_dict):
+ # Function collects GO terms from across the 9th column into one list and deduplicates them
+ go_terms = list()
+ from_fields = ["Dbxref", "Ontology_term", "uf_ontology_term"]
+ for field in from_fields:
+ if field in attributes_dict:
+ annotations = attributes_dict[field].split(",")
+ for annotation in annotations[
+ :
+ ]: # Using a copy of the list to avoid iteration issues
+ if annotation.startswith("GO:"):
+ go_terms.append(annotation)
+ annotations.remove(annotation)
+ if len(annotations) > 0:
+ modified_annotations = ",".join(annotations)
+ attributes_dict[field] = modified_annotations
+ else:
+ del attributes_dict[field]
+ # deduplicate
+ go_terms = list(set(go_terms))
+ if len(go_terms) > 0:
+ if "Dbxref" in attributes_dict:
+ attributes_dict["Dbxref"] = (
+ attributes_dict["Dbxref"] + "," + ",".join(go_terms)
+ )
+ else:
+ attributes_dict["Dbxref"] = ",".join(go_terms)
+ attributes_dict["Dbxref"] = attributes_dict["Dbxref"].rstrip(",")
+ return attributes_dict
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description="Script processes final GFF to prepare for conversion "
+ "for ENA/GenBank."
+ )
+ parser.add_argument(
+ "-i", "--infile", required=True, help="Path to the GFF file to plot"
+ )
+ parser.add_argument(
+ "-o", "--outfile", required=True, help="Path to the output file"
+ )
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ main(
+ args.infile,
+ args.outfile,
+ )
diff --git a/conf/base.config b/conf/base.config
index 1ddc6f1..e87751f 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -97,6 +97,8 @@ process {
}
withName: PSEUDOFINDER {
cpus = { check_max( 8 * task.attempt, 'cpus' ) }
- memory = { check_max( 10.GB * task.attempt, 'memory' ) }
+ memory = { check_max( 12.GB * task.attempt, 'memory' ) }
+ errorStrategy = 'retry'
+ maxRetries = 3
}
}
diff --git a/modules/local/annotate_gff.nf b/modules/local/annotate_gff.nf
index 78c8686..6772823 100644
--- a/modules/local/annotate_gff.nf
+++ b/modules/local/annotate_gff.nf
@@ -31,6 +31,7 @@ process ANNOTATE_GFF {
output:
tuple val(meta), path("*_annotations.gff"), emit: annotated_gff
tuple val(meta), path("*_annotations_with_descriptions.gff"), optional: true, emit: annotated_desc_gff
+ tuple val(meta), path("*_submission.gff"), emit: submission_gff
tuple val(meta), path("*_pseudogene_report.txt"), emit: pseudogene_report
path "versions.yml", emit: versions
@@ -105,6 +106,10 @@ process ANNOTATE_GFF {
-i ${hypothetical_tmp_gff} \\
-o ${meta.prefix}_annotations.gff ${hypothetical_ips_flags}
+ prepare_gff_for_conversion.py \\
+ -i ${meta.prefix}_annotations.gff \\
+ -o ${meta.prefix}_submission.gff
+
if [ "${params.fast}" == "false" ]; then
add_interpro_descriptions.py \\
--ipr-entries ${interpro_entry_list}/entry.list \\
@@ -121,6 +126,7 @@ process ANNOTATE_GFF {
stub:
"""
touch ${meta.prefix}_annotations.gff
+ touch ${meta.prefix}_submission.gff
touch ${meta.prefix}_pseudogene_report.txt
if [ "${params.fast}" == "false" ]; then
touch ${meta.prefix}_annotations_with_descriptions.gff