diff --git a/README.md b/README.md index 066292d..da773c3 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ - [ Usage ](#usage) - [ Test ](#test) - [ Outputs ](#out) +- [Preparing annotations for ENA or GenBank submission](#submission) - [ Mobilome annotation ](#mobilome) - [ Credits ](#credit) - [ Contributions and Support ](#contribute) @@ -236,6 +237,8 @@ Reference databases database for version 4.0 on this ftp location: ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/tool-dbs/dbcan/dbcan_4.0.tar.gz --dbcan_db_version [string] The dbCAN reference database version. [default: 4.1.3_V12] + --pseudofinder_db [string] Pseudofinder reference database. Mettannotator uses SwissProt as the database for Pseudofinder. + --pseudofinder_db_version [string] SwissProt version. [default: 2024_06] Generic options --multiqc_methods_description [string] Custom MultiQC yaml file containing HTML including a methods description. @@ -461,6 +464,33 @@ The output folders of each individual tool contain select output files of the th Note: if the pipeline completed without errors but some of the tool-specific output folders are empty, those particular tools did not generate any annotations to output. + + +## Preparing annotations for ENA or GenBank submission + +`mettannotator` produces a final annotation file in GFF3 format. To submit the annotations to data archives, it is first necessary to convert the GFF3 file into the required format, using third-party tools available. `mettannotator` outputs a specially formatted GFF3 file, named `_submission.gff` to be used with converters. + +### ENA + +ENA accepts annotations in the EMBL flat-file format. +Please use [EMBLmyGFF3](https://github.com/NBISweden/EMBLmyGFF3) to perform the conversion; the repository includes detailed instructions. The two files required for conversion are: + +- the genome FASTA file +- `//functional_annotation/merged_gff/_submission.gff` + +Please note that it is necessary to register the project and locus tags in ENA prior to conversion. Follow links in the [EMBLmyGFF3](https://github.com/NBISweden/EMBLmyGFF3) repository for more details. + +### GenBank + +To convert annotations for GenBank submission, please use [table2asn](https://www.ncbi.nlm.nih.gov/genbank/table2asn/). +Three files are required: + +- the genome FASTA file +- `//functional_annotation/merged_gff/_submission.gff` +- Submission template file (can be generated [here](https://submit.ncbi.nlm.nih.gov/genbank/template/submission/)) + +More instructions on running `table2asn` are available via [GenBank](https://www.ncbi.nlm.nih.gov/genbank/genomes_gff/). + ## Mobilome annotation diff --git a/bin/add_locus_tag_to_trna.py b/bin/add_locus_tag_to_trna.py index 629b90e..d1889bc 100755 --- a/bin/add_locus_tag_to_trna.py +++ b/bin/add_locus_tag_to_trna.py @@ -19,7 +19,7 @@ def main(input_file, output_file): - with open(input_file, "r") as file_in, open(output_file, "w") as file_out: + with open(input_file) as file_in, open(output_file, "w") as file_out: for line in file_in: if line.startswith("#") or not line.strip(): # Write header or empty lines as is diff --git a/bin/annotate_gff.py b/bin/annotate_gff.py index 8026fc3..3ca7826 100755 --- a/bin/annotate_gff.py +++ b/bin/annotate_gff.py @@ -680,7 +680,7 @@ def get_ncrnas(ncrnas_file): counts += 1 contig = cols[3] locus = f"{contig}_ncRNA{counts}" - product = cols[-1] + product = " ".join(cols[28:]) model = cols[2] if model == "RF00005": # Skip tRNAs, we add them from tRNAscan-SE @@ -692,6 +692,7 @@ def get_ncrnas(ncrnas_file): else: start = int(cols[10]) end = int(cols[9]) + rna_feature_name, ncrna_class = prepare_rna_gff_fields(cols) annot = [ "ID=" + locus, "inference=Rfam:14.9", @@ -699,12 +700,14 @@ def get_ncrnas(ncrnas_file): "product=" + product, "rfam=" + model, ] + if ncrna_class: + annot.append(f"ncRNA_class={ncrna_class}") annot = ";".join(annot) newline = "\t".join( [ contig, "INFERNAL:1.1.4", - "ncRNA", + rna_feature_name, str(start), str(end), ".", @@ -719,6 +722,162 @@ def get_ncrnas(ncrnas_file): return ncrnas +def prepare_rna_gff_fields(cols): + rna_feature_name = "ncRNA" + if cols[1] in ["LSU_rRNA_bacteria", "SSU_rRNA_bacteria", "5S_rRNA"]: + rna_feature_name = "rRNA" + ncrna_class = "" + rna_types = { + "antisense_RNA": [ + "RF00039", + "RF00042", + "RF00057", + "RF00106", + "RF00107", + "RF00236", + "RF00238", + "RF00240", + "RF00242", + "RF00262", + "RF00388", + "RF00489", + "RF01695", + "RF01794", + "RF01797", + "RF01809", + "RF01813", + "RF02194", + "RF02235", + "RF02236", + "RF02237", + "RF02238", + "RF02239", + "RF02519", + "RF02550", + "RF02558", + "RF02559", + "RF02560", + "RF02563", + "RF02592", + "RF02662", + "RF02674", + "RF02735", + "RF02743", + "RF02792", + "RF02793", + "RF02812", + "RF02818", + "RF02819", + "RF02820", + "RF02839", + "RF02843", + "RF02844", + "RF02846", + "RF02850", + "RF02851", + "RF02855", + "RF02873", + "RF02874", + "RF02875", + "RF02876", + "RF02891", + "RF02892", + "RF02903", + "RF02908", + ], + "autocatalytically_spliced_intron": ["RF01807"], + "ribozyme": [ + "RF00621", + "RF01787", + "RF01788", + "RF01865", + "RF02678", + "RF02679", + "RF02681", + "RF02682", + "RF02684", + "RF03154", + "RF03160", + "RF04188", + ], + "hammerhead_ribozyme": [ + "RF00008", + "RF00163", + "RF02275", + "RF02276", + "RF02277", + "RF03152", + ], + "RNase_P_RNA": [ + "RF00009", + "RF00010", + "RF00011", + "RF00373", + "RF01577", + "RF02357", + ], + "RNase_MRP_RNA": ["RF00030", "RF02472"], + "telomerase_RNA": ["RF00024", "RF00025", "RF01050", "RF02462"], + "scaRNA": [ + "RF00231", + "RF00283", + "RF00286", + "RF00422", + "RF00423", + "RF00424", + "RF00426", + "RF00427", + "RF00478", + "RF00492", + "RF00553", + "RF00564", + "RF00565", + "RF00582", + "RF00601", + "RF00602", + "RF01268", + "RF01295", + "RF02665", + "RF02666", + "RF02667", + "RF02668", + "RF02669", + "RF02670", + "RF02718", + "RF02719", + "RF02720", + "RF02721", + "RF02722", + ], + "snRNA": ["RF01802"], + "SRP_RNA": [ + "RF00017", + "RF00169", + "RF01502", + "RF01570", + "RF01854", + "RF01855", + "RF01856", + "RF01857", + "RF04183", + ], + "vault_RNA": ["RF00006"], + "Y_RNA": ["RF00019", "RF02553", "RF01053", "RF02565"], + } + + if rna_feature_name == "ncRNA": + for rna_type, rfams in rna_types.items(): + if cols[2] in rfams: + ncrna_class = rna_type + break + if not ncrna_class: + if "microRNA" in cols[-1]: + ncrna_class = "pre_miRNA" + else: + ncrna_class = "other" + return rna_feature_name, ncrna_class + + def get_trnas(trnas_file): trnas = {} with open(trnas_file) as f: diff --git a/bin/circos_plot.py b/bin/circos_plot.py index 7f1509e..01c5c41 100755 --- a/bin/circos_plot.py +++ b/bin/circos_plot.py @@ -114,7 +114,7 @@ def main( if "defense_finder_type" in feature.qualifiers: antiphage_track.genomic_features([feature], fc="orchid") - elif feature.type in ["tRNA", "ncRNA"]: + elif feature.type in ["tRNA", "ncRNA", "rRNA"]: rna_track.genomic_features([feature], fc="darkmagenta") elif mobilome and feature.type in [ "mobility_island", diff --git a/bin/prepare_gff_for_conversion.py b/bin/prepare_gff_for_conversion.py new file mode 100755 index 0000000..588023c --- /dev/null +++ b/bin/prepare_gff_for_conversion.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 + +# Copyright 2023-2024 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import re + + +def main(infile, outfile): + with open(infile) as file_in, open(outfile, "w") as file_out: + fasta_flag = False + for line in file_in: + if line.startswith("##FASTA"): + fasta_flag = True + file_out.write(line) + elif line.startswith("#"): + file_out.write(line) + elif fasta_flag: + file_out.write(line) + else: + new_line = modify_line(line) + file_out.write(new_line + "\n") + + +def modify_line(line): + columns = line.strip().split("\t") + if columns[2] in ["region", "tRNA"]: + return line.strip() + + # Parse the 9th field into key-value pairs + attributes_dict = dict( + re.split(r"(? 0: + text_to_add = ",".join(formatted_list_to_move) + if "Dbxref" in attributes_dict: + attributes_dict["Dbxref"] = attributes_dict["Dbxref"] + "," + text_to_add + else: + attributes_dict["Dbxref"] = text_to_add + attributes_dict["Dbxref"] = attributes_dict["Dbxref"].rstrip(",").lstrip(",") + return attributes_dict + + +def collect_go_terms(attributes_dict): + # Function collects GO terms from across the 9th column into one list and deduplicates them + go_terms = list() + from_fields = ["Dbxref", "Ontology_term", "uf_ontology_term"] + for field in from_fields: + if field in attributes_dict: + annotations = attributes_dict[field].split(",") + for annotation in annotations[ + : + ]: # Using a copy of the list to avoid iteration issues + if annotation.startswith("GO:"): + go_terms.append(annotation) + annotations.remove(annotation) + if len(annotations) > 0: + modified_annotations = ",".join(annotations) + attributes_dict[field] = modified_annotations + else: + del attributes_dict[field] + # deduplicate + go_terms = list(set(go_terms)) + if len(go_terms) > 0: + if "Dbxref" in attributes_dict: + attributes_dict["Dbxref"] = ( + attributes_dict["Dbxref"] + "," + ",".join(go_terms) + ) + else: + attributes_dict["Dbxref"] = ",".join(go_terms) + attributes_dict["Dbxref"] = attributes_dict["Dbxref"].rstrip(",") + return attributes_dict + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Script processes final GFF to prepare for conversion " + "for ENA/GenBank." + ) + parser.add_argument( + "-i", "--infile", required=True, help="Path to the GFF file to plot" + ) + parser.add_argument( + "-o", "--outfile", required=True, help="Path to the output file" + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + main( + args.infile, + args.outfile, + ) diff --git a/conf/base.config b/conf/base.config index 1ddc6f1..e87751f 100644 --- a/conf/base.config +++ b/conf/base.config @@ -97,6 +97,8 @@ process { } withName: PSEUDOFINDER { cpus = { check_max( 8 * task.attempt, 'cpus' ) } - memory = { check_max( 10.GB * task.attempt, 'memory' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + errorStrategy = 'retry' + maxRetries = 3 } } diff --git a/modules/local/annotate_gff.nf b/modules/local/annotate_gff.nf index 78c8686..6772823 100644 --- a/modules/local/annotate_gff.nf +++ b/modules/local/annotate_gff.nf @@ -31,6 +31,7 @@ process ANNOTATE_GFF { output: tuple val(meta), path("*_annotations.gff"), emit: annotated_gff tuple val(meta), path("*_annotations_with_descriptions.gff"), optional: true, emit: annotated_desc_gff + tuple val(meta), path("*_submission.gff"), emit: submission_gff tuple val(meta), path("*_pseudogene_report.txt"), emit: pseudogene_report path "versions.yml", emit: versions @@ -105,6 +106,10 @@ process ANNOTATE_GFF { -i ${hypothetical_tmp_gff} \\ -o ${meta.prefix}_annotations.gff ${hypothetical_ips_flags} + prepare_gff_for_conversion.py \\ + -i ${meta.prefix}_annotations.gff \\ + -o ${meta.prefix}_submission.gff + if [ "${params.fast}" == "false" ]; then add_interpro_descriptions.py \\ --ipr-entries ${interpro_entry_list}/entry.list \\ @@ -121,6 +126,7 @@ process ANNOTATE_GFF { stub: """ touch ${meta.prefix}_annotations.gff + touch ${meta.prefix}_submission.gff touch ${meta.prefix}_pseudogene_report.txt if [ "${params.fast}" == "false" ]; then touch ${meta.prefix}_annotations_with_descriptions.gff