Skip to content

Commit

Permalink
Merge pull request #52 from EBI-Metagenomics/feature/conversion-prep
Browse files Browse the repository at this point in the history
Feature/conversion prep
  • Loading branch information
tgurbich authored Dec 20, 2024
2 parents 2be673a + 3e880e1 commit a6c0d9a
Show file tree
Hide file tree
Showing 7 changed files with 385 additions and 5 deletions.
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
- [ Usage ](#usage)
- [ Test ](#test)
- [ Outputs ](#out)
- [Preparing annotations for ENA or GenBank submission](#submission)
- [ Mobilome annotation ](#mobilome)
- [ Credits ](#credit)
- [ Contributions and Support ](#contribute)
Expand Down Expand Up @@ -236,6 +237,8 @@ Reference databases
database for version 4.0 on this ftp location:
ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/tool-dbs/dbcan/dbcan_4.0.tar.gz
--dbcan_db_version [string] The dbCAN reference database version. [default: 4.1.3_V12]
--pseudofinder_db [string] Pseudofinder reference database. Mettannotator uses SwissProt as the database for Pseudofinder.
--pseudofinder_db_version [string] SwissProt version. [default: 2024_06]
Generic options
--multiqc_methods_description [string] Custom MultiQC yaml file containing HTML including a methods description.
Expand Down Expand Up @@ -461,6 +464,33 @@ The output folders of each individual tool contain select output files of the th

Note: if the pipeline completed without errors but some of the tool-specific output folders are empty, those particular tools did not generate any annotations to output.

<a name="submission"></a>

## Preparing annotations for ENA or GenBank submission

`mettannotator` produces a final annotation file in GFF3 format. To submit the annotations to data archives, the GFF3 file must first be converted into the format required by the archive using available third-party tools. For this purpose, `mettannotator` also outputs a specially formatted GFF3 file, named `<prefix>_submission.gff`, to be used with the converters.

### ENA

ENA accepts annotations in the EMBL flat-file format.
Please use [EMBLmyGFF3](https://github.com/NBISweden/EMBLmyGFF3) to perform the conversion; the repository includes detailed instructions. The two files required for conversion are:

- the genome FASTA file
- `<mettannotator_results_folder>/<prefix>/functional_annotation/merged_gff/<prefix>_submission.gff`

Please note that it is necessary to register the project and locus tags in ENA prior to conversion. Follow the links in the [EMBLmyGFF3](https://github.com/NBISweden/EMBLmyGFF3) repository for more details.

### GenBank

To convert annotations for GenBank submission, please use [table2asn](https://www.ncbi.nlm.nih.gov/genbank/table2asn/).
Three files are required:

- the genome FASTA file
- `<mettannotator_results_folder>/<prefix>/functional_annotation/merged_gff/<prefix>_submission.gff`
- the submission template file (can be generated [here](https://submit.ncbi.nlm.nih.gov/genbank/template/submission/))

More instructions on running `table2asn` are available via [GenBank](https://www.ncbi.nlm.nih.gov/genbank/genomes_gff/).

<a name="mobilome"></a>

## Mobilome annotation
Expand Down
2 changes: 1 addition & 1 deletion bin/add_locus_tag_to_trna.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@


def main(input_file, output_file):
with open(input_file, "r") as file_in, open(output_file, "w") as file_out:
with open(input_file) as file_in, open(output_file, "w") as file_out:
for line in file_in:
if line.startswith("#") or not line.strip():
# Write header or empty lines as is
Expand Down
163 changes: 161 additions & 2 deletions bin/annotate_gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -680,7 +680,7 @@ def get_ncrnas(ncrnas_file):
counts += 1
contig = cols[3]
locus = f"{contig}_ncRNA{counts}"
product = cols[-1]
product = " ".join(cols[28:])
model = cols[2]
if model == "RF00005":
# Skip tRNAs, we add them from tRNAscan-SE
Expand All @@ -692,19 +692,22 @@ def get_ncrnas(ncrnas_file):
else:
start = int(cols[10])
end = int(cols[9])
rna_feature_name, ncrna_class = prepare_rna_gff_fields(cols)
annot = [
"ID=" + locus,
"inference=Rfam:14.9",
"locus_tag=" + locus,
"product=" + product,
"rfam=" + model,
]
if ncrna_class:
annot.append(f"ncRNA_class={ncrna_class}")
annot = ";".join(annot)
newline = "\t".join(
[
contig,
"INFERNAL:1.1.4",
"ncRNA",
rna_feature_name,
str(start),
str(end),
".",
Expand All @@ -719,6 +722,162 @@ def get_ncrnas(ncrnas_file):
return ncrnas


# Covariance-model names (column 1 of a hit row) that are written out as
# "rRNA" features instead of "ncRNA" features.
_RRNA_MODEL_NAMES = frozenset(
    {"LSU_rRNA_bacteria", "SSU_rRNA_bacteria", "5S_rRNA"}
)

# Index of the first token of the model description in a hit row.
# get_ncrnas() builds the product from cols[28:], so the same offset is used
# here to look at the whole description rather than only its last token.
_DESCRIPTION_START_COL = 28

# Rfam accessions grouped by the ncRNA_class value reported for them.
_RFAM_ACCESSIONS_BY_NCRNA_CLASS = {
    "antisense_RNA": [
        "RF00039",
        "RF00042",
        "RF00057",
        "RF00106",
        "RF00107",
        "RF00236",
        "RF00238",
        "RF00240",
        "RF00242",
        "RF00262",
        "RF00388",
        "RF00489",
        "RF01695",
        "RF01794",
        "RF01797",
        "RF01809",
        "RF01813",
        "RF02194",
        "RF02235",
        "RF02236",
        "RF02237",
        "RF02238",
        "RF02239",
        "RF02519",
        "RF02550",
        "RF02558",
        "RF02559",
        "RF02560",
        "RF02563",
        "RF02592",
        "RF02662",
        "RF02674",
        "RF02735",
        "RF02743",
        "RF02792",
        "RF02793",
        "RF02812",
        "RF02818",
        "RF02819",
        "RF02820",
        "RF02839",
        "RF02843",
        "RF02844",
        "RF02846",
        "RF02850",
        "RF02851",
        "RF02855",
        "RF02873",
        "RF02874",
        "RF02875",
        "RF02876",
        "RF02891",
        "RF02892",
        "RF02903",
        "RF02908",
    ],
    "autocatalytically_spliced_intron": ["RF01807"],
    "ribozyme": [
        "RF00621",
        "RF01787",
        "RF01788",
        "RF01865",
        "RF02678",
        "RF02679",
        "RF02681",
        "RF02682",
        "RF02684",
        "RF03154",
        "RF03160",
        "RF04188",
    ],
    "hammerhead_ribozyme": [
        "RF00008",
        "RF00163",
        "RF02275",
        "RF02276",
        "RF02277",
        "RF03152",
    ],
    "RNase_P_RNA": [
        "RF00009",
        "RF00010",
        "RF00011",
        "RF00373",
        "RF01577",
        "RF02357",
    ],
    "RNase_MRP_RNA": ["RF00030", "RF02472"],
    "telomerase_RNA": ["RF00024", "RF00025", "RF01050", "RF02462"],
    "scaRNA": [
        "RF00231",
        "RF00283",
        "RF00286",
        "RF00422",
        "RF00423",
        "RF00424",
        "RF00426",
        "RF00427",
        "RF00478",
        "RF00492",
        "RF00553",
        "RF00564",
        "RF00565",
        "RF00582",
        "RF00601",
        "RF00602",
        "RF01268",
        "RF01295",
        "RF02665",
        "RF02666",
        "RF02667",
        "RF02668",
        "RF02669",
        "RF02670",
        "RF02718",
        "RF02719",
        "RF02720",
        "RF02721",
        "RF02722",
    ],
    "snRNA": ["RF01802"],
    "SRP_RNA": [
        "RF00017",
        "RF00169",
        "RF01502",
        "RF01570",
        "RF01854",
        "RF01855",
        "RF01856",
        "RF01857",
        "RF04183",
    ],
    "vault_RNA": ["RF00006"],
    "Y_RNA": ["RF00019", "RF02553", "RF01053", "RF02565"],
}


def _index_ncrna_classes(accessions_by_class):
    """Invert a class -> accessions table into an accession -> class dict.

    If an accession is listed under several classes, the first class in the
    table's iteration order wins, which matches a first-match linear scan.
    """
    index = {}
    for rna_type, rfams in accessions_by_class.items():
        for rfam in rfams:
            index.setdefault(rfam, rna_type)
    return index


# Built once at import time so each hit costs a single dict lookup.
_NCRNA_CLASS_BY_RFAM = _index_ncrna_classes(_RFAM_ACCESSIONS_BY_NCRNA_CLASS)


def prepare_rna_gff_fields(cols):
    """Choose the GFF feature type and ncRNA_class for one ncRNA hit row.

    Args:
        cols: List of string fields of a single hit row. The function reads
            cols[1] (model name), cols[2] (Rfam accession) and the model
            description, taken as cols[28:] when the row is long enough and
            as cols[-1] otherwise.

    Returns:
        A tuple ``(rna_feature_name, ncrna_class)``:

        * ``("rRNA", "")`` when the model name is one of the rRNA models;
        * ``("ncRNA", <class>)`` otherwise, where ``<class>`` is the class
          listed for the accession, ``"pre_miRNA"`` when the accession is
          not listed and the description mentions ``microRNA``, and
          ``"other"`` in every remaining case.

    Raises:
        IndexError: If ``cols`` is too short to hold the fields that are read.
    """
    if cols[1] in _RRNA_MODEL_NAMES:
        # rRNA features carry no ncRNA_class attribute.
        return "rRNA", ""

    ncrna_class = _NCRNA_CLASS_BY_RFAM.get(cols[2])
    if ncrna_class is None:
        # The description may span several tokens (get_ncrnas joins
        # cols[28:]), so inspect all of them; looking only at the last token
        # misses descriptions such as "mir-10 microRNA precursor family".
        # Short rows keep the historical behavior of checking the last field.
        if len(cols) > _DESCRIPTION_START_COL:
            description = " ".join(cols[_DESCRIPTION_START_COL:])
        else:
            description = cols[-1]
        ncrna_class = "pre_miRNA" if "microRNA" in description else "other"
    return "ncRNA", ncrna_class


def get_trnas(trnas_file):
trnas = {}
with open(trnas_file) as f:
Expand Down
2 changes: 1 addition & 1 deletion bin/circos_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def main(
if "defense_finder_type" in feature.qualifiers:
antiphage_track.genomic_features([feature], fc="orchid")

elif feature.type in ["tRNA", "ncRNA"]:
elif feature.type in ["tRNA", "ncRNA", "rRNA"]:
rna_track.genomic_features([feature], fc="darkmagenta")
elif mobilome and feature.type in [
"mobility_island",
Expand Down
Loading

0 comments on commit a6c0d9a

Please sign in to comment.