diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 01f5bc17..93a1003e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,8 +30,8 @@ jobs: - name: Check out pipeline code uses: actions/checkout@v3 - - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 + - name: Set up Nextflow + uses: nf-core/setup-nextflow@v2 with: version: "${{ matrix.NXF_VER }}" diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 1fcafe88..1282c673 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -44,7 +44,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install nf-core + pip install nf-core==2.14.1 - name: Run nf-core lint env: diff --git a/.github/workflows/pytest_ci.yml b/.github/workflows/pytest_ci.yml new file mode 100644 index 00000000..98008834 --- /dev/null +++ b/.github/workflows/pytest_ci.yml @@ -0,0 +1,32 @@ +name: Python Tests + +on: + push: + branches: [main] + pull_request: + branches: [main, dev] + +jobs: + test: + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: [3.9] + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + pip install -r requirements-dev.txt + + - name: Run tests + run: | + pytest diff --git a/.gitignore b/.gitignore index 175fdca1..d3523a85 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,49 @@ +# Ignore Nextflow-specific files, including logs, cache, and temporary files .nextflow* + +# Ignore Nextflow's work directory where intermediate files are stored work/ -data/ + +# Ignore results directory, as this is typically generated output that can be large and recreated results/ + +# macOS-specific hidden file that stores folder view settings .DS_Store + testing/ testing* -*.pyc + dbs/ + +node_modules/ + +# Optional: ignore any temporary files created by Python or text editors +# Python cache directory +__pycache__/ +*.pyc +# Python optimized bytecode files +*.pyo +# Pickle files (often generated during data processing) +*.pkl + +# Ignore any virtual environment directories used to isolate Python dependencies +venv/ +env/ +*.venv/ + +# Ignore Jupyter Notebook checkpoints, if notebooks are used for analysis or reporting +.ipynb_checkpoints/ + +# Ignore any temporary, swap, or backup files created by editors like Vim or Emacs +*~ +*.swp +*.swo +*.bak + +.coverage +htmlcov/ +*.cover +reports/ +trace/ +.cache/ +logs/ diff --git a/.nf-core.yml b/.nf-core.yml index 1f060a16..58700dad 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,4 +1,8 @@ repository_type: pipeline +org_path: ebi-metagenomics +template: + prefix: ebi-metagenomics + - github_badges lint: files_exist: - CODE_OF_CONDUCT.md @@ -26,10 +30,12 @@ lint: - docs/images/nf-core-mettannotator_logo_light.png - docs/images/nf-core-mettannotator_logo_dark.png - .github/ISSUE_TEMPLATE/bug_report.yml - nextflow_config: False - - params.custom_config_version - - params.custom_config_base - - manifest.name - - manifest.homePage + - .github/workflows/linting.yml + - .github/CONTRIBUTING.md + - .github/workflows/branch.yml + - .github/workflows/linting_comment.yml + - .github/PULL_REQUEST_TEMPLATE.md + - .gitignore + nextflow_config: false multiqc_config: - report_comment diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0c31cdb9..890027e0 100644 ---
a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,20 @@ repos: - repo: https://github.com/pre-commit/mirrors-prettier - rev: "v2.7.1" + rev: "v4.0.0-alpha.8" hooks: - id: prettier + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + - repo: https://github.com/psf/black + rev: 24.8.0 + hooks: + - id: black + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.8 + hooks: + - id: ruff + args: [--fix] diff --git a/.ruff.toml b/.ruff.toml new file mode 100644 index 00000000..87ea8418 --- /dev/null +++ b/.ruff.toml @@ -0,0 +1,12 @@ +line-length = 120 +target-version = "py38" +cache-dir = "~/.cache/ruff" + +[lint] +select = ["I", "E1", "E4", "E7", "E9", "F", "UP", "N"] + +[lint.isort] +known-first-party = ["nf_core"] + +[lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] diff --git a/CITATIONS.md b/CITATIONS.md index a115ddce..e4c55605 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -62,6 +62,10 @@ > Sanchez S, Rogers JD, Rogers AB, Nassar M, McEntyre J, Welch M, Hollfelder F, Finn RD. Expansion of novel biosynthetic gene clusters from diverse environments using SanntiS. bioRxiv 2023.05.23.540769; doi: https://doi.org/10.1101/2023.05.23.540769 +- [Pseudofinder](https://doi.org/10.1093/molbev/msac153) + + > Syberg-Olsen MJ, Garber AI, Keeling PJ, McCutcheon JP, Husnik F. Pseudofinder: Detection of Pseudogenes in Prokaryotic Genomes. Mol Biol Evol. 2022 Jul 2;39(7):msac153. doi: 10.1093/molbev/msac153. PMID: 35801562; PMCID: PMC9336565. + - [run_dbCAN](https://pubmed.ncbi.nlm.nih.gov/37125649/) > Zheng J, Ge Q, Yan Y, Zhang X, Huang L, Yin Y. dbCAN3: automated carbohydrate-active enzyme and substrate annotation. Nucleic Acids Res. 2023 Jul 5;51(W1):W115-W121. doi: 10.1093/nar/gkad328. PMID: 37125649; PMCID: PMC10320055. diff --git a/README.md b/README.md index dce9f002..da773c3c 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ - [ Usage ](#usage) - [ Test ](#test) - [ Outputs ](#out) +- [ Preparing annotations for ENA or GenBank submission ](#submission) - [ Mobilome annotation ](#mobilome) - [ Credits ](#credit) - [ Contributions and Support ](#contribute) @@ -39,6 +40,8 @@ The workflow uses the following tools and databases: | [Prokka](https://github.com/tseemann/prokka) | 1.14.6 | CDS calling and functional annotation (default) | | [Bakta](https://github.com/oschwengers/bakta) | 1.9.3 | CDS calling and functional annotation (if --bakta flag is used) | | [Bakta db](https://zenodo.org/record/10522951/) | 2024-01-19 with AMRFinderPlus DB 2024-01-31.1 | Bakta DB (when Bakta is used as the gene caller) | +| [Pseudofinder](https://github.com/filip-husnik/pseudofinder) | v1.1.0 | Identification of possible pseudogenes | +| [Swiss-Prot](https://www.uniprot.org/help/downloads) | 2024_06 | Database for Pseudofinder | | [InterProScan](https://www.ebi.ac.uk/interpro/about/interproscan/) | 5.62-94.0 | Protein annotation (InterPro, Pfam) | | [eggNOG-mapper](https://github.com/eggnogdb/eggnog-mapper) | 2.1.11 | Protein annotation (eggNOG, KEGG, COG, GO-terms) | | [eggNOG DB](http://eggnog6.embl.de/download/) | 5.0.2 | Database for eggNOG-mapper | @@ -89,7 +92,8 @@ The pipeline needs reference databases in order to work, they take roughly 180G.
| interproscan | 45G | | interpro_entry_list | 2.6M | | rfam_models | 637M | -| total | 180G | +| pseudofinder | 273M | +| total | 182G | `mettannotator` has an automated mechanism to download the databases using the `--dbs ` flag. When this flag is provided, the pipeline inspects the folder to verify if the required databases are already present. If any of the databases are missing, the pipeline will automatically download them. @@ -177,8 +181,12 @@ Note, that by default the script uses FASTA file names as prefixes and truncates Running `mettannotator` with the `--help` option will pull the repository and display the help message: +> [!NOTE] +> We use the `-latest` flag with the `nextflow run` command, which ensures that the latest available version of the pipeline is pulled. +> If you encounter any issues with the `nextflow run` command, please refer to the [Nextflow documentation](https://www.nextflow.io/docs/latest/reference/cli.html#run). + ```angular2html -nextflow run ebi-metagenomics/mettannotator/main.nf --help +$ nextflow run -latest ebi-metagenomics/mettannotator/main.nf --help N E X T F L O W ~ version 23.04.3 Launching `mettannotator/main.nf` [disturbed_davinci] DSL2 - revision: f2a0e51af6 @@ -229,6 +237,8 @@ Reference databases database for version 4.0 on this ftp location: ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/tool-dbs/dbcan/dbcan_4.0.tar.gz --dbcan_db_version [string] The dbCAN reference database version. [default: 4.1.3_V12] + --pseudofinder_db [string] Pseudofinder reference database. Mettannotator uses SwissProt as the database for Pseudofinder. + --pseudofinder_db_version [string] SwissProt version. [default: 2024_06] Generic options --multiqc_methods_description [string] Custom MultiQC yaml file containing HTML including a methods description. @@ -259,17 +269,25 @@ nextflow run ebi-metagenomics/mettannotator \ --dbs ``` -> **Warning:** +> [!WARNING] > Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those > provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; > see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). +#### Running the pipeline from the source code + +If the Nextflow integration with Git does not work, users can download the tarball from the releases page. After extracting the tarball, the pipeline can be run directly by executing the following command: + +```bash +$ nextflow run path-to-source-code/main.nf --help +``` + #### Local execution The pipeline can be run on a desktop or laptop, with the caveat that it will take a few hours to complete depending on the resources. There is a local profile in the Nextflow config that limits the total resources the pipeline can use to 8 cores and 12 GB of RAM. 
In order to run it (Docker or Singularity are still required): ```bash -nextflow run ebi-metagenomics/mettannotator \ +nextflow run -latest ebi-metagenomics/mettannotator \ -profile local, \ --input assemblies_sheet.csv \ --outdir \ @@ -302,7 +320,7 @@ To run the pipeline using a test dataset, execute the following command: ```bash wget https://raw.githubusercontent.com/EBI-Metagenomics/mettannotator/master/tests/test.csv -nextflow run ebi-metagenomics/mettannotator \ +nextflow run -latest ebi-metagenomics/mettannotator \ -profile \ --input test.csv \ --outdir \ @@ -331,6 +349,7 @@ The output folder structure will look as follows: │ ├─interproscan │ ├─merged_gff │ ├─prokka + │ ├─pseudofinder │ └─unifire ├─mobilome │ └─crisprcas_finder @@ -429,12 +448,49 @@ The following logic is used by `mettannotator` to fill out the `product` field i If the pipeline is executed with the `--fast` flag, only the output of eggNOG-mapper is used to determine the product of proteins that were labeled as hypothetical by the gene caller. +#### Detection of pseudogenes and spurious ORFs + +`mettannotator` uses several approaches to detect pseudogenes and spurious ORFs: + +- If Bakta is used as the initial annotation tool, `mettannotator` will inherit the pseudogene labels assigned by Bakta. +- `mettannotator` runs Pseudofinder and labels genes that Pseudofinder predicts to be pseudogenes by adding `"pseudo=true"` to the 9th column of the final merged GFF file. If Pseudofinder and Bakta disagree, the gene is labeled as a pseudogene as long as either tool calls it one. +- AntiFam, which is part of InterPro, is used to identify potential spurious ORFs. If an ORF has an AntiFam hit, `mettannotator` will remove it from the final merged GFF file. These ORFs will still appear in the raw outputs of Bakta/Prokka and may appear in other tool outputs. + +`mettannotator` produces a report file, located in the `merged_gff` folder, that lists CDS with AntiFam hits and pseudogenes. For each pseudogene, the report shows which tool predicted it. + ### Contents of the tool output folders The output folders of each individual tool contain select output files of the third-party tools used by `mettannotator`. For file descriptions, please refer to the tool documentation. For some tools that don't output a GFF, `mettannotator` converts the output into a GFF. Note: if the pipeline completed without errors but some of the tool-specific output folders are empty, those particular tools did not generate any annotations to output. + +<a name="submission"></a> + +## Preparing annotations for ENA or GenBank submission + +`mettannotator` produces a final annotation file in GFF3 format. To submit the annotations to data archives, it is first necessary to convert the GFF3 file into the required format using available third-party tools. `mettannotator` outputs a specially formatted GFF3 file, named `_submission.gff`, to be used with the converters. + +### ENA + +ENA accepts annotations in the EMBL flat-file format. +Please use [EMBLmyGFF3](https://github.com/NBISweden/EMBLmyGFF3) to perform the conversion; the repository includes detailed instructions. The two files required for conversion are: + +- the genome FASTA file +- `//functional_annotation/merged_gff/_submission.gff` + +Please note that it is necessary to register the project and locus tags in ENA prior to conversion. Follow the links in the [EMBLmyGFF3](https://github.com/NBISweden/EMBLmyGFF3) repository for more details; a minimal example invocation is sketched below.
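+ +For reference, a minimal `EMBLmyGFF3` command could look like the sketch below. This is an illustrative example rather than part of the pipeline: `my_assembly_submission.gff` and `my_assembly.fasta` stand in for your own files, and the species, locus tag, and project ID values are placeholders that must match what was registered in ENA (see `EMBLmyGFF3 --help` for the full list of options). + +```bash +EMBLmyGFF3 my_assembly_submission.gff my_assembly.fasta \ + --topology linear \ + --molecule_type "genomic DNA" \ + --transl_table 11 \ + --species "Escherichia coli" \ + --locus_tag MYLOCUSTAG \ + --project_id PRJEB00000 \ + -o my_assembly.embl +```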
+ +### GenBank + +To convert annotations for GenBank submission, please use [table2asn](https://www.ncbi.nlm.nih.gov/genbank/table2asn/). +Three files are required: + +- the genome FASTA file +- `//functional_annotation/merged_gff/_submission.gff` +- Submission template file (can be generated [here](https://submit.ncbi.nlm.nih.gov/genbank/template/submission/)) + +More instructions on running `table2asn` are available via [GenBank](https://www.ncbi.nlm.nih.gov/genbank/genomes_gff/). + ## Mobilome annotation diff --git a/bin/__init__.py b/bin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bin/add_hypothetical_protein_descriptions.py b/bin/add_hypothetical_protein_descriptions.py index f4d9157f..ea93ef34 100755 --- a/bin/add_hypothetical_protein_descriptions.py +++ b/bin/add_hypothetical_protein_descriptions.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # Copyright 2024 EMBL - European Bioinformatics Institute # @@ -19,6 +18,7 @@ import logging import re import sys +from enum import Enum logging.basicConfig(level=logging.INFO) @@ -28,29 +28,41 @@ MINIMUM_IPR_MATCH = 0.10 +class GeneCaller(Enum): + PROKKA = "Prokka" + BAKTA = "Bakta" + + def main(ipr_types_file, ipr_file, hierarchy_file, eggnog_file, infile, outfile): eggnog_info = load_eggnog(eggnog_file) if ipr_file: levels = load_hierarchy(hierarchy_file) ipr_types = load_ipr_types(ipr_types_file) - ipr_info, ipr_memberdb_only, ipr_leveled_info = load_ipr( - ipr_file, ipr_types, levels - ) + ipr_info, ipr_memberdb_only, _ = load_ipr(ipr_file, ipr_types, levels) else: ipr_info = dict() ipr_memberdb_only = dict() - gene_caller = "Prokka" + + gene_caller = GeneCaller.PROKKA fasta_flag = False - with open(infile, "r") as file_in, open(outfile, "w") as file_out: + with open(infile) as file_in, open(outfile, "w") as file_out: for line in file_in: if not fasta_flag: if line.startswith("##FASTA"): fasta_flag = True file_out.write(line) elif not line.startswith("#"): - contig, tool, feature, start, end, blank1, strand, blank2, col9 = ( - line.strip().split("\t") - ) + ( + contig, + tool, + feature, + start, + end, + blank1, + strand, + blank2, + col9, + ) = line.strip().split("\t") if feature == "CDS": attributes_dict = dict( re.split(r"(? 
best_level or ( @@ -524,13 +549,13 @@ def pull_out_description(my_dict, first_priority, second_priority): def load_eggnog(file): eggnog_info = dict() - with open(file, "r") as file_in: + with open(file) as file_in: for line in file_in: if not line.startswith("#"): cols = line.strip().split("\t") try: evalue = float(cols[2]) - except: + except: # noqa: E722 continue if evalue > EVALUE_CUTOFF: continue @@ -751,7 +776,7 @@ def clean_up_eggnog_function(func_description): def load_ipr_types(ipr_types_file): ipr_types = dict() - with open(ipr_types_file, "r") as file_in: + with open(ipr_types_file) as file_in: for line in file_in: if line.startswith("IPR"): acc, type, _ = line.strip().split("\t") @@ -764,7 +789,7 @@ def load_ipr(file, ipr_types, ipr_levels): ipr_leveled_info = dict() ipr_memberdb_only = dict() # hit only exists in a member database - with open(file, "r") as file_in: + with open(file) as file_in: for line in file_in: cols = line.strip().split("\t") ( @@ -900,7 +925,7 @@ def escape_reserved_characters(string): if ch == ";": string = string.replace(ch, "/") else: - string = string.replace(ch, "\{}".format(ch)) + string = string.replace(ch, f"\{ch}") return string diff --git a/bin/add_interpro_descriptions.py b/bin/add_interpro_descriptions.py index 5fa601d7..7ab433f0 100755 --- a/bin/add_interpro_descriptions.py +++ b/bin/add_interpro_descriptions.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # Copyright 2024 EMBL - European Bioinformatics Institute # @@ -21,7 +20,7 @@ def main(ipr_types_file, infile, outfile): ipr_types_and_descriptions = load_ipr(ipr_types_file) - with open(infile, "r") as file_in, open(outfile, "w") as file_out: + with open(infile) as file_in, open(outfile, "w") as file_out: fasta_flag = False for line in file_in: if not fasta_flag: @@ -108,7 +107,7 @@ def load_ipr(ipr_types_file): "PTM": "S", "Repeat": "R", } - with open(ipr_types_file, "r") as file_in: + with open(ipr_types_file) as file_in: for line in file_in: if line.startswith("IPR"): acc, ipr_type, desc = line.strip().split("\t") diff --git a/bin/add_locus_tag_to_trna.py b/bin/add_locus_tag_to_trna.py new file mode 100755 index 00000000..d1889bc0 --- /dev/null +++ b/bin/add_locus_tag_to_trna.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 + +# Copyright 2024 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import re + + +def main(input_file, output_file): + with open(input_file) as file_in, open(output_file, "w") as file_out: + for line in file_in: + if line.startswith("#") or not line.strip(): + # Write header or empty lines as is + file_out.write(line) + continue + + columns = line.strip().split("\t") + if len(columns) < 9: + file_out.write(line) + continue + + if columns[2] != "tRNA": + file_out.write(line) + continue + + # Parse the attributes (column 9) + attributes = columns[8].rstrip(";") + attributes_dict = dict( + re.split(r"(? 
0: + logging.info("The following genes were only identified in the standard GFF:") + for gene in unique_to_standard: + logging.info(gene) + if len(unique_to_compliant) > 0: + logging.info("The following genes were only identified in the compliant GFF:") + for gene in unique_to_compliant: + logging.info(gene) + return name_translation + + +def load_gff(gff_file, chromosome_dictionary): + genes = dict() + with open(gff_file) as file_in: + for line in file_in: + if line.startswith("##FASTA"): + return genes + elif line.startswith("#"): + continue + else: + chromosome, _, feature, start, end, _, strand, _, col9 = ( + line.strip().split("\t") + ) + if "|" in chromosome: + chromosome = chromosome.split("|")[-1] + if chromosome in chromosome_dictionary: + chromosome = chromosome_dictionary[chromosome] + if feature == "CDS": + attributes_dict = dict( + re.split(r"(? 1e-10: continue + if cols[3] == "AntiFam": + antifams.append(protein) + continue if protein not in iprs: iprs[protein] = [set(), set()] if cols[3] == "Pfam": @@ -128,12 +159,12 @@ def get_iprs(ipr_annot): ipr = cols[11] if not ipr == "-": iprs[protein][1].add(ipr) - return iprs + return iprs, antifams def get_eggnog(eggnog_annot): eggnogs = {} - with open(eggnog_annot, "r") as f: + with open(eggnog_annot) as f: for line in f: line = line.rstrip() cols = line.split("\t") @@ -183,7 +214,7 @@ def get_bgcs(bgc_file, prokka_gff, tool): return bgc_annotations # save positions of each BGC cluster to dictionary cluster_positions # and save the annotations to dictionary bgc_result - with open(bgc_file, "r") as bgc_in: + with open(bgc_file) as bgc_in: for line in bgc_in: if not line.startswith("#"): ( @@ -245,7 +276,7 @@ def get_bgcs(bgc_file, prokka_gff, tool): {"bgc_function": type_value}, ) # identify CDSs that fall into each of the clusters annotated by the BGC tool - with open(prokka_gff, "r") as gff_in: + with open(prokka_gff) as gff_in: for line in gff_in: if not line.startswith("#"): matching_interval = "" @@ -309,7 +340,7 @@ def get_amr(amr_file): amr_annotations = {} if not amr_file: return amr_annotations - with open(amr_file, "r") as f: + with open(amr_file) as f: for line in f: if line.startswith("Protein identifier"): continue @@ -339,13 +370,13 @@ def get_amr(amr_file): seq_name = seq_name.replace("=", " ") amr_annotations[protein_id] = ";".join( [ - "amrfinderplus_gene_symbol={}".format(gene_name), - "amrfinderplus_sequence_name={}".format(seq_name), - "amrfinderplus_scope={}".format(scope), - "element_type={}".format(element_type), - "element_subtype={}".format(element_subtype), - "drug_class={}".format(drug_class), - "drug_subclass={}".format(drug_subclass), + f"amrfinderplus_gene_symbol={gene_name}", + f"amrfinderplus_sequence_name={seq_name}", + f"amrfinderplus_scope={scope}", + f"element_type={element_type}", + f"element_subtype={element_subtype}", + f"drug_class={drug_class}", + f"drug_subclass={drug_subclass}", ] ) return amr_annotations @@ -356,7 +387,7 @@ def get_dbcan(dbcan_file): substrates = dict() if not dbcan_file: return dbcan_annotations - with open(dbcan_file, "r") as f: + with open(dbcan_file) as f: for line in f: if "predicted PUL" in line: annot_fields = line.strip().split("\t")[8].split(";") @@ -399,7 +430,7 @@ def get_defense_finder(df_file): type_info = dict() if not df_file: return defense_finder_annotations - with open(df_file, "r") as f: + with open(df_file) as f: for line in f: if "Anti-phage system" in line: annot_fields = line.strip().split("\t")[8].split(";") @@ -427,6 +458,29 @@ def 
get_defense_finder(df_file): return defense_finder_annotations +def get_pseudogenes(pseudofinder_file): + pseudogenes = dict() + if not pseudofinder_file: + return pseudogenes + with open(pseudofinder_file) as file_in: + for line in file_in: + if not line.startswith("#"): + col9 = line.strip().split("\t")[8] + attributes_dict = dict( + re.split(r"(? contig_num_limit: logging.info( - "Skipping plot generation for file {} due to a large number of contigs: {}. " - "Plots are only generated for genomes with up to {} annotated contigs.".format( - infile, len(seqid2size), contig_num_limit - ) + f"Skipping plot generation for file {infile} due to a large number of contigs: {len(seqid2size)}. " + f"Plots are only generated for genomes with up to {contig_num_limit} annotated contigs." ) sys.exit() @@ -30,7 +28,7 @@ def main(infile, outfile, prefix, contig_num_limit, contig_trim, mobilome, skip_ circos = Circos(seqid2size, space=1, start=1, end=358) - circos.text("{}\n".format(prefix), size=15, r=30) + circos.text(f"{prefix}\n", size=15, r=30) # Skip printing contig names if the names are too long print_contigs = True @@ -116,7 +114,7 @@ def main(infile, outfile, prefix, contig_num_limit, contig_trim, mobilome, skip_ if "defense_finder_type" in feature.qualifiers: antiphage_track.genomic_features([feature], fc="orchid") - elif feature.type in ["tRNA", "ncRNA"]: + elif feature.type in ["tRNA", "ncRNA", "rRNA"]: rna_track.genomic_features([feature], fc="darkmagenta") elif mobilome and feature.type in [ "mobility_island", @@ -164,7 +162,7 @@ def main(infile, outfile, prefix, contig_num_limit, contig_trim, mobilome, skip_ def remove_escaped_characters(infile): outfile = infile + "_modified" - with open(infile, "r") as file_in: + with open(infile) as file_in: content = file_in.read() modified_content = content.replace("\\=", "") diff --git a/bin/convert_cds_into_multiple_lines.py b/bin/convert_cds_into_multiple_lines.py index e4a8ad80..2a91a902 100755 --- a/bin/convert_cds_into_multiple_lines.py +++ b/bin/convert_cds_into_multiple_lines.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright 2023 EMBL - European Bioinformatics Institute +# Copyright 2023-2024 EMBL - European Bioinformatics Institute # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -79,7 +78,7 @@ def edit_id_add_parent(attributes_dict, id, parent=None): # Read the input GFF file and process the entries output_entries = [] - with open(input_file, "r") as infile: + with open(input_file) as infile: for line in infile: entry = line.strip().split("\t") new_entries = split_cds_to_gene_exon_mrna(entry) diff --git a/bin/identify_kingdom.py b/bin/identify_kingdom.py index df790318..462bac11 100755 --- a/bin/identify_kingdom.py +++ b/bin/identify_kingdom.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # Copyright 2023-2024 EMBL - European Bioinformatics Institute # @@ -29,9 +28,9 @@ def main(taxid, include_kingdom, outfile): kingdom = "Bacteria" if not taxid.isdigit(): - sys.exit("Invalid format of taxid {}".format(taxid)) + sys.exit(f"Invalid format of taxid {taxid}") try: - url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{}".format(taxid) + url = f"https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{taxid}" r = run_request(url) res = r.json() lineage = res.get("lineage", "") @@ -46,20 +45,18 @@ def main(taxid, include_kingdom, outfile): pass else: logging.error( - "Unknown lineage {}. 
Reporting default kingdom instead: Bacteria.".format( - lineage - ) + f"Unknown lineage {lineage}. Reporting default kingdom instead: Bacteria." ) - except: + except: # noqa: E722 logging.error( - "Unable to identify lineage for taxid {}. " - "Reporting default kingdom instead: Bacteria.".format(taxid) + f"Unable to identify lineage for taxid {taxid}. " + "Reporting default kingdom instead: Bacteria." ) - logging.info("Reporting kingdom {} for taxid {}".format(kingdom, taxid)) + logging.info(f"Reporting kingdom {kingdom} for taxid {taxid}") if outfile: if include_kingdom: outfile_path, outfile_name = os.path.split(outfile) - outfile_new_name = "{}_{}".format(kingdom, outfile_name) + outfile_new_name = f"{kingdom}_{outfile_name}" outfile = os.path.join(outfile_path, outfile_new_name) with open(outfile, "w") as file_out: file_out.write(kingdom) @@ -90,7 +87,7 @@ def parse_args(): ) parser.add_argument( "--include-kingdom", - action='store_true', + action="store_true", help="Add the kingdom name at the beginning of the file name.", ) parser.add_argument( diff --git a/bin/make_combined_unifire_output.py b/bin/make_combined_unifire_output.py index f03abc4c..13769859 100755 --- a/bin/make_combined_unifire_output.py +++ b/bin/make_combined_unifire_output.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright 2023 EMBL - European Bioinformatics Institute +# Copyright 2023-2024 EMBL - European Bioinformatics Institute # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -43,15 +42,13 @@ def main(indir, outfile): def load_file(dbname, indir, filename, result): if not os.path.exists(os.path.join(indir, filename)): - logging.error( - "File {} does not exist in folder {}. Aborting.".format(filename, indir) - ) + logging.error(f"File {filename} does not exist in folder {indir}. Aborting.") sys.exit() - with open(os.path.join(indir, filename), "r") as file_in: + with open(os.path.join(indir, filename)) as file_in: for line in file_in: if not line.startswith("Evidence"): prot = line.split("\t")[1] - new_line = "{}\t{}".format(dbname, line) + new_line = f"{dbname}\t{line}" result.setdefault(prot, list()).append(new_line) return result diff --git a/bin/prepare_gff_for_conversion.py b/bin/prepare_gff_for_conversion.py new file mode 100755 index 00000000..588023c6 --- /dev/null +++ b/bin/prepare_gff_for_conversion.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 + +# Copyright 2023-2024 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# + +import argparse +import re + + +def main(infile, outfile): + with open(infile) as file_in, open(outfile, "w") as file_out: + fasta_flag = False + for line in file_in: + if line.startswith("##FASTA"): + fasta_flag = True + file_out.write(line) + elif line.startswith("#"): + file_out.write(line) + elif fasta_flag: + file_out.write(line) + else: + new_line = modify_line(line) + file_out.write(new_line + "\n") + + +def modify_line(line): + columns = line.strip().split("\t") + if columns[2] in ["region", "tRNA"]: + return line.strip() + + # Parse the 9th field into key-value pairs + attributes_dict = dict( + re.split(r"(? 0: + text_to_add = ",".join(formatted_list_to_move) + if "Dbxref" in attributes_dict: + attributes_dict["Dbxref"] = attributes_dict["Dbxref"] + "," + text_to_add + else: + attributes_dict["Dbxref"] = text_to_add + attributes_dict["Dbxref"] = attributes_dict["Dbxref"].rstrip(",").lstrip(",") + return attributes_dict + + +def collect_go_terms(attributes_dict): + # Function collects GO terms from across the 9th column into one list and deduplicates them + go_terms = list() + from_fields = ["Dbxref", "Ontology_term", "uf_ontology_term"] + for field in from_fields: + if field in attributes_dict: + annotations = attributes_dict[field].split(",") + for annotation in annotations[ + : + ]: # Using a copy of the list to avoid iteration issues + if annotation.startswith("GO:"): + go_terms.append(annotation) + annotations.remove(annotation) + if len(annotations) > 0: + modified_annotations = ",".join(annotations) + attributes_dict[field] = modified_annotations + else: + del attributes_dict[field] + # deduplicate + go_terms = list(set(go_terms)) + if len(go_terms) > 0: + if "Dbxref" in attributes_dict: + attributes_dict["Dbxref"] = ( + attributes_dict["Dbxref"] + "," + ",".join(go_terms) + ) + else: + attributes_dict["Dbxref"] = ",".join(go_terms) + attributes_dict["Dbxref"] = attributes_dict["Dbxref"].rstrip(",") + return attributes_dict + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Script processes final GFF to prepare for conversion " + "for ENA/GenBank." + ) + parser.add_argument( + "-i", "--infile", required=True, help="Path to the GFF file to process" + ) + parser.add_argument( + "-o", "--outfile", required=True, help="Path to the output file" + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + main( + args.infile, + args.outfile, + ) diff --git a/bin/prepare_unifire_input.py b/bin/prepare_unifire_input.py index 56b56c5d..470d992f 100755 --- a/bin/prepare_unifire_input.py +++ b/bin/prepare_unifire_input.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright 2023 EMBL - European Bioinformatics Institute +# Copyright 2023-2024 EMBL - European Bioinformatics Institute # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,13 +26,11 @@ def main(infile, taxid, outdir): check_dir(outdir) if not taxid.isdigit(): sys.exit( - "Taxid must consist of digits only. Taxid {} is not valid. Exiting.".format( - taxid - ) + f"Taxid must consist of digits only. Taxid {taxid} is not valid. Exiting."
) outfile = "proteins.fasta" outpath = os.path.join(outdir, outfile) - with open(outpath, "w") as file_out, open(infile, "r") as file_in: + with open(outpath, "w") as file_out, open(infile) as file_in: for line in file_in: if line.startswith(">"): formatted_line = reformat_line(line, taxid) @@ -53,10 +50,10 @@ def check_dir(directory_path): def reformat_line(line, taxid): line = line.lstrip(">").strip() id, description = line.split(maxsplit=1) - description = description.replace("\"", "").replace("'", "").replace("‘", "").replace("’", "") - formatted_line = ">tr|{id}|{description} OX={taxid}\n".format( - id=id, description=description, taxid=taxid + description = ( + description.replace('"', "").replace("'", "").replace("‘", "").replace("’", "") ) + formatted_line = f">tr|{id}|{description} OX={taxid}\n" return formatted_line diff --git a/bin/process_amrfinderplus_results.py b/bin/process_amrfinderplus_results.py index e3a77dbd..2774eda4 100755 --- a/bin/process_amrfinderplus_results.py +++ b/bin/process_amrfinderplus_results.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright 2023 EMBL - European Bioinformatics Institute +# Copyright 2023-2024 EMBL - European Bioinformatics Institute # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,7 +22,7 @@ def main(amr_file, outfile, version): - with open(amr_file, "r") as file_in, open(outfile, "w") as file_out: + with open(amr_file) as file_in, open(outfile, "w") as file_out: writer = csv.writer(file_out, delimiter="\t") writer.writerow(["##gff-version 3"]) for line in file_in: diff --git a/bin/process_crispr_results.py b/bin/process_crispr_results.py index 9485a582..34586135 100755 --- a/bin/process_crispr_results.py +++ b/bin/process_crispr_results.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright 2023 EMBL - European Bioinformatics Institute +# Copyright 2023-2024 EMBL - European Bioinformatics Institute # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -42,7 +41,7 @@ def create_gff( for gff in gffs: filename_base = gff.split("/")[-1].split(".")[0] if filename_base in hits: - with open(gff, "r") as gff_in: + with open(gff) as gff_in: for line in gff_in: if line.startswith("#") or len(line.strip()) == 0: continue @@ -83,7 +82,7 @@ def fix_capitalisation(line): def remove_leader_attribute(line): - """ GFFs produced by CRISPRCasFinder have a "leader" attribute without any value making the GFF invalid""" + """GFFs produced by CRISPRCasFinder have a "leader" attribute without any value making the GFF invalid""" if ";leader;" in line: line = line.replace("leader;", "") return line @@ -107,8 +106,8 @@ def fix_crispr_id(line): elif a.startswith("ID="): id = a.split("=")[1] # swap the values of "name" and "id" - fixed_annot = re.sub("ID={}".format(id), "ID={}".format(name), annot) - fixed_annot = re.sub("Name={}".format(name), "Name={}".format(id), fixed_annot) + fixed_annot = re.sub(f"ID={id}", f"ID={name}", annot) + fixed_annot = re.sub(f"Name={name}", f"Name={id}", fixed_annot) fields[8] = fixed_annot return "\t".join(fields) + "\n" @@ -116,7 +115,7 @@ def fix_crispr_id(line): def add_evidence_level(line, evidence_levels): crispr_id = get_crispr_id(line) try: - line = "{}evidence_level={}\n".format(line.strip(), evidence_levels[crispr_id]) + line = f"{line.strip()}evidence_level={evidence_levels[crispr_id]}\n" except Exception: logging.error(f"Cannot get evidence level for CRISPR {crispr_id}") return line @@ -183,17 +182,15 @@ def fix_gff_line(line, fasta): def fix_annotation(feature_seq, at_percentage, annotation): - annotation = annotation.replace("at%=0;", "at%={};".format(at_percentage)) - annotation = annotation.replace( - "sequence=UNKNOWN", "sequence={}".format(feature_seq) - ) + annotation = annotation.replace("at%=0;", f"at%={at_percentage};") + annotation = annotation.replace("sequence=UNKNOWN", f"sequence={feature_seq}") return annotation def calc_at_percentage(feature_seq): a = feature_seq.upper().count("A") t = feature_seq.upper().count("T") - return str(int((round((a + t) * 100 / len(feature_seq), 0)))) + return str(int(round((a + t) * 100 / len(feature_seq), 0))) def check_end_position(contig, end, seq_records): @@ -211,7 +208,7 @@ def process_tsv(tsv_report, tsv_output): hq_hits = list() evidence_levels = dict() with open(tsv_output, "w") as tsv_out: - with open(tsv_report, "r") as tsv_in: + with open(tsv_report) as tsv_in: for line in tsv_in: # ignore empty lines if len(line.strip()) == 0: @@ -225,7 +222,7 @@ def process_tsv(tsv_report, tsv_output): # add sequence basename to hits hits.append(parts[2]) # make crispr_id that the GFFs use - crispr_id = "{}_{}_{}".format(parts[1], parts[5], parts[6]) + crispr_id = f"{parts[1]}_{parts[5]}_{parts[6]}" # check if evidence level is high (2, 3 or 4) if parts[-1] in ["2", "3", "4"]: # add CRISPR ID (contig_start_end) diff --git a/bin/process_dbcan_result.py b/bin/process_dbcan_result.py index b7ab41a8..85cae776 100755 --- a/bin/process_dbcan_result.py +++ b/bin/process_dbcan_result.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # Copyright 2023-2024 EMBL - European Bioinformatics Institute # @@ -37,7 +36,7 @@ def load_cgcs(input_folder): for line in file_in: if not line.startswith("CGC#"): cgc, _, contig, _, start, end, _, _ = line.strip().split("\t") - cgc_id = "{}_{}".format(contig, cgc) + cgc_id = f"{contig}_{cgc}" if cgc_id in cgc_locations: if cgc_locations[cgc_id]["start"] > int(start): cgc_locations[cgc_id]["start"] = int(start) @@ -62,9 +61,9 @@ def 
print_gff(input_folder, outfile, dbcan_version, substrates, cgc_locations): cgc, gene_type, contig, prot_id, start, end, strand, protein_fam = ( line.strip().split("\t") ) - cgc_id = "{}_{}".format(contig, cgc) + cgc_id = f"{contig}_{cgc}" protein_fam = protein_fam.replace(" ", "") - if not cgc_id in cgcs_printed: + if cgc_id not in cgcs_printed: substrate = ( substrates[cgc_id] if cgc_id in substrates @@ -82,23 +81,13 @@ def print_gff(input_folder, outfile, dbcan_version, substrates, cgc_locations): ) cgcs_printed.append(cgc_id) file_out.write( - "{}\tdbCAN:{}\t{}\t{}\t{}\t.\t{}\t.\tID={};Parent={};protein_family={}\n".format( - contig, - dbcan_version, - gene_type, - start, - end, - strand, - prot_id, - cgc_id, - protein_fam, - ) + f"{contig}\tdbCAN:{dbcan_version}\t{gene_type}\t{start}\t{end}\t.\t{strand}\t.\tID={prot_id};Parent={cgc_id};protein_family={protein_fam}\n" ) def load_substrates(input_folder): substrates = dict() - with open(os.path.join(input_folder, "substrate.out"), "r") as file_in: + with open(os.path.join(input_folder, "substrate.out")) as file_in: for line in file_in: if not line.startswith("#"): parts = line.strip().split("\t") @@ -117,9 +106,7 @@ def load_substrates(input_folder): if not substrate_ecami: substrate_ecami = "N/A" substrates[cgc] = ( - "substrate_dbcan-pul={};substrate_dbcan-sub={}".format( - substrate_pul, substrate_ecami - ) + f"substrate_dbcan-pul={substrate_pul};substrate_dbcan-sub={substrate_ecami}" ) print(substrates) return substrates @@ -129,7 +116,7 @@ def check_folder_completeness(input_folder): status = True for file in ["cgc_standard.out", "overview.txt", "substrate.out"]: if not os.path.exists(os.path.join(input_folder, file)): - logging.error("File {} does not exist.".format(file)) + logging.error(f"File {file} does not exist.") status = False return status diff --git a/bin/process_defensefinder_result.py b/bin/process_defensefinder_result.py index 539bca4c..35ffb410 100755 --- a/bin/process_defensefinder_result.py +++ b/bin/process_defensefinder_result.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright 2023 EMBL - European Bioinformatics Institute +# Copyright 2023-2024 EMBL - European Bioinformatics Institute # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -35,7 +34,7 @@ def main(input_folder, prokka_gff, outfile, df_version): def load_prokka(prokka_gff): prokka_data = dict() - with open(prokka_gff, "r") as file_in: + with open(prokka_gff) as file_in: for line in file_in: if not line.startswith("#"): if line.startswith(">"): @@ -55,7 +54,7 @@ def load_prokka(prokka_gff): def print_systems_to_file(system_path, gene_results, outfile, df_version, prokka_data): - with open(system_path, "r") as file_in, open(outfile, "w") as file_out: + with open(system_path) as file_in, open(outfile, "w") as file_out: writer = csv.writer(file_out, delimiter="\t") writer.writerow(["##gff-version 3"]) for line in file_in: @@ -67,7 +66,7 @@ def print_systems_to_file(system_path, gene_results, outfile, df_version, prokka sys_beg_index, sys_end_index, protein_in_syst_index, - ) = [ + ) = ( line.strip().split().index(field) for field in [ "sys_id", @@ -77,7 +76,7 @@ def print_systems_to_file(system_path, gene_results, outfile, df_version, prokka "sys_end", "protein_in_syst", ] - ] + ) else: cols = line.strip().split("\t") start = prokka_data[cols[sys_beg_index]]["start"] @@ -120,14 +119,14 @@ def print_systems_to_file(system_path, gene_results, outfile, df_version, prokka def load_genes(gene_path): gene_results = dict() - with open(gene_path, "r") as file_in: + with open(gene_path) as file_in: for line in file_in: if line.lower().startswith("replicon"): # get indices of fields - acc_index, gene_index, hit_status = [ + acc_index, gene_index, hit_status = ( line.strip().split().index(field) for field in ["hit_id", "gene_name", "hit_status"] - ] + ) else: fields = line.strip().split() gene_results.setdefault(fields[acc_index], dict()) diff --git a/bin/process_unifire_output.py b/bin/process_unifire_output.py index b4df861f..e09be9c2 100755 --- a/bin/process_unifire_output.py +++ b/bin/process_unifire_output.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright 2023 EMBL - European Bioinformatics Institute +# Copyright 2023-2024 EMBL - European Bioinformatics Institute # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -67,7 +66,7 @@ def combine_and_print(arba_dict, unirule_dict, pirsr_dict, gff, outfile): fasta_flag = False list_of_dicts = [arba_dict, unirule_dict, pirsr_dict] - with open(outfile, "w") as file_out, open(gff, "r") as file_in: + with open(outfile, "w") as file_out, open(gff) as file_in: writer = csv.writer(file_out, delimiter="\t") for line in file_in: if fasta_flag is True: @@ -135,7 +134,7 @@ def escape_reserved_characters(combined_dict): for ch in reserved_characters: if ch in value: changes_flag = True - value = value.replace(ch, "\{}".format(ch)) + value = value.replace(ch, f"\{ch}") if changes_flag: remove_values.append(old_value) add_values.append(value) @@ -165,7 +164,7 @@ def get_id(col9): def load_pirsr(pirsr): results_dict = dict() - with open(pirsr, "r") as file_in: + with open(pirsr) as file_in: for line in file_in: if not line.startswith("Evidence"): if any(keyword in line for keyword in ["keyword", "comment.cofactor"]): @@ -203,7 +202,7 @@ def load_pirsr(pirsr): def load_unirule_arba(file): results_dict = dict() - with open(file, "r") as file_in: + with open(file) as file_in: for line in file_in: if not line.startswith("Evidence"): parts = line.strip().split("\t") diff --git a/conf/base.config b/conf/base.config index f3db5c58..e87751fe 100644 --- a/conf/base.config +++ b/conf/base.config @@ -67,6 +67,10 @@ process { withName: GECCO_RUN { memory = { check_max( 2.GB * task.attempt, 'memory' ) } } + withName: ANTISMASH_GETDB { + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + } withName: SANNTIS { cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 2.GB * task.attempt, 'memory' ) } @@ -91,4 +95,10 @@ process { cpus = { check_max( 8 * task.attempt, 'cpus' ) } memory = { check_max( 10.GB * task.attempt, 'memory' ) } } + withName: PSEUDOFINDER { + cpus = { check_max( 8 * task.attempt, 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + errorStrategy = 'retry' + maxRetries = 3 + } } diff --git a/conf/ebi_codon.config b/conf/ebi_codon.config index 7224d68d..e102c860 100644 --- a/conf/ebi_codon.config +++ b/conf/ebi_codon.config @@ -16,4 +16,6 @@ params { dbcan_db = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/dbcan/4.1.3-V12" bakta_db = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/bakta/2024-01-19" + + pseudofinder_db = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/uniprotkb/2024_06" } diff --git a/conf/embl_hd.config b/conf/embl_hd.config new file mode 100644 index 00000000..040f9a07 --- /dev/null +++ b/conf/embl_hd.config @@ -0,0 +1,24 @@ +apptainer { + enabled = true + autoMounts = true + ociAutoPull = false // Disabled this as it's causing users' home folders to run out of storage + pullTimeout = "3 hours" // The default is 20 minutes and fails with large images + envWhitelist = 'CUDA_VISIBLE_DEVICES' // Allow binding of the GPU visible devices variable into the containers + cacheDir = params.singularity_cachedir ?: System.getenv('SINGULARITY_CACHEDIR') ?: System.getenv('NFX_SINGULARITY_CACHEDIR') +} + +process { + maxRetries = 1 + cache = 'lenient' + afterScript = 'sleep 10' + queue = { (task.time <= 14.d && task.memory < 256.GB && (task.memory.div(task.cpus)) <= 4.GB) ?
"htc-el8" : "bigmem" } + scratch = false +} + +executor { + name = "slurm" + queueSize = 200 + submitRateLimit = "10/1sec" + pollInterval = '10sec' + exitReadTimeout = "5 min" +} diff --git a/conf/modules.config b/conf/modules.config index 918b3d63..bd445502 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -181,7 +181,15 @@ process { ] } - withName: PROKKA { + withName: PSEUDOFINDER_POSTPROCESSING { + publishDir = [ + path: { "${params.outdir}/${meta.prefix}/functional_annotation/pseudofinder" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: "PROKKA_STANDARD" { publishDir = [ path: { "${params.outdir}/${meta.prefix}/functional_annotation/prokka" }, mode: params.publish_dir_mode, diff --git a/conf/test.config b/conf/test.config index 193d5f89..43493b21 100644 --- a/conf/test.config +++ b/conf/test.config @@ -26,4 +26,8 @@ params { antismash_db = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/antismash/7.1.x" dbcan_db = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/dbcan/4.1.3-V12" + + bakta_db = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/bakta/2024-01-19" + + pseudofinder_db = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/uniprotkb/2024_06/uniprot_sprot_2024_06.fasta" } diff --git a/media/mettannotator-schema.png b/media/mettannotator-schema.png index 4f3f9169..3bfae495 100644 Binary files a/media/mettannotator-schema.png and b/media/mettannotator-schema.png differ diff --git a/modules/local/amrfinder_plus.nf b/modules/local/amrfinder_plus.nf index 7d8d1832..ca61d7ce 100644 --- a/modules/local/amrfinder_plus.nf +++ b/modules/local/amrfinder_plus.nf @@ -14,6 +14,10 @@ process AMRFINDER_PLUS { script: """ + # this is needed as some environments are very picky with the TMP dir + export TMPDIR="\$PWD/tmp" + mkdir -p "\$PWD/tmp" + amrfinder --plus \ -n ${fna} \ -p ${faa} \ diff --git a/modules/local/amrfinder_plus_getdb.nf b/modules/local/amrfinder_plus_getdb.nf index 42bc9712..88fe885f 100644 --- a/modules/local/amrfinder_plus_getdb.nf +++ b/modules/local/amrfinder_plus_getdb.nf @@ -3,7 +3,12 @@ process AMRFINDER_PLUS_GETDB { tag "AMR Finder DB 2024-01-31.1" - container 'quay.io/biocontainers/ncbi-amrfinderplus:3.12.8--h283d18e_0' + container "${ workflow.containerEngine in ['singularity', 'apptainer'] ? + 'https://depot.galaxyproject.org/singularity/ncbi-amrfinderplus:3.12.8--h283d18e_0' : + 'biocontainers/ncbi-amrfinderplus:3.12.8--h283d18e_0' }" + + // amrfinder_index won't work if singularity mounts the workdir + scratch false publishDir "$params.dbs", mode: 'copy' @@ -12,6 +17,9 @@ process AMRFINDER_PLUS_GETDB { script: """ + export TMPDIR="\$PWD/tmp" + mkdir -p "\$PWD/tmp" + wget -r -nH --cut-dirs=5 \\ ftp://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/3.12/2024-01-31.1/ diff --git a/modules/local/annotate_gff.nf b/modules/local/annotate_gff.nf index ead2aba7..67728234 100644 --- a/modules/local/annotate_gff.nf +++ b/modules/local/annotate_gff.nf @@ -4,7 +4,9 @@ process ANNOTATE_GFF { label 'process_nano' - container 'quay.io/microbiome-informatics/genomes-pipeline.python3base:v1.1' + container "${ workflow.containerEngine in ['singularity', 'apptainer'] ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.9' : + 'biocontainers/python:3.9' }" input: tuple val(meta), @@ -18,6 +20,7 @@ process ANNOTATE_GFF { file(gecco_gff), file(dbcan_gff), file(df_gff), + file(pseudofinder_gff), file(ips_annotations_tsv), // empty in fast mode file(sanntis_annotations_gff), // empty in fast mode file(arba), // empty in fast mode @@ -28,6 +31,8 @@ process ANNOTATE_GFF { output: tuple val(meta), path("*_annotations.gff"), emit: annotated_gff tuple val(meta), path("*_annotations_with_descriptions.gff"), optional: true, emit: annotated_desc_gff + tuple val(meta), path("*_submission.gff"), emit: submission_gff + tuple val(meta), path("*_pseudogene_report.txt"), emit: pseudogene_report path "versions.yml", emit: versions script: @@ -37,6 +42,7 @@ process ANNOTATE_GFF { def gecco_flag = ""; def dbcan_flag = ""; def df_flag = ""; + def pseudofinder_flag = ""; def ips_flag = ""; def hypothetical_ips_flags = ""; def sanntis_flag = ""; @@ -58,6 +64,9 @@ process ANNOTATE_GFF { if ( df_gff ) { df_flag = "--defense-finder ${df_gff}" } + if ( pseudofinder_gff ) { + pseudofinder_flag = "--pseudofinder ${pseudofinder_gff} --pseudogene-report ${meta.prefix}_pseudogene_report.txt" + } if ( ips_annotations_tsv ) { ips_flag = "-i ${ips_annotations_tsv}" hypothetical_ips_flags = [ @@ -80,7 +89,7 @@ process ANNOTATE_GFF { -r ${ncrna_tsv} \ -t ${trna_gff} \ -o ${meta.prefix}_temp.gff \ - ${crisprcas_flag} ${sanntis_flag} ${amrfinder_flag} \ + ${crisprcas_flag} ${sanntis_flag} ${amrfinder_flag} ${pseudofinder_flag}\ ${antismash_flag} ${gecco_flag} ${dbcan_flag} ${df_flag} ${ips_flag} if [ "${params.fast}" == "false" ]; then @@ -97,6 +106,10 @@ process ANNOTATE_GFF { -i ${hypothetical_tmp_gff} \\ -o ${meta.prefix}_annotations.gff ${hypothetical_ips_flags} + prepare_gff_for_conversion.py \\ + -i ${meta.prefix}_annotations.gff \\ + -o ${meta.prefix}_submission.gff + if [ "${params.fast}" == "false" ]; then add_interpro_descriptions.py \\ --ipr-entries ${interpro_entry_list}/entry.list \\ @@ -113,6 +126,8 @@ process ANNOTATE_GFF { stub: """ touch ${meta.prefix}_annotations.gff + touch ${meta.prefix}_submission.gff + touch ${meta.prefix}_pseudogene_report.txt if [ "${params.fast}" == "false" ]; then touch ${meta.prefix}_annotations_with_descriptions.gff fi diff --git a/modules/local/bakta_getdb.nf b/modules/local/bakta_getdb.nf index bc219d4b..09f34e1c 100644 --- a/modules/local/bakta_getdb.nf +++ b/modules/local/bakta_getdb.nf @@ -3,7 +3,9 @@ process BAKTA_GETDB { tag "BAKTA DB 2024-01-19" - container 'quay.io/biocontainers/ncbi-amrfinderplus:3.12.8--h283d18e_0' + container "${ workflow.containerEngine in ['singularity', 'apptainer'] ? 
+ 'https://depot.galaxyproject.org/singularity/ncbi-amrfinderplus:3.12.8--h283d18e_0' : + 'biocontainers/ncbi-amrfinderplus:3.12.8--h283d18e_0' }" publishDir "$params.dbs", mode: 'copy' @@ -13,11 +15,14 @@ process BAKTA_GETDB { script: """ wget https://zenodo.org/record/10522951/files/db.tar.gz + tar -xzf db.tar.gz rm db.tar.gz mv db bakta + wget -r -nH --cut-dirs=5 \\ ftp://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/3.12/2024-01-31.1/ + rm -r bakta/amrfinderplus-db/* mv 2024-01-31.1 bakta/amrfinderplus-db/ diff --git a/modules/local/crisprcasfinder.nf b/modules/local/crisprcasfinder.nf index 1f633dde..1174575b 100644 --- a/modules/local/crisprcasfinder.nf +++ b/modules/local/crisprcasfinder.nf @@ -4,7 +4,7 @@ process CRISPRCAS_FINDER { label 'process_nano' - container 'quay.io/microbiome-informatics/genomes-pipeline.crisprcasfinder:4.3.2' + container 'quay.io/microbiome-informatics/genomes-pipeline.crisprcasfinder:4.3.2patchedv5' input: tuple val(meta), path(fasta) diff --git a/modules/local/dbcan.nf b/modules/local/dbcan.nf index ee303145..87976458 100644 --- a/modules/local/dbcan.nf +++ b/modules/local/dbcan.nf @@ -2,7 +2,9 @@ process DBCAN { tag "${meta.prefix}" - container 'quay.io/biocontainers/dbcan:4.1.2--pyhdfd78af_0' + container "${ workflow.containerEngine in ['singularity', 'apptainer'] ? + 'https://depot.galaxyproject.org/singularity/dbcan:4.1.2--pyhdfd78af_0' : + 'biocontainers/dbcan:4.1.2--pyhdfd78af_0' }" input: tuple val(meta), path(faa), path(gff) diff --git a/modules/local/dbcan_getdb.nf b/modules/local/dbcan_getdb.nf index 7690232e..3bf7de33 100644 --- a/modules/local/dbcan_getdb.nf +++ b/modules/local/dbcan_getdb.nf @@ -2,7 +2,9 @@ process DBCAN_GETDB { tag "DBCan 4.1.3_V12" - container 'quay.io/biocontainers/gnu-wget:1.18--h36e9172_9' + container "${ workflow.containerEngine in ['singularity', 'apptainer'] ? + 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h36e9172_9' : + 'biocontainers/gnu-wget:1.18--h36e9172_9' }" publishDir "${params.dbs}", mode: 'copy' diff --git a/modules/local/defense_finder.nf b/modules/local/defense_finder.nf index 3f9f27bb..f7e7363c 100644 --- a/modules/local/defense_finder.nf +++ b/modules/local/defense_finder.nf @@ -2,7 +2,9 @@ process DEFENSE_FINDER { tag "${meta.prefix}" - container 'biocontainers/defense-finder:1.2.0--pyhdfd78af_0' + container "${ workflow.containerEngine in ['singularity', 'apptainer'] ? + 'https://depot.galaxyproject.org/singularity/defense-finder:1.2.0--pyhdfd78af_0' : + 'biocontainers/defense-finder:1.2.0--pyhdfd78af_0' }" input: tuple val(meta), path(faa), path(prokka_gff) diff --git a/modules/local/defense_finder_getdb.nf b/modules/local/defense_finder_getdb.nf index 1af0e5cd..73df9fd9 100644 --- a/modules/local/defense_finder_getdb.nf +++ b/modules/local/defense_finder_getdb.nf @@ -3,7 +3,9 @@ process DEFENSE_FINDER_GETDB { tag "Defense Finder Models 1.2.3" - container 'quay.io/biocontainers/gnu-wget:1.18--h36e9172_9' + container "${ workflow.containerEngine in ['singularity', 'apptainer'] ? 
+ 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h36e9172_9' : + 'biocontainers/gnu-wget:1.18--h36e9172_9' }" publishDir "$params.dbs", mode: 'copy' diff --git a/modules/local/detect_ncrna.nf b/modules/local/detect_ncrna.nf index adb3199b..edca4895 100644 --- a/modules/local/detect_ncrna.nf +++ b/modules/local/detect_ncrna.nf @@ -1,6 +1,8 @@ process DETECT_NCRNA { + tag "${meta.prefix}" + container 'quay.io/microbiome-informatics/genomes-pipeline.detect_rrna:v3.2' input: diff --git a/modules/local/detect_trna.nf b/modules/local/detect_trna.nf index 9303ce19..8e233b9d 100644 --- a/modules/local/detect_trna.nf +++ b/modules/local/detect_trna.nf @@ -25,18 +25,19 @@ process DETECT_TRNA { # tRNAscan-SE needs a tmp folder otherwise it will use the base TMPDIR (with no subfolder) # and that causes issues as other detect_trna process will crash when the files are cleaned - PROCESSTMP="\$(mktemp -d)" + PROCESSTMP="\$PWD/tmp" + mkdir -p "\$PWD/tmp" export TMPDIR="\${PROCESSTMP}" - # bash trap to clean the tmp directory - trap 'rm -r -- "\${PROCESSTMP}"' EXIT echo "[ Detecting tRNAs ]" tRNAscan-SE -${detected_kingdom} -Q \ -m ${meta.prefix}_stats.out \ -o ${meta.prefix}_trna.out \ - --gff ${meta.prefix}_trna.gff \ + --gff ${meta.prefix}_trna_temp.gff \ ${fasta} + add_locus_tag_to_trna.py -i ${meta.prefix}_trna_temp.gff -o ${meta.prefix}_trna.gff + parse_tRNA.py -i ${meta.prefix}_stats.out -o ${meta.prefix}_tRNA_20aa.out echo "Completed" diff --git a/modules/local/eggnog.nf b/modules/local/eggnog.nf index 04fee25b..c8418bce 100644 --- a/modules/local/eggnog.nf +++ b/modules/local/eggnog.nf @@ -52,7 +52,8 @@ process EGGNOG_MAPPER { --cpu ${task.cpus} \ --tax_scope 'prokaryota_broad' \ --annotate_hits_table ${annotation_hit_table} ${db_mem_flag} \ - -o ${meta.prefix} + -o ${meta.prefix} \ + --override cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/eggnog_getdb.nf b/modules/local/eggnog_getdb.nf index 3c72ef6c..be7a39a9 100644 --- a/modules/local/eggnog_getdb.nf +++ b/modules/local/eggnog_getdb.nf @@ -2,7 +2,9 @@ process EGGNOG_MAPPER_GETDB { tag "EGGNOG Mapper DB 5.0.2" - container "quay.io/biocontainers/gnu-wget:1.18--h36e9172_9" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] ? + 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h36e9172_9' : + 'biocontainers/gnu-wget:1.18--h36e9172_9' }" publishDir "${params.dbs}", mode: "copy" diff --git a/modules/local/interpro_list_getdb.nf b/modules/local/interpro_list_getdb.nf index d8d25992..6b5a56a5 100644 --- a/modules/local/interpro_list_getdb.nf +++ b/modules/local/interpro_list_getdb.nf @@ -2,7 +2,9 @@ process INTEPRO_ENTRY_LIST_GETDB { tag "InterPro Entry List 94.0" - container 'quay.io/biocontainers/gnu-wget:1.18--h36e9172_9' + container "${ workflow.containerEngine in ['singularity', 'apptainer'] ? 
+ 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h36e9172_9' : + 'biocontainers/gnu-wget:1.18--h36e9172_9' }" publishDir "${params.dbs}", mode: 'copy' diff --git a/modules/local/interproscan.nf b/modules/local/interproscan.nf index 7795eec8..6768f81b 100644 --- a/modules/local/interproscan.nf +++ b/modules/local/interproscan.nf @@ -8,7 +8,7 @@ process INTERPROSCAN { container 'quay.io/microbiome-informatics/genomes-pipeline.ips:5.62-94.0' containerOptions { - if (workflow.containerEngine == 'singularity') { + if (workflow.containerEngine in ['singularity', 'apptainer']) { return "--bind ${interproscan_db}/data:/opt/interproscan-5.62-94.0/data" } else { return "-v ./${interproscan_db}/data:/opt/interproscan-5.62-94.0/data" diff --git a/modules/local/interproscan_getdb.nf b/modules/local/interproscan_getdb.nf index 895ea6c1..08032200 100644 --- a/modules/local/interproscan_getdb.nf +++ b/modules/local/interproscan_getdb.nf @@ -2,7 +2,9 @@ process INTEPROSCAN_GETDB { tag "IPRS Scan 5.62-94.0" - container 'quay.io/biocontainers/gnu-wget:1.18--h36e9172_9' + container "${ workflow.containerEngine in ['singularity', 'apptainer'] ? + 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h36e9172_9' : + 'biocontainers/gnu-wget:1.18--h36e9172_9' }" publishDir "${params.dbs}", mode: 'copy' diff --git a/modules/local/lookup_kingdom.nf b/modules/local/lookup_kingdom.nf index 4648331e..6e2a7972 100644 --- a/modules/local/lookup_kingdom.nf +++ b/modules/local/lookup_kingdom.nf @@ -4,14 +4,16 @@ process LOOKUP_KINGDOM { label 'process_nano' - container 'quay.io/microbiome-informatics/genomes-pipeline.python3base:v1.1' + container "${ workflow.containerEngine in ['singularity', 'apptainer'] ? + 'oras://community.wave.seqera.io/library/pip_requests_retry:4b6e4901d175ea72' : + 'community.wave.seqera.io/library/pip_requests_retry:d1a6734506332b90' }" input: tuple val(meta), path(fasta) output: tuple val(meta), env(detected_kingdom), emit: detected_kingdom - path "versions.yml", emit: versions + path "versions.yml", emit: versions script: """ diff --git a/modules/local/prokka.nf b/modules/local/prokka.nf index 8f9b1f09..2825a00e 100644 --- a/modules/local/prokka.nf +++ b/modules/local/prokka.nf @@ -2,10 +2,13 @@ process PROKKA { tag "${meta.prefix}" - container "quay.io/biocontainers/prokka:1.14.6--pl526_0" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] ? 
+        'https://depot.galaxyproject.org/singularity/prokka:1.14.6--pl526_0' :
+        'biocontainers/prokka:1.14.6--pl526_0' }"

     input:
     tuple val(meta), path(fasta), val(detected_kingdom)
+    val(mode) // standard or compliant

     output:
     tuple val(meta), file("${meta.prefix}_prokka/${meta.prefix}.gff"), emit: gff
@@ -17,7 +20,20 @@
     path "versions.yml" , emit: versions

     script:
+    def compliant_flag = ""
+    def rfam_flag = ""
+    if (mode == "compliant") {
+        compliant_flag = "--compliant"
+        rfam_flag = "--rfam"
+    }
     """
+    # TMP folder issues in Prokka - https://github.com/tseemann/prokka/issues/402
+    export TMPDIR="\$PWD/tmp"
+    mkdir -p "\$PWD/tmp"
+
+    # Disable the Java VM performance-data gathering tool, for improved performance
+    export JAVA_TOOL_OPTIONS="-XX:-UsePerfData"
+
     cat ${fasta} | tr '-' ' ' > ${meta.prefix}_cleaned.fasta

     prokka ${meta.prefix}_cleaned.fasta \
@@ -26,7 +42,9 @@
         --outdir ${meta.prefix}_prokka \
         --prefix ${meta.prefix} \
         --force \
-        --locustag ${meta.prefix}
+        --locustag ${meta.prefix} \
+        ${compliant_flag} \
+        ${rfam_flag}

     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/local/pseudofinder.nf b/modules/local/pseudofinder.nf
new file mode 100644
index 00000000..02f2e2e6
--- /dev/null
+++ b/modules/local/pseudofinder.nf
@@ -0,0 +1,60 @@
+process PSEUDOFINDER {
+
+    tag "${meta.prefix}"
+
+    container 'quay.io/microbiome-informatics/pseudofinder:1.1.0'
+
+    input:
+    tuple val(meta), path(compliant_gbk)
+    tuple path(pseudofinder_db), val(db_version)
+
+    output:
+    tuple val(meta), file("${meta.prefix}_pseudos.gff"), emit: pseudofinder_gff
+    path "versions.yml" , emit: versions
+
+    script:
+    """
+    pseudofinder.py annotate \
+        -g ${compliant_gbk} \
+        -db ${pseudofinder_db}/uniprot_sprot.fasta \
+        -op ${meta.prefix} \
+        -t ${task.cpus} \
+        --diamond
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        Pseudofinder: 1.1.0
+        Swiss-Prot: $db_version
+    END_VERSIONS
+    """
+}
+
+process PSEUDOFINDER_POSTPROCESSING {
+
+    tag "${meta.prefix}"
+
+    label 'process_nano'
+
+    container 'quay.io/microbiome-informatics/genomes-pipeline.python3base:v1.1'
+
+    input:
+    tuple val(meta), path(annotations_gff), path(compliant_gff, stageAs: "compliant/*"), path(pseudofinder_gff)
+
+    output:
+    tuple val(meta), file("${meta.prefix}_processed_pseudogenes.gff"), emit: pseudofinder_processed_gff
+    path "versions.yml" , emit: versions
+
+    script:
+    """
+    adjust_pseudofinder_output.py \
+        --pseudofinder-output ${pseudofinder_gff} \
+        --standard-gff ${annotations_gff} \
+        --compliant-gff ${compliant_gff} \
+        -o ${meta.prefix}_processed_pseudogenes.gff
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version 2>&1 | sed 's/Python //g')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/pseudofinder_getdb.nf b/modules/local/pseudofinder_getdb.nf
new file mode 100644
index 00000000..5544102d
--- /dev/null
+++ b/modules/local/pseudofinder_getdb.nf
@@ -0,0 +1,24 @@
+process PSEUDOFINDER_GETDB {
+
+    tag "Pseudofinder DB 2024_06"
+
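+    // Pull the pre-built Singularity image from the Galaxy depot when running under
+    // Singularity or Apptainer, and fall back to the Docker biocontainer otherwise.
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] ?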
+ 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h36e9172_9' : + 'biocontainers/gnu-wget:1.18--h36e9172_9' }" + + publishDir "${params.dbs}", mode: "copy" + + output: + tuple path("uniprot_sprot"), env("VERSION"), emit: pseudofinder_db + + script: + """ + wget ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/tool-dbs/pseudofinder/uniprot_sprot_2024_06.tar.gz + + mkdir -p uniprot_sprot + + tar -zxvf uniprot_sprot_2024_06.tar.gz -C uniprot_sprot + + VERSION=\$(cat uniprot_sprot/VERSION.txt) + """ +} diff --git a/modules/local/rfam_getmodels.nf b/modules/local/rfam_getmodels.nf index 1bfbd80f..6fbedb55 100644 --- a/modules/local/rfam_getmodels.nf +++ b/modules/local/rfam_getmodels.nf @@ -3,7 +3,9 @@ process RFAM_GETMODELS { tag "Rfam models - release 14.9" - container "quay.io/biocontainers/gnu-wget:1.18--h36e9172_9" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] ? + 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h36e9172_9' : + 'biocontainers/gnu-wget:1.18--h36e9172_9' }" publishDir "$params.dbs/rfam_models", mode: 'copy' diff --git a/modules/local/unifire.nf b/modules/local/unifire.nf index cacb401f..7c8d2313 100644 --- a/modules/local/unifire.nf +++ b/modules/local/unifire.nf @@ -7,7 +7,7 @@ process UNIFIRE { container "dockerhub.ebi.ac.uk/uniprot-public/unifire:2023.4" containerOptions { - if (workflow.containerEngine == 'singularity') { + if (workflow.containerEngine in ['singularity', 'apptainer']) { return "--bind unifire:/volume" } else { return "-v ./unifire:/volume" diff --git a/modules/nf-core/bakta/bakta/bakta-bakta.diff b/modules/nf-core/bakta/bakta/bakta-bakta.diff index 0d2045fa..80947870 100644 --- a/modules/nf-core/bakta/bakta/bakta-bakta.diff +++ b/modules/nf-core/bakta/bakta/bakta-bakta.diff @@ -15,11 +15,11 @@ Changes in module 'nf-core/bakta/bakta' - tag "$meta.id" + tag "$meta.prefix" label 'process_medium' - + conda "${moduleDir}/environment.yml" @@ -8,22 +8,22 @@ 'biocontainers/bakta:1.9.3--pyhdfd78af_0' }" - + input: - tuple val(meta), path(fasta) - path db @@ -27,7 +27,7 @@ Changes in module 'nf-core/bakta/bakta' - path prodigal_tf + tuple val(meta), path(fasta), path(detected_kingdom) + tuple path(db), val(db_version) - + output: tuple val(meta), path("${prefix}.embl") , emit: embl tuple val(meta), path("${prefix}.faa") , emit: faa @@ -43,10 +43,10 @@ Changes in module 'nf-core/bakta/bakta' + tuple val(meta), path("${prefix}.svg") , emit: svg + tuple val(meta), path("${prefix}.png") , emit: png path "versions.yml" , emit: versions - + when: @@ -31,18 +31,22 @@ - + script: def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" @@ -71,7 +71,7 @@ Changes in module 'nf-core/bakta/bakta' + --skip-ncrna \\ + --skip-ncrna-region \\ + --skip-crispr - + cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/bakta/bakta/tests/main.nf.test b/modules/nf-core/bakta/bakta/tests/main.nf.test index bdceb16e..9ee7ed8c 100644 --- a/modules/nf-core/bakta/bakta/tests/main.nf.test +++ b/modules/nf-core/bakta/bakta/tests/main.nf.test @@ -25,7 +25,7 @@ nextflow_process { when { process { - """ + """ input[0] = [ [ id:'test', single_end:false ], // meta map file(params.test_data['bacteroides_fragilis']['illumina']['test1_contigs_fa_gz'], checkIfExists: true) @@ -62,7 +62,7 @@ nextflow_process { when { process { - """ + """ input[0] = [[id: 'stub'],file('stub')] input[1] = [] input[2] = [] diff --git a/modules/nf-core/bakta/bakta/tests/main.nf.test.snap 
b/modules/nf-core/bakta/bakta/tests/main.nf.test.snap index 40e30c36..4b4c50a7 100644 --- a/modules/nf-core/bakta/bakta/tests/main.nf.test.snap +++ b/modules/nf-core/bakta/bakta/tests/main.nf.test.snap @@ -188,4 +188,4 @@ }, "timestamp": "2024-03-14T09:11:15.532858932" } -} \ No newline at end of file +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index da033408..7ee62bfe 100755 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -4,10 +4,11 @@ """Provide functions to merge multiple versions.yml files.""" -import yaml import platform from textwrap import dedent +import yaml + def _make_versions_html(versions): """Generate a tabular HTML output of all versions for MultiQC.""" @@ -58,7 +59,9 @@ def main(): } with open("$versions") as f: - versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + versions_by_process = ( + yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + ) # aggregate versions by the module name (derived from fully-qualified process name) versions_by_module = {} diff --git a/modules/nf-core/gecco/run/gecco-run.diff b/modules/nf-core/gecco/run/gecco-run.diff index 68bb9873..ec491791 100644 --- a/modules/nf-core/gecco/run/gecco-run.diff +++ b/modules/nf-core/gecco/run/gecco-run.diff @@ -2,8 +2,8 @@ Changes in module 'nf-core/gecco/run' --- modules/nf-core/gecco/run/main.nf +++ modules/nf-core/gecco/run/main.nf @@ -13,12 +13,12 @@ - - + + output: - tuple val(meta), path("*.genes.tsv") , optional: true, emit: genes - tuple val(meta), path("*.features.tsv") , emit: features @@ -17,12 +17,12 @@ Changes in module 'nf-core/gecco/run' + tuple val(meta), path("*clusters*.gff"), optional: true, emit: gff + tuple val(meta), path("*.json"), optional: true, emit: json path "versions.yml" , emit: versions - + when: @@ -39,6 +39,8 @@ $custom_model \\ $custom_hmm - + + gecco convert clusters -i ./ --format gff + cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/quast/quast.diff b/modules/nf-core/quast/quast.diff index 999ae392..705c64e8 100644 --- a/modules/nf-core/quast/quast.diff +++ b/modules/nf-core/quast/quast.diff @@ -7,19 +7,19 @@ Changes in module 'nf-core/quast' - label 'process_medium' + tag "$meta.prefix" + label 'process_single' - + conda "bioconda::quast=5.2.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
 @@ -9,7 +9,6 @@
-
+
  input:
  tuple val(meta) , path(consensus)
 - tuple val(meta2), path(fasta)
  tuple val(meta3), path(gff)
-
+
  output:
 @@ -25,13 +24,12 @@
-
+
  script:
  def args = task.ext.args ?: ''
 - prefix = task.ext.prefix ?: "${meta.id}"
@@ -35,7 +35,7 @@ Changes in module 'nf-core/quast'
      --threads $task.cpus \\
      $args \\
 @@ -50,10 +48,9 @@
-
+
  stub:
  def args = task.ext.args ?: ''
 - prefix = task.ext.prefix ?: "${meta.id}"
diff --git a/nextflow.config b/nextflow.config
index 14cf6346..9b6ba819 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -45,8 +45,11 @@ params {
     bakta_db = null
     bakta_db_version = "2024-01-19"
+    pseudofinder_db = null
+    pseudofinder_db_version = "2024_06"
+
     // Tool options
-    bakta = false
+    bakta = false

     // MultiQC options
     multiqc_config = null
@@ -72,6 +75,9 @@ params {
     max_cpus = 16
     max_time = '240.h'

+    // Each profile is responsible for using this setting
+    singularity_cachedir = ''
+
     // Schema validation default options
     validationFailUnrecognisedParams = false
     validationLenientMode = false
@@ -173,7 +179,7 @@ profiles {
         executor.cpus = 16
         executor.memory = 60.GB
     }
-    test { includeConfig 'conf/test.config' }
+    test { includeConfig 'conf/test.config' }

     local {
         params {
@@ -228,6 +234,11 @@ profiles {
             pollInterval = "10 sec"
         }
     }
+
+    // EMBL Heidelberg
+    embl_hd {
+        includeConfig 'conf/embl_hd.config'
+    }
 }

 // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile
@@ -282,7 +293,7 @@ manifest {
     description = """ME TT assembly annotation pipeline"""
     mainScript = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version = '1.3'
+    version = '1.4.0'
     doi = 'https://doi.org/10.1101/2024.07.11.603040'
 }
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 54f821d6..16e27e54 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -1,5 +1,5 @@
 {
-    "$schema": "http://json-schema.org/draft-07/schema",
+    "$schema": "https://json-schema.org/draft-07/schema",
     "$id": "https://raw.githubusercontent.com/ebi-metagenomics/mettannotator/master/nextflow_schema.json",
     "title": "ebi-metagenomics/mettannotator pipeline parameters",
     "description": "ME TT assembly annotation pipeline",
@@ -36,8 +36,8 @@
         },
         "bakta": {
             "type": "boolean",
-            "default": false,
-            "description": "Use Bakta instead of Prokka for CDS annotation. Prokka will still be used for archaeal genomes."
+            "description": "Use Bakta instead of Prokka for CDS annotation. Prokka will still be used for archaeal genomes.",
+            "fa_icon": "fas fa-wrench"
         },
         "email": {
             "type": "string",
@@ -164,6 +164,17 @@
             "type": "string",
             "default": "2024-01-19",
             "description": "The Bakta reference database version."
+        },
+        "pseudofinder_db": {
+            "type": "string",
+            "format": "directory-path",
+            "description": "Pseudofinder reference database. Mettannotator uses Swiss-Prot as the database for Pseudofinder.",
+            "help_text": "Set this variable to the path of the folder containing the uniprot_sprot.fasta file to be used as the database."
+        },
+        "pseudofinder_db_version": {
+            "type": "string",
+            "default": "2024_06",
+            "description": "Swiss-Prot version."
} }, "fa_icon": "fas fa-database" @@ -310,6 +321,11 @@ "type": "string", "description": "Custom MultiQC yaml file containing HTML including a methods description.", "fa_icon": "fas fa-cog" + }, + "singularity_cachedir": { + "type": "string", + "fa_icon": "fab fa-docker", + "description": "The singularity/apptainer cache directory" } } } diff --git a/postprocessing/add_mobilome_to_gff.py b/postprocessing/add_mobilome_to_gff.py index aaa6ecec..cd706f8c 100644 --- a/postprocessing/add_mobilome_to_gff.py +++ b/postprocessing/add_mobilome_to_gff.py @@ -1,5 +1,19 @@ #!/usr/bin/env python3 +# Copyright 2023-2024 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import argparse import re @@ -10,7 +24,7 @@ def main(mobilome_file, infile, outfile): list() ) # record which mobilome lines have already been printed to the output file fasta_flag = False - with open(infile, "r") as file_in, open(outfile, "w") as file_out: + with open(infile) as file_in, open(outfile, "w") as file_out: previous_contig = "" for line in file_in: if line.startswith("#"): @@ -107,9 +121,7 @@ def sanity_check(mobilome_dict, printed_list): mobilome_count = sum(len(inner_dict) for inner_dict in mobilome_dict.values()) assert ( printed_list_length == mobilome_count - ), "The number of mobilome entries added to the GFF does not match the expected count: added {}, expected{}".format( - printed_list_length, mobilome_count - ) + ), f"The number of mobilome entries added to the GFF does not match the expected count: added {printed_list_length}, expected{mobilome_count}" def look_for_lines_to_print(mobilome_dict, contig, start, printed_list): @@ -154,7 +166,7 @@ def check_overlap(dictionary, sequence, start, end): def load_mobilome(infile): mobilome_dict = dict() - with open(infile, "r") as file_in: + with open(infile) as file_in: for line in file_in: if line.startswith("#"): if line.startswith("##FASTA"): diff --git a/postprocessing/duplicate_resolution.py b/postprocessing/duplicate_resolution.py index d247fbac..baf0422a 100755 --- a/postprocessing/duplicate_resolution.py +++ b/postprocessing/duplicate_resolution.py @@ -1,5 +1,19 @@ #!/usr/bin/env python3 +# Copyright 2023-2024 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + import argparse import logging import re @@ -27,14 +41,14 @@ def main(reference, target, outfile, species): replacements = dict() # gene names to change replacements_ids = dict() # changes to make if alias is None reverse = list() # undo some planned changes (remove these from replacements) - stats_out = open("{}_stats.txt".format(species), "w") + stats_out = open(f"{species}_stats.txt", "w") stats_out.write("Gene\tCopy number\tReplaced\tWhy not replaced/ Comment\n") for base in dedupl_dict: if ( len(dedupl_dict[base]) > 1 ): # there are multiple gene copies, try to deduplicate printed_stat = "" - printed_stat += "{}\t{}\t".format(base, str(len(dedupl_dict[base]))) + printed_stat += f"{base}\t{str(len(dedupl_dict[base]))}\t" counter += 1 unknown_counter = 0 decision_dict = dict() @@ -55,7 +69,7 @@ def main(reference, target, outfile, species): # Decision dictionary has the following format (2 examples here): # {'fabHA': ['BACUNI_03004', 'BACUNI_04476']} # {'der': ['BACUNI_03006'], 'feoB': ['BACUNI_04461'], 'hydF': ['BACUNI_02997']}} - logging.debug("Decision dictionary: {}".format(decision_dict)) + logging.debug(f"Decision dictionary: {decision_dict}") if len(decision_dict) == 0: # unable to resolve duplicates, leave records as they are stats_dict["unknowns_only"] = stats_dict.get("unknowns_only", 0) + 1 @@ -78,18 +92,14 @@ def main(reference, target, outfile, species): else: if alias in replacements: sys.exit( - "Error: something went wrong, alias {} is already in replacements".format( - alias - ) + f"Error: something went wrong, alias {alias} is already in replacements" ) replacements[alias] = base stats_dict["replaced"] = stats_dict.get("replaced", 0) + 1 replace = True printed_stat += "Yes\t\n" else: - logging.debug( - "Case is not clear {} {}".format(dedupl_dict[base], decision_dict) - ) + logging.debug(f"Case is not clear {dedupl_dict[base]} {decision_dict}") unique = check_value_uniqueness( decision_dict ) # check that the alias doesn't repeat @@ -140,10 +150,10 @@ def main(reference, target, outfile, species): if alias not in replacements: replacements[alias] = gene_name else: - ("Alias {} is already in replacements".format(alias)) + (f"Alias {alias} is already in replacements") sys.exit() - logging.debug("Replaced one for one {}".format(decision_dict)) + logging.debug(f"Replaced one for one {decision_dict}") stats_dict["replaced"] = stats_dict.get("replaced", 0) + 1 replace = True printed_stat += "Yes\t\n" @@ -192,14 +202,12 @@ def main(reference, target, outfile, species): ) if made_replacements != (len(replacements) + len(replacements_ids)): sys.exit( - "Made {} replacements but expected {}".format( - str(made_replacements), str(len(replacements)) - ) + f"Made {str(made_replacements)} replacements but expected {str(len(replacements))}" ) - logging.info("Total number of groups: {}".format(counter)) - logging.info("Replacements: {}".format(replacements)) - logging.info("Made replacements: {}".format(made_replacements)) - logging.info("Reverse {}".format(reverse)) + logging.info(f"Total number of groups: {counter}") + logging.info(f"Replacements: {replacements}") + logging.info(f"Made replacements: {made_replacements}") + logging.info(f"Reverse {reverse}") stats_out.close() return stats_dict @@ -234,7 +242,7 @@ def try_to_remove_more_underscores( "remaining after other replacements; no stable ID was assigned to this gene\n", ).format(key) return replacements, printed_stat, replacements_ids - if not deduplication_section[key]["alias"] in bacunis_already_replaced: + if 
deduplication_section[key]["alias"] not in bacunis_already_replaced: replacements[deduplication_section[key]["alias"]] = key.split("_")[ 0 ] @@ -249,8 +257,8 @@ def try_to_remove_more_underscores( def make_replacement_file(target, outfile, replacements, replacements_ids, species): seq_flag = False count_replacements = list() - rep_out = open("{}_replacements.txt".format(species), "w") - with open(target, "r") as file_in, open(outfile, "w") as file_out: + rep_out = open(f"{species}_replacements.txt", "w") + with open(target) as file_in, open(outfile, "w") as file_out: for line in file_in: if line.startswith("#"): if line.startswith("##FASTA"): @@ -266,7 +274,7 @@ def make_replacement_file(target, outfile, replacements, replacements_ids, speci alias_pattern = r";Alias=(.*?);" try: alias_name = re.search(alias_pattern, fields[8]).group(1) - except: + except: # noqa: E722 alias_name = None gene_alias_name = alias_name gene_id = id @@ -274,26 +282,24 @@ def make_replacement_file(target, outfile, replacements, replacements_ids, speci gene_pattern = r";gene=(.*?);" try: gene_name = re.search(gene_pattern, fields[8]).group(1) - except: + except: # noqa E722 gene_name = None if fields[2] in ["CDS", "mRNA", "exon"]: alias_name = gene_alias_name id = gene_id if alias_name and alias_name in replacements: if fields[2] == "gene": - rep_out.write( - "{}\t{}\n".format(gene_name, replacements[alias_name]) - ) + rep_out.write(f"{gene_name}\t{replacements[alias_name]}\n") line = line.replace(gene_name, replacements[alias_name]) count_replacements.append(alias_name) - print("ID is {}".format(id)) + print(f"ID is {id}") if id in replacements_ids: print("ID IS IN IDS") if fields[2] == "gene": - rep_out.write( - "{}\t{}\n".format(gene_name, replacements_ids[id]) + rep_out.write(f"{gene_name}\t{replacements_ids[id]}\n") + print( + "made replacement of", gene_name, replacements_ids[id] ) - print("made replacement of", gene_name, replacements_ids[id]) line = line.replace(gene_name, replacements_ids[id]) count_replacements.append(id) file_out.write(line) @@ -312,9 +318,7 @@ def resolve_duplicate( base, printed_stat, ): - logging.debug( - "=============>RESOLVING {} {}".format(genes_to_resolve, reference_dict) - ) + logging.debug(f"=============>RESOLVING {genes_to_resolve} {reference_dict}") resolved = dict() duplicate_removed = False for gene in reference_dict: @@ -338,7 +342,7 @@ def resolve_duplicate( printed_stat += "Yes\t\n" else: sys.exit("Unknown case") - logging.debug("============>Resolved {}".format(resolved)) + logging.debug(f"============>Resolved {resolved}") return replacements, reverse, stats_dict, printed_stat @@ -368,7 +372,7 @@ def load_duplicates(infile): dedupl_dict = dict() gene_occurrence_counter = dict() alias_repeats = dict() - with open(infile, "r") as file_in: + with open(infile) as file_in: for line in file_in: if not line.startswith("#"): if line.startswith(">"): @@ -380,15 +384,15 @@ def load_duplicates(infile): alias_pattern = r";Alias=(.*?);" try: gene_name = re.search(gene_pattern, annot).group(1) - except: + except: # noqa E722 gene_name = None try: locus_name = re.search(locus_pattern, annot).group(1) - except: + except: # noqa E722 locus_name = None try: alias_name = re.search(alias_pattern, annot).group(1) - except: + except: # noqa E722 alias_name = None if alias_name: alias_repeats[alias_name] = alias_repeats.get(alias_name, 0) + 1 @@ -426,7 +430,7 @@ def get_prefix(species): def load_ref_genenames(reference, species_prefix): alias_genename_dict = dict() - with open(reference, 
"r") as file_in: + with open(reference) as file_in: for line in file_in: if not line.startswith("#"): _, _, feature, _, _, _, _, _, annot = line.strip().split("\t") diff --git a/postprocessing/mob_merger_v2.py b/postprocessing/mob_merger_v2.py index a222843d..f063f8fb 100755 --- a/postprocessing/mob_merger_v2.py +++ b/postprocessing/mob_merger_v2.py @@ -1,15 +1,23 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 + +# Copyright 2023-2024 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# ruff: noqa: F841, N806, N803 -import re import argparse -import sys -import os.path - - -##### This script merge the coordinates of predicted MGEs using the Mobilome Annotation Pipeline v2.0 and proMGE v2023 -##### Alejandra Escobar, EMBL-EBI -##### Dec 6, 2023 -##### Apr 12, 2024 -> Fixes on merger function to debug overlapping coordinates of multiple nested MGEs +import re def gff_parser(current_line): @@ -49,7 +57,7 @@ def momo_parser(mobannot): boundaries = ["terminal_inverted_repeat_element", "direct_repeat_element"] - with open(mobannot, "r") as input_file: + with open(mobannot) as input_file: for line in input_file: line = line.rstrip() # Annotation lines have exactly 9 columns @@ -154,7 +162,7 @@ def promge_parser(promge, meta): ("cellular", "cellular_recombinase"), ] mge_desc = {} - with open(meta, "r") as input_meta: + with open(meta) as input_meta: next(input_meta) for line in input_meta: ( @@ -182,7 +190,7 @@ def promge_parser(promge, meta): promge_dict = {} mgeR = {} - with open(promge, "r") as input_gff: + with open(promge) as input_gff: for line in input_gff: line = line.rstrip() # Annotation lines have exactly 9 columns diff --git a/postprocessing/update_gff_with_alias.py b/postprocessing/update_gff_with_alias.py index 4c59e29c..7f1df048 100644 --- a/postprocessing/update_gff_with_alias.py +++ b/postprocessing/update_gff_with_alias.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # Copyright 2024 EMBL - European Bioinformatics Institute # @@ -23,7 +22,7 @@ def main(infile, outfile, liftoff_file, field_from, field_to): alias_dictionary = load_aliases(liftoff_file, field_from) fasta_flag = False fields_to_check = return_fields_to_print_to(field_to) - with open(infile, "r") as file_in, open(outfile, "w") as file_out: + with open(infile) as file_in, open(outfile, "w") as file_out: for line in file_in: if line.startswith("#"): if line.startswith("##FASTA"): @@ -35,14 +34,15 @@ def main(infile, outfile, liftoff_file, field_from, field_to): fields = line.strip().split("\t") if fields[2] in fields_to_check: attributes_dict = dict( - re.split(r"(?= 9 and columns[2] == "CDS": start, end = columns[3], columns[4] essentiality = coordinates_dict.get((start, end)) if essentiality: - columns[8] += f";transit_combined_hmm_gumbel_essentiality={essentiality}" - matched_coordinates.add((start,end)) - updated_lines.append('\t'.join(columns) + '\n') + columns[ + 8 + ] += f";transit_combined_hmm_gumbel_essentiality={essentiality}" + 
matched_coordinates.add((start, end))
+            updated_lines.append("\t".join(columns) + "\n")
         else:
-            updated_lines.append(line)
+            updated_lines.append(line)
     return updated_lines, matched_coordinates

+
 def write_updated_gff(output_file, updated_lines):
-    with open(output_file, 'w') as file:
+    with open(output_file, "w") as file:
         for line in updated_lines:
             file.write(line)

+
 def write_not_matched_file(not_matched_file, coordinates_dict, matched_coordinates):
-    not_matched_lines = [f"{start}\t{end}\t{essentiality}\n" for (start, end), (essentiality) in coordinates_dict.items() if (start, end) not in matched_coordinates]
-    with open(not_matched_file, 'w') as file:
+    not_matched_lines = [
+        f"{start}\t{end}\t{essentiality}\n"
+        for (start, end), (essentiality) in coordinates_dict.items()
+        if (start, end) not in matched_coordinates
+    ]
+    with open(not_matched_file, "w") as file:
         file.writelines(not_matched_lines)

+
 if __name__ == "__main__":
     if len(sys.argv) != 5:
-        print("""
+        print(
+            """
 Usage: python script.py <gff_file> <coordinates_file> <output_file> <not_matched_file>

 Match coordinates from essentiality file, produce modified GFF with new key-value pair and a file containing unmatched essentiality lines.
 The key inserted into the 9th column of CDS lines only is transit_combined_hmm_gumbel_essentiality (all lower case to fit with GFF specs)
 Note that the input is a summary of essentiality data (i.e. medium ignored for now)
-        """)
+        """
+        )
         sys.exit(1)

     gff_file = sys.argv[1]
@@ -54,6 +81,8 @@ def write_not_matched_file(not_matched_file, coordinates_dict, matched_coordinat
     not_matched_file = sys.argv[4]

     coordinates_dict = read_coordinates_file(coordinates_file)
-    updated_lines, matched_coordinates = update_gff_with_status(gff_file, coordinates_dict)
+    updated_lines, matched_coordinates = update_gff_with_status(
+        gff_file, coordinates_dict
+    )
     write_updated_gff(output_file, updated_lines)
     write_not_matched_file(not_matched_file, coordinates_dict, matched_coordinates)
diff --git a/postprocessing/update_gff_with_mapped_uniprot_ids.py b/postprocessing/update_gff_with_mapped_uniprot_ids.py
index b1fdb655..bbf6cb9c 100644
--- a/postprocessing/update_gff_with_mapped_uniprot_ids.py
+++ b/postprocessing/update_gff_with_mapped_uniprot_ids.py
@@ -1,17 +1,31 @@
 #!/usr/bin/env python3
+# Copyright 2023-2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + import argparse -import sys import os -import subprocess import re +import subprocess +import sys def load_uniprot_descriptions(uniprot_tsv): uniprot_descriptions = dict() entry_index = None protein_name_index = None - with open(uniprot_tsv, "r") as file_in: + with open(uniprot_tsv) as file_in: for line in file_in: if line.startswith("Entry"): fields = line.strip().split("\t") @@ -24,7 +38,9 @@ def load_uniprot_descriptions(uniprot_tsv): sys.exit("Could not get uniprot indices to load descriptions") else: fields = line.strip().split("\t") - uniprot_descriptions[fields[entry_index]] = escape_reserved_characters(fields[protein_name_index]) + uniprot_descriptions[fields[entry_index]] = escape_reserved_characters( + fields[protein_name_index] + ) assert len(uniprot_descriptions) > 0, "Uniprot description dictionary is empty." return uniprot_descriptions @@ -33,10 +49,10 @@ def escape_reserved_characters(string): reserved_characters = [";", "=", "&"] for ch in reserved_characters: if ch in string: - if ch == ';': + if ch == ";": string = string.replace(ch, "/") else: - string = string.replace(ch, "\{}".format(ch)) + string = string.replace(ch, f"\{ch}") return string @@ -66,7 +82,7 @@ def run_blast(pipeline_fasta, uniprot_fasta, blast_output): def extract_best_hits(blast_output): best_hits = {} - with open(blast_output, "r") as file: + with open(blast_output) as file: for line in file: fields = line.strip().split("\t") query_id = fields[0] @@ -90,13 +106,13 @@ def create_mapping_file(best_hits, mapping_file): def update_gff_with_mapping(gff_file, mapping_file, output_file, uniprot_descriptions): mapping_dict = {} - with open(mapping_file, "r") as file: + with open(mapping_file) as file: next(file) # Skip header for line in file: query, result, _, _ = line.strip().split("\t") mapping_dict[result] = query - with open(gff_file, "r") as infile, open(output_file, "w") as outfile: + with open(gff_file) as infile, open(output_file, "w") as outfile: for line in infile: if line.startswith("#"): outfile.write(line) @@ -116,12 +132,20 @@ def update_gff_with_mapping(gff_file, mapping_file, output_file, uniprot_descrip identifier = identifier[len("CDS:") :] if identifier in mapping_dict: if "Dbxref" in attributes_dict: - attributes_dict["Dbxref"] += f",UniProt:{mapping_dict[identifier]}" + attributes_dict[ + "Dbxref" + ] += f",UniProt:{mapping_dict[identifier]}" else: - attributes_dict["Dbxref"] = f"UniProt:{mapping_dict[identifier]}" + attributes_dict["Dbxref"] = ( + f"UniProt:{mapping_dict[identifier]}" + ) if mapping_dict[identifier] in uniprot_descriptions: - attributes_dict["uniprot_prot_name"] = uniprot_descriptions[mapping_dict[identifier]] - fields[8] = ";".join([f"{key}={value}" for key, value in attributes_dict.items()]) + attributes_dict["uniprot_prot_name"] = uniprot_descriptions[ + mapping_dict[identifier] + ] + fields[8] = ";".join( + [f"{key}={value}" for key, value in attributes_dict.items()] + ) outfile.write("\t".join(fields) + "\n") @@ -181,11 +205,7 @@ def update_gff_with_mapping(gff_file, mapping_file, output_file, uniprot_descrip species = args.species if " " in species: - sys.exit( - "The species parameter {} is invalid. Spaces are not allowed".format( - species - ) - ) + sys.exit(f"The species parameter {species} is invalid. 
Spaces are not allowed") if args.include_description: if not args.uniprot_tsv: sys.exit("Must provide a UniProt TSV if the --description flag is used") @@ -213,6 +233,8 @@ def update_gff_with_mapping(gff_file, mapping_file, output_file, uniprot_descrip # Update GFF with mapping if args.include_description: - update_gff_with_mapping(gff_file, mapping_file, output_gff_file, uniprot_descriptions) + update_gff_with_mapping( + gff_file, mapping_file, output_gff_file, uniprot_descriptions + ) else: update_gff_with_mapping(gff_file, mapping_file, output_gff_file, dict()) diff --git a/preprocessing/generate_input_file.py b/preprocessing/generate_input_file.py index be1bd828..971d9ef7 100644 --- a/preprocessing/generate_input_file.py +++ b/preprocessing/generate_input_file.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # Copyright 2024 EMBL - European Bioinformatics Institute # @@ -26,21 +25,19 @@ def main(infile, input_dir, bat_dir, outfile, no_prefix): all_prefixes = list() if not os.path.exists(infile): - sys.exit("Input file {} does not exist!".format(infile)) + sys.exit(f"Input file {infile} does not exist!") if not os.path.exists(bat_dir): - sys.exit("BAT directory {} does not exist!".format(bat_dir)) + sys.exit(f"BAT directory {bat_dir} does not exist!") if not os.path.exists(input_dir): - sys.exit("Input directory {} does not exist!".format(input_dir)) + sys.exit(f"Input directory {input_dir} does not exist!") - with open(infile, "r") as f_in, open(outfile, "w") as f_out: + with open(infile) as f_in, open(outfile, "w") as f_out: f_out.write("prefix,assembly,taxid\n") for line in f_in: line = line.strip() if line.endswith(("gz", "zip")): sys.exit( - "Fasta file {} is archived. Please provide only unarchived files.".format( - line - ) + f"Fasta file {line} is archived. Please provide only unarchived files." ) taxid = parse_taxid(line, bat_dir) if not no_prefix: @@ -55,18 +52,18 @@ def main(infile, input_dir, bat_dir, outfile, no_prefix): all_prefixes.append(prefix) full_path = os.path.join(input_dir, line) if no_prefix: - f_out.write(",{},{}\n".format(full_path, taxid)) + f_out.write(f",{full_path},{taxid}\n") else: - f_out.write("{},{},{}\n".format(prefix, full_path, taxid)) - logging.info("Saved results to {}".format(outfile)) + f_out.write(f"{prefix},{full_path},{taxid}\n") + logging.info(f"Saved results to {outfile}") def parse_taxid(fasta_file, bat_dir): bat_filename = os.path.splitext(fasta_file)[0] + ".bin2classification.txt" bat_filepath = os.path.join(bat_dir, bat_filename) if not os.path.exists(bat_filepath): - sys.exit("Bat file {} does not exist!".format(bat_filepath)) - with open(bat_filepath, "r") as f_in: + sys.exit(f"Bat file {bat_filepath} does not exist!") + with open(bat_filepath) as f_in: # skip header f_in.readline() taxid_string = f_in.readline().strip().split("\t")[3] @@ -74,8 +71,8 @@ def parse_taxid(fasta_file, bat_dir): if not taxid: taxid = "FILL IN MANUALY" logging.warning( - "No taxid found for taxid {}. If taxid is known, fill it out manually in the generated " - "output file.".format(taxid) + f"No taxid found for taxid {taxid}. If taxid is known, fill it out manually in the generated " + "output file." ) return taxid @@ -106,7 +103,7 @@ def parse_args(): dest="infile", required=True, help="A file containing a list of genome files to include (file name only, with file extension, unzipped, " - "one file per line). ", + "one file per line). 
", ) parser.add_argument( "-d", diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 56110621..00000000 --- a/pyproject.toml +++ /dev/null @@ -1,15 +0,0 @@ -# Config file for Python. Mostly used to configure linting of bin/*.py with Ruff. -# Should be kept the same as nf-core/tools to avoid fighting with template synchronisation. -[tool.ruff] -line-length = 120 -target-version = "py38" -cache-dir = "~/.cache/ruff" - -[tool.ruff.lint] -select = ["I", "E1", "E4", "E7", "E9", "F", "UP", "N"] - -[tool.ruff.lint.isort] -known-first-party = ["nf_core"] - -[tool.ruff.lint.per-file-ignores] -"__init__.py" = ["E402", "F401"] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 00000000..2c0938cc --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,5 @@ +pytest==8.3.3 +pytest-workflow==2.1.0 +pytest-datadir==1.5.0 +black==24.8.0 +ruff==0.6.8 diff --git a/subworkflows/download_databases.nf b/subworkflows/download_databases.nf index f1f1347e..5ffb88d6 100644 --- a/subworkflows/download_databases.nf +++ b/subworkflows/download_databases.nf @@ -12,6 +12,7 @@ include { INTEPROSCAN_GETDB } from '../modules/local/interproscan_getdb' include { INTEPRO_ENTRY_LIST_GETDB } from '../modules/local/interpro_list_getdb' include { RFAM_GETMODELS } from '../modules/local/rfam_getmodels' include { BAKTA_GETDB } from '../modules/local/bakta_getdb' +include { PSEUDOFINDER_GETDB } from '../modules/local/pseudofinder_getdb' workflow DOWNLOAD_DATABASES { @@ -26,6 +27,7 @@ workflow DOWNLOAD_DATABASES { interpro_entry_list = channel.empty() eggnog_db = channel.empty() bakta_db = channel.empty() + pseudofinder_db = channel.empty() amrfinder_plus_dir = file("$params.dbs/amrfinder/") antismash_dir = file("$params.dbs/antismash") @@ -36,6 +38,7 @@ workflow DOWNLOAD_DATABASES { eggnog_data_dir = file("$params.dbs/eggnog") rfam_ncrna_models = file("$params.dbs/rfam_models/rfam_ncrna_cms") bakta_dir = file("$params.dbs/bakta") + pseudofinder_dir = file("$params.dbs/uniprot_sprot") if (amrfinder_plus_dir.exists()) { amrfinder_plus_db = tuple( @@ -126,6 +129,17 @@ workflow DOWNLOAD_DATABASES { ) } + if (pseudofinder_dir.exists()) { + log.info("Pseudofinder database exists, or at least the expected folder.") + pseudofinder_db = tuple( + pseudofinder_dir, + file("${pseudofinder_dir}/VERSION.txt", checkIfExists: true).text // the DB version + ) + } else { + PSEUDOFINDER_GETDB() + pseudofinder_db = PSEUDOFINDER_GETDB.out.pseudofinder_db.first() + } + if (params.bakta) { if (bakta_dir.exists()) { log.info("Bakta database exists, or at least the expected folders.") @@ -148,5 +162,6 @@ workflow DOWNLOAD_DATABASES { eggnog_db = eggnog_db rfam_ncrna_models = rfam_ncrna_models bakta_db = bakta_db + pseudofinder_db = pseudofinder_db } diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/scripts/__init__.py b/tests/scripts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_inputs/deduplication_script_test_input.gff b/tests/scripts/data/deduplication_script_test_input.gff similarity index 100% rename from tests/test_inputs/deduplication_script_test_input.gff rename to tests/scripts/data/deduplication_script_test_input.gff diff --git a/tests/test_inputs/dummy_reference.gff b/tests/scripts/data/dummy_reference.gff similarity index 100% rename from tests/test_inputs/dummy_reference.gff rename to tests/scripts/data/dummy_reference.gff diff --git 
a/tests/scripts/data/pseudofinder_test_inputs/expected_processed_result.gff b/tests/scripts/data/pseudofinder_test_inputs/expected_processed_result.gff new file mode 100644 index 00000000..7afbee33 --- /dev/null +++ b/tests/scripts/data/pseudofinder_test_inputs/expected_processed_result.gff @@ -0,0 +1,4 @@ +##gff-version 3 +#!annotation-date 2024-12-10 16:01:58 +##sequence-region NZ_LN831051.1 1 2110968 +NZ_LN831051.1 pseudofinder gene 1425 2302 . + . note=Pseudogene candidate. Reason(s):Predicted fragmentation of a single gene.;locus_tag=GCF_001457635.1_1_pseudo_00001;old_locus_tag=GCF_001457635.1_00003,GCF_001457635.1_1_ign_00003,GCF_001457635.1_00004 diff --git a/tests/scripts/data/pseudofinder_test_inputs/prokka_compliant.gff b/tests/scripts/data/pseudofinder_test_inputs/prokka_compliant.gff new file mode 100644 index 00000000..d578e983 --- /dev/null +++ b/tests/scripts/data/pseudofinder_test_inputs/prokka_compliant.gff @@ -0,0 +1,10 @@ +##gff-version 3 +##sequence-region gnl|Prokka|GCF_001457635.1_1 1 2110968 +gnl|Prokka|GCF_001457635.1_1 prokka gene 558 758 . + . ID=GCF_001457635.1_00001_gene;locus_tag=GCF_001457635.1_00001 +gnl|Prokka|GCF_001457635.1_1 Prodigal:002006 CDS 558 758 . + 0 ID=GCF_001457635.1_00001;Parent=GCF_001457635.1_00001_gene;inference=ab initio prediction:Prodigal:002006;locus_tag=GCF_001457635.1_00001;product=hypothetical protein;protein_id=gnl|Prokka|GCF_001457635.1_00001 +gnl|Prokka|GCF_001457635.1_1 prokka gene 991 1263 . + . ID=GCF_001457635.1_00002_gene;locus_tag=GCF_001457635.1_00002 +gnl|Prokka|GCF_001457635.1_1 Prodigal:002006 CDS 991 1263 . + 0 ID=GCF_001457635.1_00002;Parent=GCF_001457635.1_00002_gene;inference=ab initio prediction:Prodigal:002006;locus_tag=GCF_001457635.1_00002;product=hypothetical protein;protein_id=gnl|Prokka|GCF_001457635.1_00002 +gnl|Prokka|GCF_001457635.1_1 prokka gene 1425 1559 . + . ID=GCF_001457635.1_00003_gene;Name=pgk_1;gene=pgk_1;locus_tag=GCF_001457635.1_00003 +gnl|Prokka|GCF_001457635.1_1 Prodigal:002006 CDS 1425 1559 . + 0 ID=GCF_001457635.1_00003;Parent=GCF_001457635.1_00003_gene;eC_number=2.7.2.3;Name=pgk_1;db_xref=COG:COG0126;gene=pgk_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:Q8DQX8;locus_tag=GCF_001457635.1_00003;product=Phosphoglycerate kinase;protein_id=gnl|Prokka|GCF_001457635.1_00003 +gnl|Prokka|GCF_001457635.1_1 prokka gene 1958 2302 . + . ID=GCF_001457635.1_00005_gene;Name=pgk_2;gene=pgk_2;locus_tag=GCF_001457635.1_00005 +gnl|Prokka|GCF_001457635.1_1 Prodigal:002006 CDS 1958 2302 . + 0 ID=GCF_001457635.1_00005;Parent=GCF_001457635.1_00005_gene;eC_number=2.7.2.3;Name=pgk_2;db_xref=COG:COG0126;gene=pgk_2;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:Q8DQX8;locus_tag=GCF_001457635.1_00005;product=Phosphoglycerate kinase;protein_id=gnl|Prokka|GCF_001457635.1_00005 diff --git a/tests/scripts/data/pseudofinder_test_inputs/prokka_standard.gff b/tests/scripts/data/pseudofinder_test_inputs/prokka_standard.gff new file mode 100644 index 00000000..f1bc8a25 --- /dev/null +++ b/tests/scripts/data/pseudofinder_test_inputs/prokka_standard.gff @@ -0,0 +1,7 @@ +##gff-version 3 +##sequence-region NZ_LN831051.1 1 2110968 +NZ_LN831051.1 Prodigal:002006 CDS 558 758 . + 0 ID=GCF_001457635.1_00001;inference=ab initio prediction:Prodigal:002006;locus_tag=GCF_001457635.1_00001;product=hypothetical protein +NZ_LN831051.1 Prodigal:002006 CDS 991 1263 . 
+ 0 ID=GCF_001457635.1_00002;inference=ab initio prediction:Prodigal:002006;locus_tag=GCF_001457635.1_00002;product=hypothetical protein +NZ_LN831051.1 Prodigal:002006 CDS 1425 1559 . + 0 ID=GCF_001457635.1_00003;eC_number=2.7.2.3;Name=pgk_1;db_xref=COG:COG0126;gene=pgk_1;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:Q8DQX8;locus_tag=GCF_001457635.1_00003;product=Phosphoglycerate kinase +NZ_LN831051.1 Prodigal:002006 CDS 1958 2302 . + 0 ID=GCF_001457635.1_00004;eC_number=2.7.2.3;Name=pgk_2;db_xref=COG:COG0126;gene=pgk_2;inference=ab initio prediction:Prodigal:002006,similar to AA sequence:UniProtKB:Q8DQX8;locus_tag=GCF_001457635.1_00004;product=Phosphoglycerate kinase +NZ_LN831051.1 Prodigal:002006 CDS 2286 2615 . + 0 ID=GCF_001457635.1_00005;eC_number=2.7.2.3;Name=pgk_3;gene=pgk_3;inference=ab initio prediction:Prodigal:002006,protein motif:HAMAP:MF_00145;locus_tag=GCF_001457635.1_00005;product=Phosphoglycerate kinase diff --git a/tests/scripts/data/pseudofinder_test_inputs/pseudofinder_result.gff b/tests/scripts/data/pseudofinder_test_inputs/pseudofinder_result.gff new file mode 100644 index 00000000..3142d5aa --- /dev/null +++ b/tests/scripts/data/pseudofinder_test_inputs/pseudofinder_result.gff @@ -0,0 +1,4 @@ +##gff-version 3 +#!annotation-date 2024-12-10 16:01:58 +##sequence-region GCF_001457635.1_1 1 2110968 +GCF_001457635.1_1 pseudofinder gene 1425 2302 . + . note=Pseudogene candidate. Reason(s):Predicted fragmentation of a single gene.;locus_tag=GCF_001457635.1_1_pseudo_00001;old_locus_tag=GCF_001457635.1_00003,GCF_001457635.1_1_ign_00003,GCF_001457635.1_00005 diff --git a/tests/scripts/test_add_hypothetical_protein_descriptions.py b/tests/scripts/test_add_hypothetical_protein_descriptions.py index 71d140c4..daad9b74 100644 --- a/tests/scripts/test_add_hypothetical_protein_descriptions.py +++ b/tests/scripts/test_add_hypothetical_protein_descriptions.py @@ -1,254 +1,269 @@ -#!/bin/env python3 +#!/usr/bin/env python3 + +# Copyright 2024 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
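+
+# pytest-based unit tests for bin/add_hypothetical_protein_descriptions.py; the
+# shared inputs below are provided via the setup_data fixture instead of
+# unittest's setUp().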
import copy -import os import re -import sys -import unittest -sys.path.append(os.path.sep.join(os.getcwd().split(os.path.sep)[:-2])) +import pytest from bin.add_hypothetical_protein_descriptions import ( - keep_or_move_to_note, - move_function_to_note, + GeneCaller, clean_up_function, - insert_product_source, - get_function, escape_reserved_characters, + get_function, + insert_product_source, + keep_or_move_to_note, + move_function_to_note, ) -class TestHypotheticalProteinAnnotation(unittest.TestCase): - def setUp(self): - self.eggnog_annot = {"my_id": "eggnog function"} - self.attributes_dict = { - "ID": "my_id", - "product": "hypothetical protein", - "protein_id": "protein id", - } - self.ipr_info = { - "my_id": { - "Family": { - "NCBIFam": { - "match": 0.80, - "ipr_desc": "NCBI Fam function", - "sig_desc": "sig desc", - "level": 1, - }, - "Pfam": { - "match": 0.90, - "ipr_desc": "Pfam Fam function", - "sig_desc": "sig desc", - "level": 0, - }, +@pytest.fixture +def setup_data(): + eggnog_annot = {"my_id": "eggnog function"} + attributes_dict = { + "ID": "my_id", + "product": "hypothetical protein", + "protein_id": "protein id", + } + ipr_info = { + "my_id": { + "Family": { + "NCBIFam": { + "match": 0.80, + "ipr_desc": "NCBI Fam function", + "sig_desc": "sig desc", + "level": 1, }, - "Domain": { - "CDD": { - "match": 1.00, - "ipr_desc": "CDD Domain function", - "sig_desc": "sig desc", - "level": 0, - } + "Pfam": { + "match": 0.90, + "ipr_desc": "Pfam Fam function", + "sig_desc": "sig desc", + "level": 0, }, - } + }, + "Domain": { + "CDD": { + "match": 1.00, + "ipr_desc": "CDD Domain function", + "sig_desc": "sig desc", + "level": 0, + } + }, } - self.ipr_memberdb_only = { - "my_id": { - "no_type": { - "NCBIFam": { - "match": 0.80, - "ipr_desc": "NCBI Fam function", - "sig_desc": "sig desc", - "level": 2, - }, - "Pfam": { - "match": 0.90, - "ipr_desc": "Pfam Fam function", - "sig_desc": "sig desc", - "level": 0, - }, + } + ipr_memberdb_only = { + "my_id": { + "no_type": { + "NCBIFam": { + "match": 0.80, + "ipr_desc": "NCBI Fam function", + "sig_desc": "sig desc", + "level": 2, }, - } - } - - def test_get_function_ncbifam_priority(self): - # Tests which function source is best available. In this test, NCBIFam has priority. - result = get_function( - "my_id", - self.attributes_dict, - self.eggnog_annot, - self.ipr_info, - self.ipr_memberdb_only, - ) - self.assertEqual(result, ("NCBI Fam function", "InterPro(NCBIFam)")) - - def test_get_function_level_priority(self): - # Tests which function source is the best one available. In this test, Pfam has priority. - ipr_test = copy.deepcopy(self.ipr_info) - ipr_test["my_id"]["Family"]["Pfam"].update({"level": 1}) - result = get_function( - "my_id", - self.attributes_dict, - self.eggnog_annot, - ipr_test, - self.ipr_memberdb_only, - ) - self.assertEqual(result, ("Pfam Fam function", "InterPro(Pfam)")) - - def test_get_function_level_eggnog(self): - # Tests which function source is the best one available. In this test, eggnog is the only result. - result = get_function( - "my_id", - self.attributes_dict, - self.eggnog_annot, - {}, - {}, - ) - self.assertEqual(result, ("eggnog function", "eggNOG")) - - def test_get_function_level_pfam_overtake(self): - # Tests which function source is the best one available. In this test, Pfam overtakes other annotations - # because it has priority. 
- ipr_test = copy.deepcopy(self.ipr_info) - del ipr_test["my_id"]["Family"]["NCBIFam"] - ipr_test["my_id"]["Family"].update({"CDD": ipr_test["my_id"]["Domain"]["CDD"]}) - result = get_function( - "my_id", - self.attributes_dict, - self.eggnog_annot, - ipr_test, - self.ipr_memberdb_only, - ) - self.assertEqual(result, ("Pfam Fam function", "InterPro(Pfam)")) - - def test_get_function_level_pfam_overtake_rejected(self): - # Tests which function source is the best one available. In this test, Pfam should not take over other - # annotations because it has a higher level. - ipr_test = copy.deepcopy(self.ipr_info) - del ipr_test["my_id"]["Family"]["NCBIFam"] - ipr_test["my_id"]["Family"].update({"CDD": ipr_test["my_id"]["Domain"]["CDD"]}) - ipr_test["my_id"]["Family"]["CDD"].update({"level": 1}) - result = get_function( - "my_id", - self.attributes_dict, - self.eggnog_annot, - ipr_test, - self.ipr_memberdb_only, - ) - self.assertEqual(result, ("CDD Domain function", "InterPro(CDD)")) - - def test_insert_product_source(self): - # Tests adding the product source field to the 9th column, after the product field - product_source = "UniFIRE" - result = insert_product_source(self.attributes_dict, product_source) - self.assertEqual( - result, - { - "ID": "my_id", - "product": "hypothetical protein", - "product_source": "UniFIRE", - "protein_id": "protein id", - }, - ) - - def test_escape_reserved_characters(self): - # Tests replacing special characters - func = "2,3-containing domain, functional protein; integrase" - result = escape_reserved_characters(func) - self.assertEqual( - result, "2%2C3-containing domain/ functional protein/ integrase" - ) - - def test_clean_up_function_domain(self): - # Description ends with "domain", need to add "-containing protein" - func = "ATPase, nucleotide binding domain" - result = clean_up_function(func) - self.assertEqual(result, "ATPase, nucleotide binding domain-containing protein") - - def test_clean_up_function_no_change(self): - # Description contains the word "protein", no need to change - func = "Ribosomal protein L23/L15e core domain superfamily" - result = clean_up_function(func) - self.assertEqual(result, func) - - def test_clean_up_function_add_protein(self): - # Description ends with "family" and doesn't contain the word protein, "protein" should be added - func = "Ribosomal L23/L15e core domain superfamily" - result = clean_up_function(func) - self.assertEqual(result, "Ribosomal L23/L15e core domain superfamily protein") - - def test_keep_or_move_to_note_sentence(self): - # Function is a sentence, should be moved to the note field - func = "This is a protein" - result = keep_or_move_to_note( - func, "eggNOG", {"ID": "123", "locus_tag": "locus_tag"} - ) - self.assertEqual( - result, - ( - "hypothetical protein", - "Prokka", - { - "ID": "123", - "locus_tag": "locus_tag", - "note": "eggNOG:This is a protein", + "Pfam": { + "match": 0.90, + "ipr_desc": "Pfam Fam function", + "sig_desc": "sig desc", + "level": 0, }, - ), - ) - - def test_keep_or_move_to_note_correct(self): - # Function is in the correct format, keep it as is. 
- func = "ABC-hydrolase interfering protein with extra large domain" - result = keep_or_move_to_note( - func, "eggNOG", {"ID": "123", "locus_tag": "locus_tag"} - ) - self.assertEqual( - result, - ( - func, - "eggNOG", - {"ID": "123", "locus_tag": "locus_tag"}, - ), - ) - - def test_move_function_to_note_with_existing_note(self): - # Tests appending function description to the existing text in the note field - col9_with_note = "ID=BU_ATCC8492_03165;locus_tag=BU_ATCC8492_03165;note=UPF0056 inner membrane protein YhgN;product=NAAT family transporter;product_source=NCBIfam;eggNOG=585543.HMPREF0969_01099" - col9_dict = dict( - re.split(r"(?