Merge pull request #53 from EBI-Metagenomics/dev

Dev
EBI-Metagenomics · Dec 20, 2024 · 0d08e4f · 0d08e4f
2 parents 5fe9266 + a6c0d9a
commit 0d08e4f
Show file tree

Hide file tree

Showing 86 changed files with 1,973 additions and 641 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -30,8 +30,8 @@ jobs:
       - name: Check out pipeline code
         uses: actions/checkout@v3
 
-      - name: Install Nextflow
-        uses: nf-core/setup-nextflow@v1
+      - name: Set up Nextflow
+        uses: nf-core/setup-nextflow@v2
         with:
           version: "${{ matrix.NXF_VER }}"
 

diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -44,7 +44,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install nf-core
+          pip install nf-core==2.14.1
 
       - name: Run nf-core lint
         env:

diff --git a/.github/workflows/pytest_ci.yml b/.github/workflows/pytest_ci.yml
@@ -0,0 +1,32 @@
+name: Python Tests  # runs the pytest suite for the repository's Python code
+
+on:
+  push:
+    branches: [main]  # pushes are only tested on main
+  pull_request:
+    branches: [main, dev]  # pull requests targeting main or dev
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        python-version: ["3.9"]  # quoted: an unquoted version such as 3.10 would be parsed as the float 3.1
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies  # development/test requirements, including pytest
+        run: |
+          pip install -r requirements-dev.txt
+
+      - name: Run tests  # pytest discovers tests from the repository root
+        run: |
+          pytest
diff --git a/.gitignore b/.gitignore
@@ -1,9 +1,46 @@
+# Ignore Nextflow-specific files, including logs, cache, and temporary files
 .nextflow*
+
+# Ignore Nextflow's work directory where intermediate files are stored
 work/
-data/
+
+# Ignore results directory, as this is typically generated output that can be large and recreated
 results/
+
+# MacOS specific hidden file that stores folder view settings
 .DS_Store
+
 testing/
 testing*
-*.pyc
+
 dbs/
+
+node_modules/
+
+# Optional: ignore Python artifacts - the __pycache__ cache directory, compiled (.pyc) and optimized (.pyo) bytecode, and pickle files (often generated during data processing). Keep comments on their own line: gitignore has no inline comments.
+__pycache__/
+*.pyc
+*.pyo
+*.pkl
+
+# Ignore any virtual environment directories used to isolate Python dependencies
+venv/
+env/
+*.venv/
+
+# Ignore Jupyter Notebook checkpoints, if notebooks are used for analysis or reporting
+.ipynb_checkpoints/
+
+# Ignore any temporary, swap, or backup files created by editors like Vim or Emacs
+*~
+*.swp
+*.swo
+*.bak
+
+.coverage
+htmlcov/
+*.cover
+reports/
+trace/
+.cache/
+logs/
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -1,4 +1,8 @@
 repository_type: pipeline
+org_path: ebi-metagenomics
+template:
+  prefix: ebi-metagenomics
+  skip: [github_badges]  # template features left out of this pipeline; must be its own key, not a continuation of prefix
 lint:
   files_exist:
     - CODE_OF_CONDUCT.md
@@ -26,10 +30,12 @@ lint:
     - docs/images/nf-core-mettannotator_logo_light.png
     - docs/images/nf-core-mettannotator_logo_dark.png
     - .github/ISSUE_TEMPLATE/bug_report.yml
-  nextflow_config: False
-    - params.custom_config_version
-    - params.custom_config_base
-    - manifest.name
-    - manifest.homePage
+    - .github/workflows/linting.yml
+    - .github/CONTRIBUTING.md
+    - .github/workflows/branch.yml
+    - .github/workflows/linting_comment.yml
+    - .github/PULL_REQUEST_TEMPLATE.md
+    - .gitignore
+  nextflow_config: false
   multiqc_config:
     - report_comment
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,5 +1,20 @@
 repos:
   - repo: https://github.com/pre-commit/mirrors-prettier
-    rev: "v2.7.1"
+    rev: "v4.0.0-alpha.8"
     hooks:
       - id: prettier
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/psf/black
+    rev: 24.8.0
+    hooks:
+      - id: black
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.6.8
+    hooks:
+      - id: ruff
+        args: [--fix]
diff --git a/.ruff.toml b/.ruff.toml
@@ -0,0 +1,12 @@
+line-length = 120
+target-version = "py38"
+cache-dir = "~/.cache/ruff"
+
+[lint]
+select = ["I", "E1", "E4", "E7", "E9", "F", "UP", "N"]
+
+[lint.isort]
+known-first-party = ["nf_core"]
+
+[lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
diff --git a/CITATIONS.md b/CITATIONS.md
@@ -62,6 +62,10 @@
 
   > Sanchez S, Rogers JD, Rogers AB, Nassar M, McEntyre J, Welch M, Hollfelder F, Finn RD. Expansion of novel biosynthetic gene clusters from diverse environments using SanntiS. bioRxiv 2023.05.23.540769; doi: https://doi.org/10.1101/2023.05.23.540769
 
+- [Pseudofinder](https://doi.org/10.1093/molbev/msac153)
+
+  > Syberg-Olsen MJ, Garber AI, Keeling PJ, McCutcheon JP, Husnik F. Pseudofinder: Detection of Pseudogenes in Prokaryotic Genomes. Mol Biol Evol. 2022 Jul 2;39(7):msac153. doi: 10.1093/molbev/msac153. PMID: 35801562; PMCID: PMC9336565.
+
 - [run_dbCAN](https://pubmed.ncbi.nlm.nih.gov/37125649/)
 
   > Zheng J, Ge Q, Yan Y, Zhang X, Huang L, Yin Y. dbCAN3: automated carbohydrate-active enzyme and substrate annotation. Nucleic Acids Res. 2023 Jul 5;51(W1):W115-W121. doi: 10.1093/nar/gkad328. PMID: 37125649; PMCID: PMC10320055.

diff --git a/README.md b/README.md
@@ -13,6 +13,7 @@
 - [ Usage ](#usage)
 - [ Test ](#test)
 - [ Outputs ](#out)
+- [Preparing annotations for ENA or GenBank submission](#submission)
 - [ Mobilome annotation ](#mobilome)
 - [ Credits ](#credit)
 - [ Contributions and Support ](#contribute)
@@ -39,6 +40,8 @@ The workflow uses the following tools and databases:
 | [Prokka](https://github.com/tseemann/prokka)                                                     | 1.14.6                                        | CDS calling and functional annotation (default)                                                                        |
 | [Bakta](https://github.com/oschwengers/bakta)                                                    | 1.9.3                                         | CDS calling and functional annotation (if --bakta flag is used)                                                        |
 | [Bakta db](https://zenodo.org/record/10522951/)                                                  | 2024-01-19 with AMRFinderPlus DB 2024-01-31.1 | Bakta DB (when Bakta is used as the gene caller)                                                                       |
+| [Pseudofinder](https://github.com/filip-husnik/pseudofinder)                                     | v1.1.0                                        | Identification of possible pseudogenes                                                                                 |
+| [Swiss-Prot](https://www.uniprot.org/help/downloads)                                             | 2024_06                                       | Database for Pseudofinder                                                                                              |
 | [InterProScan](https://www.ebi.ac.uk/interpro/about/interproscan/)                               | 5.62-94.0                                     | Protein annotation (InterPro, Pfam)                                                                                    |
 | [eggNOG-mapper](https://github.com/eggnogdb/eggnog-mapper)                                       | 2.1.11                                        | Protein annotation (eggNOG, KEGG, COG, GO-terms)                                                                       |
 | [eggNOG DB](http://eggnog6.embl.de/download/)                                                    | 5.0.2                                         | Database for eggNOG-mapper                                                                                             |
@@ -89,7 +92,8 @@ The pipeline needs reference databases in order to work, they take roughly 180G.
 | interproscan        | 45G  |
 | interpro_entry_list | 2.6M |
 | rfam_models         | 637M |
-| total               | 180G |
+| pseudofinder        | 273M |
+| total               | 182G |
 
 `mettannotator` has an automated mechanism to download the databases using the `--dbs <db_path>` flag. When this flag is provided, the pipeline inspects the folder to verify if the required databases are already present. If any of the databases are missing, the pipeline will automatically download them.
 
@@ -177,8 +181,12 @@ Note, that by default the script uses FASTA file names as prefixes and truncates
 
 Running `mettannotator` with the `--help` option will pull the repository and display the help message:
 
+> [!NOTE]
+> We use the `-latest` flag with the `nextflow run` command, which ensures that the latest available version of the pipeline is pulled.
+> If you encounter any issues with the `nextflow run` command, please refer to the [Nextflow documentation](https://www.nextflow.io/docs/latest/reference/cli.html#run).
+
 ```angular2html
-nextflow run ebi-metagenomics/mettannotator/main.nf --help
+$ nextflow run -latest ebi-metagenomics/mettannotator/main.nf --help
 N E X T F L O W  ~  version 23.04.3
 Launching `mettannotator/main.nf` [disturbed_davinci] DSL2 - revision: f2a0e51af6
 
@@ -229,6 +237,8 @@ Reference databases
                                                database for version 4.0 on this ftp location:
                                                ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/tool-dbs/dbcan/dbcan_4.0.tar.gz
   --dbcan_db_version                 [string]  The dbCAN reference database version. [default: 4.1.3_V12]
+  --pseudofinder_db                  [string]  Pseudofinder reference database. Mettannotator uses SwissProt as the database for Pseudofinder.
+  --pseudofinder_db_version          [string]  SwissProt version. [default: 2024_06]
 
 Generic options
   --multiqc_methods_description      [string]  Custom MultiQC yaml file containing HTML including a methods description.
@@ -259,17 +269,25 @@ nextflow run ebi-metagenomics/mettannotator \
    --dbs <PATH/TO/WHERE/DBS/WILL/BE/SAVED>
 ```
 
-> **Warning:**
+> [!WARNING]
 > Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those
 > provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;
 > see [docs](https://nf-co.re/usage/configuration#custom-configuration-files).
 
+#### Running the pipeline from the source code
+
+If the Nextflow integration with Git does not work, users can download the tarball from the releases page. After extracting the tarball, the pipeline can be run directly by executing the following command:
+
+```bash
+$ nextflow run path-to-source-code/main.nf --help
+```
+
 #### Local execution
 
 The pipeline can be run on a desktop or laptop, with the caveat that it will take a few hours to complete depending on the resources. There is a local profile in the Nextflow config that limits the total resources the pipeline can use to 8 cores and 12 GB of RAM. In order to run it (Docker or Singularity are still required):
 
 ```bash
-nextflow run ebi-metagenomics/mettannotator \
+nextflow run -latest ebi-metagenomics/mettannotator \
    -profile local,<docker or singularity> \
    --input assemblies_sheet.csv \
    --outdir <OUTDIR> \
@@ -302,7 +320,7 @@ To run the pipeline using a test dataset, execute the following command:
 ```bash
 wget https://raw.githubusercontent.com/EBI-Metagenomics/mettannotator/master/tests/test.csv
 
-nextflow run ebi-metagenomics/mettannotator \
+nextflow run -latest ebi-metagenomics/mettannotator \
    -profile <docker/singularity/...> \
    --input test.csv \
    --outdir <OUTDIR> \
@@ -331,6 +349,7 @@ The output folder structure will look as follows:
    │  ├─interproscan
    │  ├─merged_gff
    │  ├─prokka
+   │  ├─pseudofinder
    │  └─unifire
    ├─mobilome
    │  └─crisprcas_finder
@@ -429,12 +448,49 @@ The following logic is used by `mettannotator` to fill out the `product` field i
 
 If the pipeline is executed with the `--fast` flag, only the output of eggNOG-mapper is used to determine the product of proteins that were labeled as hypothetical by the gene caller.
 
+#### Detection of pseudogenes and spurious ORFs
+
+`mettannotator` uses several approaches to detect pseudogenes and spurious ORFs:
+
+- If Bakta is used as the initial annotation tool, `mettannotator` will inherit the pseudogene labels assigned by Bakta.
+- `mettannotator` runs Pseudofinder and labels genes that Pseudofinder predicts to be pseudogenes by adding `"pseudo=true"` to the 9th column of the final merged GFF file. If Pseudofinder and Bakta disagree, the gene is still labeled as a pseudogene as long as either tool calls it one.
+- AntiFam, which is a part of InterPro, is used to identify potential spurious ORFs. If an ORF has an AntiFam hit, `mettannotator` will remove it from the final merged GFF file. These ORFs will still appear in the raw outputs of Bakta/Prokka and may appear in other tool outputs.
+
+`mettannotator` produces a report file which is located in the `merged_gff` folder and includes a list of CDS with AntiFam hits and pseudogenes. For each pseudogene, the report shows which tool predicted it.
+
 ### Contents of the tool output folders
 
 The output folders of each individual tool contain select output files of the third-party tools used by `mettannotator`. For file descriptions, please refer to the tool documentation. For some tools that don't output a GFF, `mettannotator` converts the output into a GFF.
 
 Note: if the pipeline completed without errors but some of the tool-specific output folders are empty, those particular tools did not generate any annotations to output.
 
+<a name="submission"></a>
+
+## Preparing annotations for ENA or GenBank submission
+
+`mettannotator` produces a final annotation file in GFF3 format. To submit the annotations to data archives, it is first necessary to convert the GFF3 file into the required format using available third-party tools. `mettannotator` outputs a specially formatted GFF3 file, named `<prefix>_submission.gff`, to be used with these converters.
+
+### ENA
+
+ENA accepts annotations in the EMBL flat-file format.
+Please use [EMBLmyGFF3](https://github.com/NBISweden/EMBLmyGFF3) to perform the conversion; the repository includes detailed instructions. The two files required for conversion are:
+
+- the genome FASTA file
+- `<mettannotator_results_folder>/<prefix>/functional_annotation/merged_gff/<prefix>_submission.gff`
+
+Please note that it is necessary to register the project and locus tags in ENA prior to conversion. Follow links in the [EMBLmyGFF3](https://github.com/NBISweden/EMBLmyGFF3) repository for more details.
+
+### GenBank
+
+To convert annotations for GenBank submission, please use [table2asn](https://www.ncbi.nlm.nih.gov/genbank/table2asn/).
+Three files are required:
+
+- the genome FASTA file
+- `<mettannotator_results_folder>/<prefix>/functional_annotation/merged_gff/<prefix>_submission.gff`
+- Submission template file (can be generated [here](https://submit.ncbi.nlm.nih.gov/genbank/template/submission/))
+
+More instructions on running `table2asn` are available via [GenBank](https://www.ncbi.nlm.nih.gov/genbank/genomes_gff/).
+
 <a name="mobilome"></a>
 
 ## Mobilome annotation

diff --git a/bin/__init__.py b/bin/__init__.py