Extend annotations #1

Closed
wants to merge 106 commits into from
Changes from 54 commits
106 commits
33341ac
Added pre-processing script for UniRule
tgurbich Nov 16, 2023
123be4b
Added species names to UniRule prep script
tgurbich Nov 17, 2023
4ca71fb
Added GECCO
tgurbich Nov 17, 2023
fc88a23
Added accounting for empty GECCO results
tgurbich Nov 20, 2023
83dbf96
Removed old code
tgurbich Nov 20, 2023
4f153c6
Fixed typo
tgurbich Nov 20, 2023
05ab91d
Edited syntax
tgurbich Nov 20, 2023
43778d7
Edited syntax
tgurbich Nov 20, 2023
121d831
Bug fixes
tgurbich Nov 20, 2023
089762c
Added Defense Finder
tgurbich Nov 20, 2023
d72f870
Bug fixes
tgurbich Nov 20, 2023
37b2cba
Added UniFire
tgurbich Nov 20, 2023
590a513
Bug fix
tgurbich Nov 20, 2023
2371b8c
A few fixes to the new tools
mberacochea Nov 21, 2023
6a92fb6
Fix GECCO invocation and some typos
mberacochea Nov 21, 2023
14ae4ee
Fixed GECCO, UniRULE and MultiQC
mberacochea Nov 22, 2023
af6ff56
Added UniFire memory requirements
tgurbich Nov 22, 2023
07b5f86
Fixed typo
tgurbich Nov 22, 2023
4446eeb
Add tower.yml
mberacochea Nov 22, 2023
ddafcc5
Merge branch 'extend-annotations' of github.com:EBI-Metagenomics/mett…
mberacochea Nov 22, 2023
dd81bce
Patch rRNA detection step with tmp folder fix.
mberacochea Nov 22, 2023
b3e5d4f
Rename tower.yml
mberacochea Nov 22, 2023
8026915
Added UniRule post-processing script - WIP
tgurbich Nov 23, 2023
4bf4054
Combined parsing functions
tgurbich Nov 23, 2023
a147b65
Add antiSMASH 7.1
mberacochea Nov 24, 2023
8da6072
Merge branch 'extend-annotations' of github.com:EBI-Metagenomics/mett…
mberacochea Nov 24, 2023
0c77757
Adjust antiSMASH outdir results structure
mberacochea Nov 27, 2023
ddb8903
Adjust the outdir results structure
mberacochea Nov 27, 2023
38f08f6
Typo on process selector
mberacochea Nov 27, 2023
2fa0916
Added dbcan
tgurbich Nov 29, 2023
9fb11ba
Merge branch 'extend-annotations' of https://github.com/EBI-Metagenom…
tgurbich Nov 29, 2023
f0f0f56
Bug fix
tgurbich Nov 29, 2023
b606043
Bug fix
tgurbich Nov 29, 2023
f92bb13
Input format change
tgurbich Nov 29, 2023
ac6719b
Fixed typos and removed testing edits
tgurbich Nov 30, 2023
7b0aca6
Bug fix and typo fixes
tgurbich Nov 30, 2023
8ce9bc6
Added GO-terms to the annotate_gff script
tgurbich Nov 30, 2023
8f05c94
Removed reporting of empty IPS
tgurbich Nov 30, 2023
7b2df5e
Removed empty values from GFF
tgurbich Nov 30, 2023
d679dbe
Bug fix in annotate_gff
tgurbich Nov 30, 2023
566de3d
Added dbcan post-processing
tgurbich Dec 1, 2023
efba874
dbCAN GFF format edits
tgurbich Dec 1, 2023
b541105
Tweak the results folder structure.
mberacochea Dec 1, 2023
2180c9f
Annotate GFF versions fix
mberacochea Dec 1, 2023
b323010
Format edits for dbcan processing script
tgurbich Dec 1, 2023
725665b
Added unifire post-processing
tgurbich Dec 1, 2023
7ec709b
WIP
tgurbich Dec 1, 2023
ce7a245
WIP
tgurbich Dec 1, 2023
3b5f16a
Added join to arguments
tgurbich Dec 1, 2023
af84985
WIP
tgurbich Dec 1, 2023
18c7333
WIP
tgurbich Dec 1, 2023
f484ef1
Bug fix
tgurbich Dec 5, 2023
6387305
Bug fix
tgurbich Dec 5, 2023
0825e45
GFF format changes
tgurbich Dec 6, 2023
3033c5e
Added defense finder post-processing script
tgurbich Dec 7, 2023
fd053c6
Added DefenseFinder post-processing to nf
tgurbich Dec 7, 2023
1b0c9ca
Added duplicate resolution script - WIP
tgurbich Dec 7, 2023
c3c70ee
Format edit
tgurbich Dec 7, 2023
2b7389a
Changed dbcan output name format
tgurbich Dec 7, 2023
5024ed0
Added AMRFinderPlus processing script
tgurbich Dec 7, 2023
db27d87
Added AMR post-processing to the AMR module
tgurbich Dec 7, 2023
86b6021
Fixed unifire post-processing script
tgurbich Dec 7, 2023
3b11069
Temporarily removed amrfinder post-processing
tgurbich Dec 7, 2023
7e22e4b
Fixed typo
tgurbich Dec 7, 2023
8a4726c
Fixed typo
tgurbich Dec 7, 2023
4037c7d
Removed post-processing
tgurbich Dec 7, 2023
699e42e
Add an antismash to gff script
mberacochea Dec 7, 2023
150b35e
Move PROKKA results to the functional annotation folder.
mberacochea Dec 7, 2023
3af5a9f
Move sanntis to the bgc results folder
mberacochea Dec 7, 2023
0d721f3
Changed antismash GFF name
tgurbich Dec 8, 2023
6f036db
Text edits
tgurbich Dec 8, 2023
9d4bde2
Format edits
tgurbich Dec 8, 2023
18549fe
Merge pull request #2 from EBI-Metagenomics/feature/antismash-gff
tgurbich Dec 8, 2023
c74a36c
Bug fix
tgurbich Dec 8, 2023
12e7f86
Added an extra GFF processing script to the annotate_gff process
mberacochea Dec 8, 2023
e039cf2
Fixed the schema
mberacochea Dec 8, 2023
93a0f72
Fix nf-core pipeline linting issues
mberacochea Dec 8, 2023
d4956be
convert_cds_into_multiple_lines correction
mberacochea Dec 8, 2023
d9c49de
CRISPR fix for correct visualisation
tgurbich Dec 8, 2023
e9d4b9e
Added protein component filter
tgurbich Dec 8, 2023
b94715f
Removed uniprot fields that are not in the existing dictionary
tgurbich Dec 8, 2023
4fc6da8
Add an AMRFinderPlus GFF generation step
mberacochea Dec 8, 2023
a513267
Fixed crispr naming
tgurbich Dec 8, 2023
b891c33
Typo in convert_cds script. Added tag to IPS
mberacochea Dec 8, 2023
9d90096
Typos in amrfinder post processing
mberacochea Dec 8, 2023
836227d
AMRFINDER_PLUS_TO_GFF add required -v param
mberacochea Dec 8, 2023
6aa4bcc
Removed parent field from flank sequence in CRISPRCas results
tgurbich Dec 8, 2023
1b2b761
Tweak DBCan outputfolder
mberacochea Dec 8, 2023
bd873c2
Remove the structure annotation folder.
mberacochea Dec 8, 2023
abd9e22
Merge pull request #3 from EBI-Metagenomics/feature/gff_post_processi…
tgurbich Dec 8, 2023
d5526d3
Mobilome merger script added
Ales-ibt Dec 8, 2023
dad4d02
Finished GECCO addition to annotate_gff script
tgurbich Dec 8, 2023
f371d33
Resolved
tgurbich Dec 8, 2023
bbb596c
Added antismash to annotate_gff
tgurbich Dec 8, 2023
1c18835
Removed extra spaces from dbcan protein fams
tgurbich Dec 8, 2023
7b76e63
Added dbCAN to the annotate_gff script
tgurbich Dec 8, 2023
c1f98b9
Added defense finder to annotate_gff
tgurbich Dec 8, 2023
62c3f8a
Added arguments to annotate_gff
tgurbich Dec 11, 2023
574dd1e
Added combined unifire output
tgurbich Dec 11, 2023
a960896
Format
tgurbich Dec 11, 2023
d634e08
Removed gff line splitting
tgurbich Dec 11, 2023
ae7e2fa
Removed mmseqs from the duplicate resolution script
tgurbich Jan 15, 2024
bfd2174
Fixed integer checked in duplicate gene names
tgurbich Jan 15, 2024
d3bc930
Moved duplicate loading into a separate function
tgurbich Jan 15, 2024
4813b3f
Refactored the duplicate resolution script
tgurbich Jan 16, 2024
c15f62a
WIP
tgurbich Jan 19, 2024
13 changes: 13 additions & 0 deletions assets/multiqc_config.yml
@@ -9,11 +9,24 @@ report_section_order:
  "ebi-metagenomics-mettannotator-summary":
    order: -1002

run_modules:
  - quast
  - prokka
  - custom_content

top_modules:
  - quast
  - prokka

prokka_table: true
prokka_fn_snames: true

sp:
  quast_config:
    fn: "*.tsv"

export_plots: true

## Prettification
custom_logo_url: https://github.com/ebi-metagenomics/mettannotator
custom_logo_title: "ebi-metagenomics/mettannotator"
33 changes: 20 additions & 13 deletions bin/annotate_gff.py
@@ -32,7 +32,8 @@ def get_iprs(ipr_annot):
                 iprs[protein][0].add(pfam)
             if len(cols) > 12:
                 ipr = cols[11]
-                iprs[protein][1].add(ipr)
+                if not ipr == "-":
+                    iprs[protein][1].add(ipr)
     return iprs


@@ -49,22 +50,27 @@ def get_eggnog(eggnog_annot):
             eggnog = [cols[1]]
             try:
                 cog = cols[eggnog_fields["cog_func"]]
-                cog = cog.split()
+                cog = list(cog)
                 if len(cog) > 1:
                     cog = ["R"]
             except Exception:
                 cog = ["NA"]
             kegg = cols[eggnog_fields["KEGG_ko"]].split(",")
-            eggnogs[protein] = [eggnog, cog, kegg]
+            go = cols[eggnog_fields["GOs"]]
+            eggnogs[protein] = [eggnog, cog, kegg, go]
     return eggnogs


 def get_eggnog_fields(line):
     cols = line.strip().split("\t")
+    try:
+        index_of_go = cols.index("GOs")
+    except ValueError:
+        sys.exit("Cannot find the GO terms column.")
     if cols[8] == "KEGG_ko" and cols[15] == "CAZy":
-        eggnog_fields = {"KEGG_ko": 8, "cog_func": 20}
+        eggnog_fields = {"KEGG_ko": 8, "cog_func": 20, "GOs": index_of_go}
     elif cols[11] == "KEGG_ko" and cols[18] == "CAZy":
-        eggnog_fields = {"KEGG_ko": 11, "cog_func": 6}
+        eggnog_fields = {"KEGG_ko": 11, "cog_func": 6, "GOs": index_of_go}
     else:
         sys.exit("Cannot parse eggNOG - unexpected field order or naming")
     return eggnog_fields
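The `cog.split()` → `list(cog)` change above matters for proteins assigned more than one COG category: eggNOG reports them as a single string such as "EG", which `str.split()` leaves as one element, while `list()` separates the letters so the multi-category case can be collapsed. A minimal sketch of that logic (`collapse_cog` is a hypothetical helper, not part of the script):

```python
def collapse_cog(cog_str):
    # Split a COG category string into individual letters;
    # str.split() on "EG" would give ["EG"], hiding the multi-category case.
    cog = list(cog_str)  # "EG" -> ["E", "G"]
    if len(cog) > 1:
        # Multiple categories collapse to "R" (general function prediction only)
        cog = ["R"]
    return cog

print(collapse_cog("E"), collapse_cog("EG"))  # ['E'] ['R']
```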
@@ -195,9 +201,7 @@ def get_amr(amr_file):


 def add_gff(in_gff, eggnog_file, ipr_file, sanntis_file, amr_file):
-    eggnogs = {}
-    if eggnog_file:
-        eggnogs = get_eggnog(eggnog_file)
+    eggnogs = get_eggnog(eggnog_file)
     iprs = get_iprs(ipr_file)
     sanntis_bgcs = get_sanntis(sanntis_file, in_gff)
     amr_annotations = {}
@@ -226,6 +230,8 @@ def add_gff(in_gff, eggnog_file, ipr_file, sanntis_file, amr_file):
                         added_annot[protein]["COG"] = a
                     elif pos == 3:
                         added_annot[protein]["KEGG"] = a
+                    elif pos == 4:
+                        added_annot[protein]["ontology_term"] = a
             except Exception:
                 pass
             try:
@@ -259,7 +265,8 @@ def add_gff(in_gff, eggnog_file, ipr_file, sanntis_file, amr_file):
                 if a == "AMR":
                     cols[8] = "{};{}".format(cols[8], value)
                 else:
-                    cols[8] = "{};{}={}".format(cols[8], a, value)
+                    if not value == "-":
+                        cols[8] = "{};{}={}".format(cols[8], a, value)
         line = "\t".join(cols)
         out_gff.append(line)
     return out_gff
@@ -380,13 +387,13 @@ def add_ncrnas_and_crispr_to_gff(gff_outfile, ncrnas, crispr_annotations, res):
     parser.add_argument(
         "-i",
         dest="ips",
-        help="InterproScan annontations results for the cluster rep",
+        help="InterproScan annotations results for the cluster rep",
         required=True,
     )
     parser.add_argument(
         "-e",
-        dest="eggnong",
-        help="eggnog annontations for the clutser repo",
+        dest="eggnog",
+        help="eggnog annotations for the cluster repo",
         required=False,
     )
     parser.add_argument(
@@ -416,7 +423,7 @@ def add_ncrnas_and_crispr_to_gff(gff_outfile, ncrnas, crispr_annotations, res):

     extended_gff = add_gff(
         in_gff=gff,
-        eggnog_file=args.eggnong,
+        eggnog_file=args.eggnog,
         ipr_file=args.ips,
         sanntis_file=args.sanntis,
         amr_file=args.amr,
99 changes: 99 additions & 0 deletions bin/prepare_unirule_input.py
@@ -0,0 +1,99 @@
#!/usr/bin/env python3

import argparse
import logging
import os
import sys

logging.basicConfig(level=logging.INFO)


def main(infile, outdir):
    taxid = assign_taxid(infile)
    check_dir(outdir)
    outfile = "proteins.fasta"
    outpath = os.path.join(outdir, outfile)
    with open(outpath, "w") as file_out, open(infile, "r") as file_in:
        for line in file_in:
            if line.startswith(">"):
                formatted_line = reformat_line(line, taxid)
                file_out.write(formatted_line)
            else:
                file_out.write(line)


def check_dir(directory_path):
    if not os.path.exists(directory_path):
        try:
            os.makedirs(directory_path)
        except OSError as e:
            logging.error(f"Error: Failed to create directory '{directory_path}'. {e}")


def reformat_line(line, taxid):
    line = line.lstrip(">").strip()
    id, description = line.split(maxsplit=1)
    if taxid == "820":
        sp_name = "Bacteroides uniformis"
    elif taxid == "821":
        sp_name = "Phocaeicola vulgatus"
    elif taxid == "46503":
        sp_name = "Parabacteroides merdae"
    else:
        raise ValueError("Unknown species")
    formatted_line = ">tr|{id}|{description} OS={sp_name} OX={taxid}\n".format(
        id=id, description=description, sp_name=sp_name, taxid=taxid
    )
    return formatted_line


def assign_taxid(infile):
    try:
        with open(infile, "r") as file:
            # Read the first line
            first_line = file.readline().strip()
            species_code = first_line[1:3]

            # Assign taxid based on species code
            if species_code == "BU":
                taxid = "820"
            elif species_code == "PV":
                taxid = "821"
            elif species_code == "PM":
                taxid = "46503"
            else:
                raise ValueError("Unknown species")
            return taxid
    except Exception as e:
        logging.error(f"Error: {e}")
        sys.exit(1)


def parse_args():
    parser = argparse.ArgumentParser(
        description=(
            "The script reformats the fasta faa file to prepare it for UniRule."
        )
    )
    parser.add_argument(
        "-i",
        dest="infile",
        required=True,
        help="Input protein fasta file.",
    )
    parser.add_argument(
        "-o",
        dest="outdir",
        required=True,
        help="Path to the folder where the output will be saved to.",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    main(
        args.infile,
        args.outdir,
    )
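To illustrate what the script above does to FASTA headers, here is a trimmed-down sketch of `reformat_line` with the species lookup passed in as arguments (the example header is made up):

```python
def reformat_header(line, taxid, sp_name):
    # Rewrite ">ID description" into a UniProt-style header, as reformat_line does,
    # adding OS (organism name) and OX (taxon id) fields expected by UniRule/UniFire.
    line = line.lstrip(">").strip()
    seq_id, description = line.split(maxsplit=1)
    return ">tr|{}|{} OS={} OX={}".format(seq_id, description, sp_name, taxid)

print(reformat_header(">BU_00001 hypothetical protein", "820", "Bacteroides uniformis"))
# >tr|BU_00001|hypothetical protein OS=Bacteroides uniformis OX=820
```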
124 changes: 124 additions & 0 deletions bin/process_dbcan_result.py
@@ -0,0 +1,124 @@
#!/usr/bin/env python3

import argparse
import logging
import os
import sys

logging.basicConfig(level=logging.INFO)


def main(input_folder, outfile, dbcan_version):
    if not check_folder_completeness(input_folder):
        sys.exit("Missing dbCAN outputs. Exiting.")
    substrates = load_substrates(input_folder)
    cgc_locations = load_cgcs(input_folder)
    print_gff(input_folder, outfile, dbcan_version, substrates, cgc_locations)


def load_cgcs(input_folder):
    cgc_locations = dict()
    with open(os.path.join(input_folder, "cgc_standard.out")) as file_in:
        for line in file_in:
            if not line.startswith("CGC#"):
                cgc, _, contig, _, start, end, _, _ = line.strip().split("\t")
                if cgc in cgc_locations:
                    if cgc_locations[cgc]["start"] > int(start):
                        cgc_locations[cgc]["start"] = int(start)
                    if cgc_locations[cgc]["end"] < int(end):
                        cgc_locations[cgc]["end"] = int(end)
                else:
                    cgc_locations[cgc] = {
                        "start": int(start),
                        "end": int(end),
                        "contig": contig,
                    }
    return cgc_locations
def print_gff(input_folder, outfile, dbcan_version, substrates, cgc_locations):
Review comment (Member):
I think this method can be more pythonic:

import csv

def print_gff(input_folder, outfile, dbcan_version, substrates, cgc_locations):
    with open(outfile, "w") as f_out:
        writer = csv.writer(f_out, delimiter="\t")

        writer.writerow(["##gff-version 3"])

        printed_cgcs = []

        with open(os.path.join(input_folder, "cgc_standard.out")) as f_in:
            reader = csv.reader(f_in, delimiter="\t")
            for line in reader:
                if line[0].startswith("CGC#"):
                    continue

                cgc, gene_type, contig, prot_id, start, end, strand, protein_fam = line

                if cgc not in printed_cgcs:
                    substrate = substrates.get(cgc, "substrate_dbcan_pul=N/A;substrate_ecami=N/A")

                    writer.writerow([contig, f"dbCAN:{dbcan_version}", "predicted PUL",
                                     cgc_locations[cgc]["start"], cgc_locations[cgc]["end"],
                                     ".", ".", ".", f"ID={cgc};{substrate}"])
                    printed_cgcs.append(cgc)

                writer.writerow([contig, f"dbCAN:{dbcan_version}", gene_type, start, end,
                                 ".", strand, ".", f"ID={prot_id};Parent={cgc},protein_family={protein_fam}"])

this is not tested

    with open(outfile, "w") as file_out:
        file_out.write("##gff-version 3\n")
        cgcs_printed = list()
        with open(os.path.join(input_folder, "cgc_standard.out")) as file_in:
            for line in file_in:
                if not line.startswith("CGC#"):
                    cgc, gene_type, contig, prot_id, start, end, strand, protein_fam = line.strip().split("\t")
                    if cgc not in cgcs_printed:
                        substrate = substrates[cgc] if cgc in substrates else "substrate_dbcan_pul=N/A;substrate_ecami=N/A"
                        file_out.write("{}\tdbCAN:{}\tpredicted PUL\t{}\t{}\t.\t.\t.\tID={};{}\n".format(
                            contig, dbcan_version, cgc_locations[cgc]["start"], cgc_locations[cgc]["end"], cgc,
                            substrate))
                        cgcs_printed.append(cgc)
                    file_out.write("{}\tdbCAN:{}\t{}\t{}\t{}\t.\t{}\t.\tID={};Parent={},protein_family={}\n".format(
                        contig, dbcan_version, gene_type, start, end, strand, prot_id, cgc, protein_fam))


def load_substrates(input_folder):
    substrates = dict()
    with open(os.path.join(input_folder, "sub.prediction.out"), "r") as file_in:
        for line in file_in:
            if not line.startswith("#"):
                parts = line.strip().split("\t")
                cgc = parts[0].split("|")[1]
                try:
                    substrate_pul = parts[2]
                except IndexError:
                    substrate_pul = "N/A"
                try:
                    substrate_ecami = parts[5]
                except IndexError:
                    substrate_ecami = "N/A"
                if not substrate_pul:
                    substrate_pul = "N/A"
                if not substrate_ecami:
                    substrate_ecami = "N/A"
                substrates[cgc] = "substrate_dbcan_pul={};substrate_ecami={}".format(substrate_pul, substrate_ecami)
    return substrates


def check_folder_completeness(input_folder):
    status = True
    for file in ["cgc_standard.out", "overview.txt", "sub.prediction.out"]:
        if not os.path.exists(os.path.join(input_folder, file)):
            logging.error("File {} does not exist.".format(file))
            status = False
    return status


def parse_args():
    parser = argparse.ArgumentParser(
        description=(
            "The script takes dbCAN output and parses it to create a standalone GFF."
        )
    )
    parser.add_argument(
        "-i",
        dest="input_folder",
        required=True,
        help="Path to the folder with dbCAN results.",
    )
    parser.add_argument(
        "-o",
        dest="outfile",
        required=True,
        help="Path to the output file.",
    )
    parser.add_argument(
        "-v",
        dest="dbcan_ver",
        required=True,
        help="dbCAN version used.",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    main(
        args.input_folder,
        args.outfile,
        args.dbcan_ver,
    )
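The interval handling in `load_cgcs` above (widen the stored CGC boundaries whenever a new gene extends them) can be sketched compactly with `min`/`max`; this is an illustrative rewrite, not code from the PR:

```python
def merge_location(locations, cgc, contig, start, end):
    # Widen an existing CGC interval or create a new one,
    # matching the boundary logic of load_cgcs above.
    if cgc in locations:
        locations[cgc]["start"] = min(locations[cgc]["start"], start)
        locations[cgc]["end"] = max(locations[cgc]["end"], end)
    else:
        locations[cgc] = {"start": start, "end": end, "contig": contig}

locs = {}
merge_location(locs, "CGC1", "contig_1", 100, 500)
merge_location(locs, "CGC1", "contig_1", 50, 700)
print(locs["CGC1"])  # {'start': 50, 'end': 700, 'contig': 'contig_1'}
```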