bin/create_table.py

#!/usr/bin/env python3

import argparse
import os
import json
import sys
import csv
import re
import math


def parse_args(args=None):
    Description = "Create a table by parsing json output to extract N50, BUSCO, QV and COMPLETENESS stats."

    parser = argparse.ArgumentParser(description=Description)
    parser.add_argument("--genome", required=True, help="Input NCBI genome summary JSON file.")
    parser.add_argument("--sequence", required=True, help="Input NCBI sequence summary JSON file.")
    parser.add_argument("--busco", help="Input BUSCO short summary JSON file.")
    parser.add_argument("--qv", nargs="*", help="Input QV TSV file from MERQURYFK.")
    parser.add_argument("--completeness", nargs="*", help="Input COMPLETENESS stats TSV file from MERQURYFK.")
    parser.add_argument("--hic", action="append", help="HiC sample ID used for contact maps.")
    parser.add_argument("--flagstat", action="append", help="HiC flagstat file created by Samtools.")
    parser.add_argument("--outcsv", required=True, help="Output CSV file.")
    parser.add_argument("--version", action="version", version="%(prog)s 3.1")
    return parser.parse_args(args)


def make_dir(path):
    """
    Creates a directory if it doesn't exist.

    Parameters:
    path (str): Path of the directory to be created.
    """
    if len(path) > 0:
        os.makedirs(path, exist_ok=True)


# check_samplesheet.py adds a suffix like "_T1", "_T2", etc, to sample names
# We usually don't want it in the final output
def remove_sample_T_suffix(name):
    """
    Removes the suffix like "_T1", "_T2", etc., from sample names.

    Parameters:
    name (str): Sample name to be processed.

    Returns:
    str: Sample name with the suffix removed.
    """
    return re.sub(r"_T\d+", "", name)


def ncbi_stats(genome_in, seq_in, writer):
    """
    Extracts and writes assembly information and statistics from genome and
    sequence JSON files to a CSV file.

    Parameters:
    genome_in (str): Path to the NCBI genome summary JSON file.
    seq_in (str): Path to the NCBI sequence summary JSON file.
    writer (csv.writer): CSV writer object to write the extracted data.
    """
    with open(genome_in, "r") as fin1:
        data = json.load(fin1)
    data = data.get("reports", [{}])[0]

    with open(seq_in, "r") as fin2:
        seq = json.load(fin2).get("reports", [])

    info = data.get("assembly_info", {})
    attr = info.get("biosample", {}).get("attributes", [])
    stats = data.get("assembly_stats", {})
    organism = data.get("organism", {})

    # Write assembly information
    writer.writerow(["##Assembly_Information"])
    writer.writerow(["Accession", data.get("accession", math.nan)])
    writer.writerow(["Common_Name", organism.get("common_name", math.nan)])
    writer.writerow(["Organism_Name", organism.get("organism_name", math.nan)])
    tol_id = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "tolid")
    writer.writerow(["ToL_ID", tol_id if tol_id else math.nan])
    writer.writerow(["Taxon_ID", organism.get("tax_id", math.nan)])
    writer.writerow(["Assembly_Name", info.get("assembly_name", math.nan)])
    writer.writerow(["Assembly_Level", info.get("assembly_level", math.nan)])
    life_stage = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "life_stage")
    writer.writerow(["Life_Stage", life_stage if life_stage else math.nan])
    tissue = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "tissue")
    writer.writerow(["Tissue", tissue if tissue else math.nan])
    sex = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "sex")
    writer.writerow(["Sex", sex if sex else math.nan])

    # Write assembly statistics
    writer.writerow(["##Assembly_Statistics"])
    writer.writerow(["Total_Sequence", stats.get("total_sequence_length", math.nan)])
    writer.writerow(["Chromosomes", stats.get("total_number_of_chromosomes", math.nan)])
    writer.writerow(["Scaffolds", stats.get("number_of_scaffolds", math.nan)])
    writer.writerow(["Scaffold_N50", stats.get("scaffold_n50", math.nan)])
    writer.writerow(["Contigs", stats.get("number_of_contigs", math.nan)])
    writer.writerow(["Contig_N50", stats.get("contig_n50", math.nan)])
    writer.writerow(["GC_Percent", stats.get("gc_percent", math.nan)])

    chromosome_header = False
    for mol in seq:
        if mol.get("gc_percent") is not None and mol.get("assembly_unit") != "non-nuclear":
            if not chromosome_header:
                writer.writerow(["##Chromosome", "Length", "GC_Percent", "Accession"])
                chromosome_header = True
            writer.writerow(
                [
                    mol.get("chr_name", math.nan),
                    "%.2f" % (mol.get("length", 0) / 1000000) if mol.get("length") is not None else math.nan,
                    mol.get("gc_percent", math.nan),
                    mol.get("genbank_accession"),
                ]
            )

    organelle_header = False
    for mol in seq:
        if mol.get("gc_percent") is not None and mol.get("assembly_unit") == "non-nuclear":
            if not organelle_header:
                writer.writerow(["##Organelle", "Length", "GC_Percent", "Accession"])
                organelle_header = True
            writer.writerow(
                [
                    mol.get("assigned_molecule_location_type", math.nan),
                    "%.2f" % (mol.get("length", 0) / 1000) if mol.get("length") is not None else math.nan,
                    mol.get("gc_percent", math.nan),
                    mol.get("genbank_accession"),
                ]
            )


def extract_busco(file_in, writer):
    """
    Extracts BUSCO information from a JSON file and writes it to a CSV file.

    Parameters:
    file_in (str): Path to the BUSCO summary JSON file.
    writer (csv.writer): CSV writer object to write the extracted data.
    """
    with open(file_in, "r") as fin:
        data = json.load(fin)

    lineage_dataset_name = data.get("lineage_dataset", {}).get("name", math.nan)
    results_summary = data.get("results", {}).get("one_line_summary", math.nan)
    results_complete = data.get("results", {}).get("Complete", math.nan)
    results_single_copy = data.get("results", {}).get("Single copy", math.nan)
    results_duplicated = data.get("results", {}).get("Multi copy", math.nan)
    results_n_markers = data.get("results", {}).get("n_markers", math.nan)

    writer.writerow(["##BUSCO", lineage_dataset_name])
    writer.writerow(["Summary", results_summary])
    writer.writerow(["Complete", results_complete])
    writer.writerow(["Single", results_single_copy])
    writer.writerow(["Duplicated", results_duplicated])
    writer.writerow(["Number_Orthologs", f"{results_n_markers:,}"])


def extract_pacbio(qv, completeness, writer):
    """
    Extracts QV and completeness information from TSV files and writes it to a
    CSV file.

    NOTE: completeness and qv files have to be from matching specimen names

    Parameters:
    qv (list): List of paths to one or more QV TSV files.
    completeness (list): List of paths to completeness stats TSV files.
    writer (csv.writer): CSV writer object to write the extracted data.
    """
    qval = 0
    qv_name = None
    for f in qv:
        with open(f, "r") as fin:
            data = csv.DictReader(fin, delimiter="\t")
            for row in data:
                if float(row["QV"]) > qval:
                    qval = float(row["QV"])
                    qv_name = remove_sample_T_suffix(os.path.basename(f).removesuffix(".qv"))
    assert qv_name is not None, "No QV values found in %s" % qv

    # The completeness has to be from the same specimen as the QV value
    matching_completeness_files = []
    for h in completeness:
        comp_name = remove_sample_T_suffix(os.path.basename(h).removesuffix(".completeness.stats"))
        if comp_name == qv_name:
            matching_completeness_files.append(h)
    assert matching_completeness_files, "No completeness files (%s) match for %s" % (completeness, qv_name)

    comp = None
    for h in matching_completeness_files:
        with open(h, "r") as fin:
            data = csv.DictReader(fin, delimiter="\t")
            for row in data:
                comp = float(row["% Covered"])
    assert comp is not None, "No completeness values found in %s" % matching_completeness_files

    writer.writerow(["##MerquryFK", qv_name])
    writer.writerow(["QV", qval])
    writer.writerow(["Completeness", comp])


def extract_mapped(sample, file_in, writer):
    """
    Extracts mapping information from a flagstat file and writes it to a CSV
    file.

    Parameters:
    sample (str): Sample ID used for the HiC contact maps.
    file_in (str): Path to the HiC flagstat file created by Samtools.
    writer (csv.writer): CSV writer object to write the extracted data.
    """
    writer.writerow(["##HiC", remove_sample_T_suffix(sample)])
    with open(file_in, "r") as fin:
        for line in fin:
            if "primary mapped" in line:
                writer.writerow(["Primary_Mapped", re.search(r"\((.*?) :", line).group(1)])


def main(args=None):
    args = parse_args(args)

    out_dir = os.path.dirname(args.outcsv)
    make_dir(out_dir)

    with open(args.outcsv, "w") as fout:
        writer = csv.writer(fout)
        ncbi_stats(args.genome, args.sequence, writer)
        if args.busco is not None:
            extract_busco(args.busco, writer)
        if args.qv and args.completeness is not None:
            extract_pacbio(args.qv, args.completeness, writer)
        if args.hic is not None:
            for hic, flagstat in zip(args.hic, args.flagstat):
                extract_mapped(hic, flagstat, writer)


if __name__ == "__main__":
    sys.exit(main())