new command 'make label': assigns a label to the query draft assembly based on the best hits from COBS #263

Open · wants to merge 8 commits into base: main
2 changes: 2 additions & 0 deletions .gitignore
@@ -8,4 +8,6 @@ intermediate/*/*.fa
intermediate/*/*.sam
intermediate/*/*.txt
intermediate/*/*.gz
intermediate/*/labels/*
.vscode/
*.tmp
8 changes: 7 additions & 1 deletion Makefile
@@ -30,9 +30,10 @@ DOWNLOAD_PARAMS=--cores $(MAX_DOWNLOAD_THREADS) -j $(MAX_DOWNLOAD_THREADS) --res
all: ## Run everything (the default rule)
make download
make match
make label
make map

DIFF_CMD=diff -q <(gunzip --stdout output/reads_1___reads_2___reads_3___reads_4.sam_summary.gz | cut -f -3) <(xzcat data/reads_1___reads_2___reads_3___reads_4.sam_summary.xz | cut -f -3)
DIFF_CMD=diff -q <(gunzip --stdout output/all_queries.sam_summary.gz | cut -f -3) <(xzcat data/all_queries.sam_summary.xz | cut -f -3)

test: ## Quick test using 3 batches
snakemake download $(SMK_PARAMS) $(DOWNLOAD_PARAMS) --config batches=data/batches_small.txt # download is not benchmarked
@@ -49,6 +50,8 @@ test: ## Quick test using 3 batches
exit 1;\
fi

# echo gunzip --stdout output/all_queries.sam_summary.gz | cut -f -3 | less;

help: ## Print help messages
@echo -e "$$(grep -hE '^\S*(:.*)?##' $(MAKEFILE_LIST) \
| sed \
@@ -60,6 +63,7 @@ help: ## Print help messages
| column -c2 -t -s : )"

clean: ## Clean intermediate search files
rm -rfv intermediate/04_filter/labels
rm -fv intermediate/*/*
rm -rfv logs
rm -fv output/*
@@ -91,6 +95,8 @@ match: ## Match queries using COBS (queries -> candidates)
map: ## Map candidates to assemblies (candidates -> alignments)
scripts/benchmark.py --log logs/benchmarks/map_$(DATETIME).txt "snakemake map $(SMK_PARAMS)"

label: ## Label query assemblies using the labels of the assemblies retrieved by querying the COBS indexes
scripts/benchmark.py --log logs/benchmarks/label_$(DATETIME).txt "snakemake label $(SMK_PARAMS)"

###############
## Reporting ##
###############
15 changes: 14 additions & 1 deletion README.md
@@ -218,6 +218,17 @@ Simply run `make`, which will execute Snakemake with the corresponding
parameters. If you want to run the pipeline step by step, run `make match`
followed by `make map`.


> **Optional**
> Replace `make match` with `make label` (running `make label` right after `make match` also works).
> This uses the outputs of `make match` to generate the labels (at the species taxonomic level) of the matches.
> The labels are saved in `intermediate/04_filter/labels`, where you will find:
> * CSV: each row starts with the query ID, followed by the labels of the
>   filtered hits in `intermediate/04_filter/{name}.fa`, in the same order.
> * JSON: keys are the filenames in `input/` (without extension); each maps to
>   the list of IDs of its sequences.
> * TXT: the consensus label for each input file (majority vote among the
>   labels of its sequences); see the reading sketch below.
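
A minimal sketch of how these three files could be read back, assuming the default `all_queries` prefix used by the pipeline (the exact layout of the TXT file is an assumption: one consensus label per input file):

```python
# Illustrative sketch only; file names follow the patterns produced by the
# label_assemblies rule with the default "all_queries" prefix.
import csv
import json

labels_dir = "intermediate/04_filter/labels"

# CSV: first field is the query ID, remaining fields are the labels of the
# filtered hits in intermediate/04_filter/{name}.fa, in the same order.
with open(f"{labels_dir}/labels_best_hits____all_queries.csv") as f:
    for row in csv.reader(f):
        query_id, labels = row[0], row[1:]
        print(query_id, len(labels), "labels")

# JSON: input filename (without extension) -> list of its sequence IDs.
with open(f"{labels_dir}/queryid_by_input____all_queries.json") as f:
    queryid_by_input = json.load(f)

# TXT: consensus label per input file (majority vote); layout assumed here.
with open(f"{labels_dir}/consensus_label____all_queries.txt") as f:
    print(f.read())
```
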
### 4e) Step 5: Analyze your results

Check the output files in `output/` (for more info about formats, see
@@ -256,6 +267,7 @@ Here's a list of all implemented commands (to be executed as `make {command}`):
download_asms Download only the assemblies
download_cobs Download only the COBS indexes
match Match queries using COBS (queries -> candidates)
label Label queries using labels from candidates (candidates -> labels) ** [new] **
map Map candidates to assemblies (candidates -> alignments)
#############
# Reporting #
@@ -289,6 +301,7 @@ Here's a list of all implemented commands (to be executed as `make {command}`):
the disk mode is used)
* `03_match/` COBS matches
* `04_filter/` Filtered candidates
* `labels/` Labels of the filtered candidates by query.
* `05_map/` Minimap2 alignments
* `logs/` Logs and benchmarks
* `output/` The resulting files (in a headerless SAM format)
@@ -299,7 +312,7 @@ Here's a list of all implemented commands (to be executed as `make {command}`):

**Input files:** FASTA or FASTQ files possibly compressed by gzipped. The files
are searched in the `input/` directory, as files with the following suffixes:
`.fa`, `.fasta`, `.fq`, `.fastq` (possibly with `.gz` at the end).
`.fa`, `.fasta`, `.fna`, `.fq`, `.fastq` (possibly with `.gz` at the end).

**Output files:**

46 changes: 40 additions & 6 deletions Snakefile
@@ -10,8 +10,7 @@ import re
##################################


extensions = ["fa", "fasta", "fq", "fastq"]

extensions = ["fa", "fasta", "fna", "fq", "fastq"]

def multiglob(patterns):
files = []
@@ -35,7 +34,7 @@ def get_batches():


def get_filename_for_all_queries():
return "___".join(get_all_query_filenames())
return "all_queries" #"___".join(get_all_query_filenames())


def get_index_metadata(wildcards, input):
@@ -176,8 +175,7 @@ elif index_load_mode == "mmap-disk":


wildcard_constraints:
batch=".+__\d\d",

batch=".+__\d\d"

if keep_cobs_indexes:

@@ -247,7 +245,15 @@ rule match:
"""
input:
f"intermediate/04_filter/{get_filename_for_all_queries()}.fa",
# "intermediate/04_filter/all_queries.fa"

rule label:
"""Label assemblies using labels from assemblies retrieved by querying COBS indexes
"""
input:
f"intermediate/04_filter/labels/labels_best_hits____{get_filename_for_all_queries()}.csv",
f"intermediate/04_filter/labels/queryid_by_input____{get_filename_for_all_queries()}.json",
f"intermediate/04_filter/labels/consensus_label____{get_filename_for_all_queries()}.txt",

rule map:
"""Map reads to the assemblies.
@@ -387,9 +393,11 @@ rule run_cobs:
"""
output:
match="intermediate/03_match/{batch}____{qfile}.gz",
# match="intermediate/03_match/{batch}____all_queries.gz",
input:
cobs_index=f"{decompression_dir}/{{batch}}.cobs_classic",
fa="intermediate/01_queries_merged/{qfile}.fa",
# fa="intermediate/01_queries_merged/all_queries.fa",
decompressed_indexes_sizes="data/decompressed_indexes_sizes.txt",
resources:
max_io_heavy_threads=int(cobs_is_an_IO_heavy_job),
@@ -431,6 +439,7 @@ rule decompress_and_run_cobs:
input:
compressed_cobs_index=f"{cobs_dir}/{{batch}}.cobs_classic.xz",
fa="intermediate/01_queries_merged/{qfile}.fa",
# fa="intermediate/01_queries_merged/all_queries.fa",
decompressed_indexes_sizes="data/decompressed_indexes_sizes.txt",
resources:
max_io_heavy_threads=int(cobs_is_an_IO_heavy_job),
@@ -579,7 +588,8 @@ rule final_stats:
stats="output/{qfile}.sam_summary.stats",
input:
pseudosam="output/{qfile}.sam_summary.gz",
concatenated_query=f"intermediate/01_queries_merged/{get_filename_for_all_queries()}.fa",
concatenated_query="intermediate/01_queries_merged/{qfile}.fa",
# concatenated_query=f"intermediate/01_queries_merged/all_queries.fa",
conda:
"envs/minimap2.yaml"
threads: 1
@@ -591,3 +601,27 @@ rule final_stats:
'./scripts/final_stats.py {input.concatenated_query} {input.pseudosam} \\
> {output.stats}'
"""


##################################
## [New] Processing rules
##################################

rule label_assemblies:
output:
labels_best_hits="intermediate/04_filter/labels/labels_best_hits____{qfile}.csv",
queryid_by_input="intermediate/04_filter/labels/queryid_by_input____{qfile}.json",
consensus_label="intermediate/04_filter/labels/consensus_label____{qfile}.txt",
input:
path_labels="data/labels_krakenbracken_by_sampleid.txt",
path_hits="intermediate/04_filter/{qfile}.fa",
params:
outdir="intermediate/04_filter/labels",
path_preprocessed="intermediate/00_queries_preprocessed",
shell:
"""
./scripts/benchmark.py --log logs/benchmarks/label/label_assemblies___{wildcards.qfile}.txt \\
'./scripts/labels_from_filter.py --path-labels {input.path_labels} \\
--path-hits {input.path_hits} --path-preprocessed {params.path_preprocessed} \\
--outdir {params.outdir}'
"""
4 changes: 2 additions & 2 deletions config.yaml
@@ -4,7 +4,7 @@
##################################################
# general
# batches to consider during search
batches: "data/batches_full.txt"
batches: "data/batches_small.txt"
##################################################

##################################################
@@ -32,7 +32,7 @@ nb_best_hits: 100
# asm5/asm10/asm20: asm-to-ref mapping, for ~0.1/1/5% sequence divergence
# splice: long-read spliced alignment
# sr: genomic short-read mapping
minimap_preset: "sr"
minimap_preset: "sr" #"asm20"

# other minimap2 params
minimap_extra_params: "--eqx"
Binary file added data/all_queries.sam_summary.xz
Binary file not shown.