new command 'make label': assigns a label to the query draft assembly based on the best hits from COBS #263

Open · wants to merge 8 commits into base: main
2 changes: 2 additions & 0 deletions .gitignore
@@ -8,4 +8,6 @@ intermediate/*/*.fa
intermediate/*/*.sam
intermediate/*/*.txt
intermediate/*/*.gz
intermediate/*/labels/*
.vscode/
*.tmp
8 changes: 7 additions & 1 deletion Makefile
@@ -30,9 +30,10 @@ DOWNLOAD_PARAMS=--cores $(MAX_DOWNLOAD_THREADS) -j $(MAX_DOWNLOAD_THREADS) --res
all: ## Run everything (the default rule)
make download
make match
make label
make map

DIFF_CMD=diff -q <(gunzip --stdout output/reads_1___reads_2___reads_3___reads_4.sam_summary.gz | cut -f -3) <(xzcat data/reads_1___reads_2___reads_3___reads_4.sam_summary.xz | cut -f -3)
DIFF_CMD=diff -q <(gunzip --stdout output/all_queries.sam_summary.gz | cut -f -3) <(xzcat data/all_queries.sam_summary.xz | cut -f -3)

test: ## Quick test using 3 batches
snakemake download $(SMK_PARAMS) $(DOWNLOAD_PARAMS) --config batches=data/batches_small.txt # download is not benchmarked
@@ -49,6 +50,8 @@ test: ## Quick test using 3 batches
exit 1;\
fi

# echo gunzip --stdout output/all_queries.sam_summary.gz | cut -f -3 | less;

help: ## Print help messages
@echo -e "$$(grep -hE '^\S*(:.*)?##' $(MAKEFILE_LIST) \
| sed \
@@ -60,6 +63,7 @@ help: ## Print help messages
| column -c2 -t -s : )"

clean: ## Clean intermediate search files
rm -rfv intermediate/04_filter/labels
rm -fv intermediate/*/*
rm -rfv logs
rm -fv output/*
@@ -91,6 +95,8 @@ match: ## Match queries using COBS (queries -> candidates)
map: ## Map candidates to assemblies (candidates -> alignments)
scripts/benchmark.py --log logs/benchmarks/map_$(DATETIME).txt "snakemake map $(SMK_PARAMS)"

label: ## Label query assemblies using the labels of the assemblies retrieved by querying the COBS indexes
scripts/benchmark.py --log logs/benchmarks/label_$(DATETIME).txt "snakemake label $(SMK_PARAMS)"

###############
## Reporting ##
###############
15 changes: 14 additions & 1 deletion README.md
@@ -218,6 +218,17 @@ Simply run `make`, which will execute Snakemake with the corresponding
parameters. If you want to run the pipeline step by step, run `make match`
followed by `make map`.


> **Optional**
> Replace `make match` with `make label` (running `make label` right after `make match` also works).
> This uses the outputs of `make match` to generate the labels (at the species taxonomic level) of the matches.
> The labels are saved in `intermediate/04_filter/labels`, where you will find:
> * CSV: each row starts with the query ID, followed by the labels of the
>   filtered hits in `intermediate/04_filter/{name}.fa`, in the same order.
> * JSON: keys are the filenames in `input/` (without extension); each maps to
>   the list of IDs of its sequences.
> * TXT: the consensus label for each input file (majority vote among the
>   labels of its sequences); see the reading sketch below.
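
A minimal sketch of how these three files could be read back, assuming the default `all_queries` prefix used by the pipeline (the exact layout of the TXT file is an assumption: one consensus label per input file):

```python
# Illustrative sketch only; file names follow the patterns produced by the
# label_assemblies rule with the default "all_queries" prefix.
import csv
import json

labels_dir = "intermediate/04_filter/labels"

# CSV: first field is the query ID, remaining fields are the labels of the
# filtered hits in intermediate/04_filter/{name}.fa, in the same order.
with open(f"{labels_dir}/labels_best_hits____all_queries.csv") as f:
    for row in csv.reader(f):
        query_id, labels = row[0], row[1:]
        print(query_id, len(labels), "labels")

# JSON: input filename (without extension) -> list of its sequence IDs.
with open(f"{labels_dir}/queryid_by_input____all_queries.json") as f:
    queryid_by_input = json.load(f)

# TXT: consensus label per input file (majority vote); layout assumed here.
with open(f"{labels_dir}/consensus_label____all_queries.txt") as f:
    print(f.read())
```
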
### 4e) Step 5: Analyze your results

Check the output files in `output/` (for more info about formats, see
@@ -256,6 +267,7 @@ Here's a list of all implemented commands (to be executed as `make {command}`):
download_asms Download only the assemblies
download_cobs Download only the COBS indexes
match Match queries using COBS (queries -> candidates)
label Label queries using labels from candidates (candidates -> labels) ** [new] **
map Map candidates to assemblies (candidates -> alignments)
#############
# Reporting #
@@ -289,6 +301,7 @@ Here's a list of all implemented commands (to be executed as `make {command}`):
the disk mode is used)
* `03_match/` COBS matches
* `04_filter/` Filtered candidates
* `labels/` Labels of the filtered candidates by query.
* `05_map/` Minimap2 alignments
* `logs/` Logs and benchmarks
* `output/` The resulting files (in a headerless SAM format)
@@ -299,7 +312,7 @@ Here's a list of all implemented commands (to be executed as `make {command}`):

**Input files:** FASTA or FASTQ files possibly compressed by gzipped. The files
are searched in the `input/` directory, as files with the following suffixes:
`.fa`, `.fasta`, `.fq`, `.fastq` (possibly with `.gz` at the end).
`.fa`, `.fasta`, `.fna`, `.fq`, `.fastq` (possibly with `.gz` at the end).

**Output files:**

46 changes: 40 additions & 6 deletions Snakefile
@@ -10,8 +10,7 @@ import re
##################################


extensions = ["fa", "fasta", "fq", "fastq"]

extensions = ["fa", "fasta", "fna", "fq", "fastq"]

def multiglob(patterns):
files = []
@@ -35,7 +34,7 @@ def get_batches():


def get_filename_for_all_queries():
return "___".join(get_all_query_filenames())
return "all_queries" #"___".join(get_all_query_filenames())


def get_index_metadata(wildcards, input):
@@ -176,8 +175,7 @@ elif index_load_mode == "mmap-disk":


wildcard_constraints:
batch=".+__\d\d",

batch=".+__\d\d"

if keep_cobs_indexes:

@@ -247,7 +245,15 @@ rule match:
"""
input:
f"intermediate/04_filter/{get_filename_for_all_queries()}.fa",
# "intermediate/04_filter/all_queries.fa"

rule label:
"""Label assemblies using labels from assemblies retrieved by querying COBS indexes
"""
input:
f"intermediate/04_filter/labels/labels_best_hits____{get_filename_for_all_queries()}.csv",
f"intermediate/04_filter/labels/queryid_by_input____{get_filename_for_all_queries()}.json",
f"intermediate/04_filter/labels/consensus_label____{get_filename_for_all_queries()}.txt",

rule map:
"""Map reads to the assemblies.
@@ -387,9 +393,11 @@ rule run_cobs:
"""
output:
match="intermediate/03_match/{batch}____{qfile}.gz",
# match="intermediate/03_match/{batch}____all_queries.gz",
input:
cobs_index=f"{decompression_dir}/{{batch}}.cobs_classic",
fa="intermediate/01_queries_merged/{qfile}.fa",
# fa="intermediate/01_queries_merged/all_queries.fa",
decompressed_indexes_sizes="data/decompressed_indexes_sizes.txt",
resources:
max_io_heavy_threads=int(cobs_is_an_IO_heavy_job),
@@ -431,6 +439,7 @@ rule decompress_and_run_cobs:
input:
compressed_cobs_index=f"{cobs_dir}/{{batch}}.cobs_classic.xz",
fa="intermediate/01_queries_merged/{qfile}.fa",
# fa="intermediate/01_queries_merged/all_queries.fa",
decompressed_indexes_sizes="data/decompressed_indexes_sizes.txt",
resources:
max_io_heavy_threads=int(cobs_is_an_IO_heavy_job),
@@ -579,7 +588,8 @@ rule final_stats:
stats="output/{qfile}.sam_summary.stats",
input:
pseudosam="output/{qfile}.sam_summary.gz",
concatenated_query=f"intermediate/01_queries_merged/{get_filename_for_all_queries()}.fa",
concatenated_query="intermediate/01_queries_merged/{qfile}.fa",
# concatenated_query=f"intermediate/01_queries_merged/all_queries.fa",
conda:
"envs/minimap2.yaml"
threads: 1
@@ -591,3 +601,27 @@ rule final_stats:
'./scripts/final_stats.py {input.concatenated_query} {input.pseudosam} \\
> {output.stats}'
"""


##################################
## [New] Processing rules
##################################

rule label_assemblies:
output:
labels_best_hits="intermediate/04_filter/labels/labels_best_hits____{qfile}.csv",
queryid_by_input="intermediate/04_filter/labels/queryid_by_input____{qfile}.json",
consensus_label="intermediate/04_filter/labels/consensus_label____{qfile}.txt",
input:
path_labels="data/labels_krakenbracken_by_sampleid.txt",
path_hits="intermediate/04_filter/{qfile}.fa",
params:
outdir="intermediate/04_filter/labels",
path_preprocessed="intermediate/00_queries_preprocessed",
shell:
"""
./scripts/benchmark.py --log logs/benchmarks/label/label_assemblies___{wildcards.qfile}.txt \\
'./scripts/labels_from_filter.py --path-labels {input.path_labels} \\
--path-hits {input.path_hits} --path-preprocessed {params.path_preprocessed} \\
--outdir {params.outdir}'
"""
4 changes: 2 additions & 2 deletions config.yaml
@@ -4,7 +4,7 @@
##################################################
# general
# batches to consider during search
batches: "data/batches_full.txt"
batches: "data/batches_small.txt"
##################################################

##################################################
@@ -32,7 +32,7 @@ nb_best_hits: 100
# asm5/asm10/asm20: asm-to-ref mapping, for ~0.1/1/5% sequence divergence
# splice: long-read spliced alignment
# sr: genomic short-read mapping
minimap_preset: "sr"
minimap_preset: "sr" #"asm20"

# other minimap2 params
minimap_extra_params: "--eqx"
Binary file added data/all_queries.sam_summary.xz
Binary file not shown.