diff --git a/scripts/build_kraken2_db.sh b/scripts/build_kraken2_db.sh index 9ed1b93..5b5da80 100755 --- a/scripts/build_kraken2_db.sh +++ b/scripts/build_kraken2_db.sh @@ -125,7 +125,7 @@ else list_sequence_files | xargs -0 cat | \ build_db -k $KRAKEN2_KMER_LEN -l $KRAKEN2_MINIMIZER_LEN -S $KRAKEN2_SEED_TEMPLATE $KRAKEN2XFLAG \ -H hash.k2d.tmp -t taxo.k2d.tmp -o opts.k2d.tmp -n taxonomy/ -m $seqid2taxid_map_file \ - -c $required_capacity -p $KRAKEN2_THREAD_CT $max_db_flag + -c $required_capacity $KRAKEN2_PRIORITY_TAXIDS -p $KRAKEN2_THREAD_CT $max_db_flag finalize_file taxo.k2d finalize_file opts.k2d finalize_file hash.k2d diff --git a/scripts/kraken2-build b/scripts/kraken2-build index 2d5909e..7bd393f 100755 --- a/scripts/kraken2-build +++ b/scripts/kraken2-build @@ -44,6 +44,7 @@ my ( $minimizer_len, $kmer_len, $minimizer_spaces, + @priority_taxids, $is_protein, $masking, $max_db_size, @@ -67,6 +68,7 @@ $load_factor = $ENV{"KRAKEN2_LOAD_FACTOR"} || $DEF_LOAD_FACTOR; $kmer_len = $ENV{"KRAKEN2_KMER_LEN"}; $minimizer_len = $ENV{"KRAKEN2_MINIMIZER_LEN"}; $minimizer_spaces = $ENV{"KRAKEN2_MINIMIZER_SPACES"}; +@priority_taxids = $ENV{"KRAKEN2_PRIORITY_TAXIDS"} || (); $is_protein = $ENV{"KRAKEN2_PROTEIN_DB"} || 0; $use_ftp = $ENV{"KRAKEN2_USE_FTP"} || 0; $skip_maps = $ENV{"KRAKEN2_SKIP_MAPS"} || 0; @@ -92,6 +94,7 @@ GetOptions( "minimizer-len=i" => \$minimizer_len, "kmer-len=i" => \$kmer_len, "minimizer-spaces=i", \$minimizer_spaces, + "priority-taxids=s" => \@priority_taxids, "protein" => \$is_protein, "masking!" => \$masking, "max-db-size=i" => \$max_db_size, @@ -161,6 +164,10 @@ $ENV{"KRAKEN2_MINIMIZER_LEN"} = $minimizer_len; $ENV{"KRAKEN2_KMER_LEN"} = $kmer_len; $ENV{"KRAKEN2_MINIMIZER_SPACES"} = $minimizer_spaces; $ENV{"KRAKEN2_SEED_TEMPLATE"} = construct_seed_template(); +$ENV{"KRAKEN2_PRIORITY_TAXIDS"} = ""; +if (@priority_taxids) { + $ENV{"KRAKEN2_PRIORITY_TAXIDS"} = "-P " . join(':', split(/,|:/, join(',', @priority_taxids))); +} $ENV{"KRAKEN2_PROTEIN_DB"} = $is_protein ? 1 : ""; $ENV{"KRAKEN2_MASK_LC"} = $masking ? 1 : ""; $ENV{"KRAKEN2_MAX_DB_SIZE"} = defined($max_db_size) ? $max_db_size : ""; @@ -230,6 +237,10 @@ Options: ignored in comparisons (build task only; def: $DEF_NT_MINIMIZER_SPACES nt, $DEF_AA_MINIMIZER_SPACES aa) --protein Build a protein database for translated search + --priority-taxids NUM Taxids whose k-mers override other taxids. By default includes + 32630: synthetic construct and 81077: artifical sequences. Specifying + any taxids will clear these two default taxids. Multiple taxids can be + separated by colon (:), comma (,) or repeated args. --no-masking Used with --standard/--download-library/ --add-to-library to avoid masking low-complexity sequences prior to building; masking requires diff --git a/src/build_db.cc b/src/build_db.cc index 82f2cbb..66d0467 100644 --- a/src/build_db.cc +++ b/src/build_db.cc @@ -27,6 +27,11 @@ using namespace kraken2; #define DEFAULT_BLOCK_SIZE (10 * 1024 * 1024) // 10 MB +// k-mers appearing in contaminant sequences will keep the contaminant +// sequence taxid, even if they also appear in a genome +const uint64_t TID_CONTAMINANT1 = 32630; // 'synthetic construct' +const uint64_t TID_CONTAMINANT2 = 81077; // 'artificial sequences' + struct Options { string ID_to_taxon_map_filename; string ncbi_taxonomy_directory; @@ -36,6 +41,7 @@ struct Options { size_t block_size; int num_threads; bool input_is_protein; + vector priority_taxids; ssize_t k, l; size_t capacity; size_t maximum_capacity; @@ -65,6 +71,8 @@ int main(int argc, char **argv) { opts.spaced_seed_mask = DEFAULT_SPACED_SEED_MASK; opts.toggle_mask = DEFAULT_TOGGLE_MASK; opts.input_is_protein = false; + opts.priority_taxids.push_back(TID_CONTAMINANT1); + opts.priority_taxids.push_back(TID_CONTAMINANT2); opts.num_threads = 1; opts.block_size = DEFAULT_BLOCK_SIZE; opts.min_clear_hash_value = 0; @@ -82,6 +90,9 @@ int main(int argc, char **argv) { Taxonomy taxonomy(opts.taxonomy_filename.c_str()); taxonomy.GenerateExternalToInternalIDMap(); + for (auto taxid : opts.priority_taxids) { + taxonomy.priority_taxids.insert(taxonomy.GetInternalID(taxid)); + } size_t bits_needed_for_value = 1; while ((1 << bits_needed_for_value) < (ssize_t) taxonomy.node_count()) bits_needed_for_value++; @@ -176,6 +187,23 @@ void ProcessSequences(Options &opts, map &ID_to_taxon_map, std::cerr << "Completed processing of " << processed_seq_ct << " sequences, " << processed_ch_ct << " " << (opts.input_is_protein ? "aa" : "bp") << std::endl; } +// Split string by any of delimiters, convert to long, and insert to iterator. +template +OutIt SplitInsert(char* input, string delimiters, OutIt out) { + std::string line(input); + std::size_t prev = 0, pos; + while ((pos = line.find_first_of(delimiters, prev)) != std::string::npos) { + if (pos > prev) { + *out++ = stol(line.substr(prev, pos-prev)); + } + prev = pos+1; + } + if (prev < line.length()) { + *out++ = stol(line.substr(prev, std::string::npos)); + } + return out; +} + // This function exists to deal with NCBI's use of \x01 characters to denote // the start of a new FASTA header in the same line (for non-redundant DBs). // We return all sequence IDs in a header line, not just the first. @@ -213,8 +241,9 @@ vector ExtractNCBISequenceIDs(const string &header) { void ParseCommandLine(int argc, char **argv, Options &opts) { int opt; long long sig; + bool priority_taxids_cleared = false; - while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:M:X")) != -1) { + while ((opt = getopt(argc, argv, "?hB:c:H:m:n:o:t:k:l:s:S:T:p:P:M:X")) != -1) { switch (opt) { case 'h' : case '?' : usage(0); @@ -279,6 +308,13 @@ void ParseCommandLine(int argc, char **argv, Options &opts) { errx(EX_USAGE, "max capacity must be positive integer"); opts.maximum_capacity = sig; break; + case 'P' : + if (!priority_taxids_cleared) { + opts.priority_taxids.clear(); + priority_taxids_cleared = true; + } + SplitInsert(optarg, ",:", back_inserter(opts.priority_taxids)); + break; case 'X' : opts.input_is_protein = true; break; @@ -326,6 +362,7 @@ void usage(int exit_code) { << " -M INT Set maximum capacity of hash table (MiniKraken)\n" << " -S BITSTRING Spaced seed mask\n" << " -T BITSTRING Minimizer toggle mask\n" + << " -A INT(:INT) Priority taxids that don't get LCA'd\n" << " -X Input seqs. are proteins\n" << " -p INT Number of threads\n" << " -B INT Read block size" << endl; @@ -336,6 +373,7 @@ void ProcessSequence(string &seq, uint64_t taxid, CompactHashTable &hash, Taxonomy &tax, MinimizerScanner &scanner, uint64_t min_clear_hash_value) { + bool seq_is_priority = tax.priority_taxids.count(taxid); scanner.LoadSequence(seq); uint64_t *minimizer_ptr; bool ambig_flag; @@ -346,8 +384,18 @@ void ProcessSequence(string &seq, uint64_t taxid, continue; hvalue_t existing_taxid = 0; hvalue_t new_taxid = taxid; - while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) { - new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid); + if (seq_is_priority) { + hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid) || + hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid); + } + else { + while (! hash.CompareAndSet(*minimizer_ptr, new_taxid, &existing_taxid)) { + if (! tax.priority_taxids.count(existing_taxid)) { + new_taxid = tax.LowestCommonAncestor(new_taxid, existing_taxid); + } else { + new_taxid = existing_taxid; + } + } } } } diff --git a/src/taxonomy.h b/src/taxonomy.h index 8d43604..9b02d16 100644 --- a/src/taxonomy.h +++ b/src/taxonomy.h @@ -64,6 +64,7 @@ class Taxonomy { return 0; return external_to_internal_id_map_[external_id]; } + std::set priority_taxids; private: void Init(const char *filename, bool memory_mapping);