diff --git a/.gitignore b/.gitignore index 549bfa6..0e6781d 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,6 @@ /libs/winflexbison /src/clusty/x64/Debug/clusty.tlog /src/clusty/x64/Debug -/src +/src/clusty/ + +/src/clusty.vcxproj.user diff --git a/makefile b/makefile index 02d476f..2b638bf 100644 --- a/makefile +++ b/makefile @@ -101,9 +101,10 @@ OBJS := \ $(MIMALLOC_OBJ) \ $(MAIN_DIR)/console.o \ $(MAIN_DIR)/conversion.o \ - $(MAIN_DIR)/distances.o \ + $(MAIN_DIR)/graph.o \ $(MAIN_DIR)/log.o \ $(MAIN_DIR)/main.o \ + $(MAIN_DIR)/params.o \ %.o: %.cpp igraph $(CXX) $(CFLAGS) -c $< -o $@ diff --git a/src/cd_hit.h b/src/cd_hit.h index 0791f20..cf7f070 100644 --- a/src/cd_hit.h +++ b/src/cd_hit.h @@ -17,12 +17,12 @@ #include #include -template -class CdHit : public IClustering { +template +class CdHit : public IClustering { public: int operator()( - const DistanceMatrix& distances, + SparseMatrix& distances, const std::vector& objects, double threshold, std::vector& assignments) override { @@ -41,10 +41,10 @@ class CdHit : public IClustering { assignments[obj] = cluster_id; // iterate over connected object and assign those which are unassigned - for (const dist_t* edge = distances.begin(obj); edge < distances.end(obj); ++edge) { - int other = edge->u.s.hi; + for (const Distance* edge = distances.begin(obj); edge < distances.end(obj); ++edge) { + int other = edge->get_id(); - if (edge->d <= threshold && assignments[other] == -1) { + if (edge->get_d() <= threshold && assignments[other] == -1) { assignments[other] = cluster_id; } } diff --git a/src/clustering.h b/src/clustering.h index 58e92eb..f035e4f 100644 --- a/src/clustering.h +++ b/src/clustering.h @@ -10,8 +10,9 @@ #include #include +#include -#include "distances.h" +#include "sparse_matrix.h" struct node_t { int first = -1; @@ -23,12 +24,12 @@ struct node_t { : first(first), second(second), distance(distance) {} }; -template +template class IClustering { public: virtual int operator()( - const DistanceMatrix& distances, + SparseMatrix& distances, const std::vector& objects, double threshold, std::vector& assignments) = 0; @@ -37,12 +38,12 @@ class IClustering { }; -template -class HierarchicalClustering : public IClustering { +template +class HierarchicalClustering : public IClustering { protected: void makeDendrogram( - const std::vector& lambda, + const std::vector& lambda, const std::vector& pi, std::vector& dendrogram) { @@ -51,7 +52,7 @@ class HierarchicalClustering : public IClustering { std::vector elements(n_objects - 1); std::iota(elements.begin(), elements.end(), 0); - stable_sort(elements.begin(), elements.end(), [&lambda](int x, int y) { + std::stable_sort(elements.begin(), elements.end(), [&lambda](int x, int y) { return lambda[x] < lambda[y]; }); @@ -64,7 +65,7 @@ class HierarchicalClustering : public IClustering { for (int i = 0; i < n_objects - 1; ++i) { int j = elements[i]; int next = pi[j]; - dendrogram.emplace_back(index[j], index[next], lambda[j].d); + dendrogram.emplace_back(index[j], index[next], lambda[j].get_d()); index[next] = n_objects + i; } } diff --git a/src/clusty.vcxproj b/src/clusty.vcxproj index dcb0fdc..1ef1412 100644 --- a/src/clusty.vcxproj +++ b/src/clusty.vcxproj @@ -133,7 +133,7 @@ _DEBUG;_CONSOLE;%(PreprocessorDefinitions) true false - stdcpp17 + stdcpplatest true @@ -151,7 +151,7 @@ NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true false - stdcpp17 + stdcpplatest Console @@ -165,9 +165,10 @@ - + + @@ -179,8 +180,13 @@ + + + + + diff --git a/src/clusty.vcxproj.filters b/src/clusty.vcxproj.filters index 87e2276..b5b6ce2 100644 --- a/src/clusty.vcxproj.filters +++ b/src/clusty.vcxproj.filters @@ -13,11 +13,12 @@ - + Library Files + @@ -34,5 +35,10 @@ + + + + + \ No newline at end of file diff --git a/src/console.cpp b/src/console.cpp index 7c9131d..a7b8eab 100644 --- a/src/console.cpp +++ b/src/console.cpp @@ -7,109 +7,207 @@ // ******************************************************************************************* #include "console.h" -#include "distances.h" #include "log.h" +#include "graph_named.h" +#include "graph_numbered.h" +#include "sparse_matrix.h" + +#define VAL(str) #str +#define TOSTRING(str) VAL(str) +#include "version.h" + #include +#include +#include #include -#include +#include using namespace std; -void Console::printUsage() const { - LOG_NORMAL << "Usage:" << endl - << "clusty [options] " << endl << endl - << "Parameters:" << endl - << " - input TSV/CSV table with pairwise distances" << endl - << " - output TSV/CSV table with assignments" << endl << endl - << "Options:" << endl - << " " + PARAM_FILE_OBJECTS + " - optional TSV/CSV file with object names in the first column sorted decreasingly w.r.t. representativness" << endl - << " " + PARAM_ALGO + " - clustering algorithm:" << endl - << " * single - single linkage (connected component, MMseqs mode 1)" << endl - << " * complete - complete linkage" << endl - << " * uclust - UCLUST" << endl - << " * set-cover - greedy set cover (MMseqs mode 0)" << endl - << " * cd-hit - CD-HIT (greedy incremental; MMseqs mode 2)" << endl - << " * leiden - Leiden algorithm" << endl << endl - - << " " + PARAM_ID_COLUMNS + " - names of columns with sequence identifiers (default: two first columns)" << endl - << " " + PARAM_DISTANCE_COLUMN + " - name of the column with pairwise distances (or similarities; default: third column)" << endl - << " " + FLAG_SIMILARITY + " - use similarity instead of distances (has to be in [0,1] interval; default: false)" << endl - << " " + FLAG_PERCENT_SIMILARITY + " - use percent similarity (has to be in [0,100] interval; default: false)" << endl - << " " + PARAM_MIN + " - accept pairwise connections with values greater or equal given threshold in a specified column" << endl - << " " + PARAM_MAX + " - accept pairwise connections with values lower or equal given threshold in a specified column" << endl - << " " + FLAG_NUMERIC_IDS + " - use when sequences in the distances file are represented by numbers (can be mapped to string ids by the object file)" << endl - << " " + FLAG_OUT_REPRESENTATIVES + " - output a representative object for each cluster instead of a cluster numerical identifier (default: " << std::boolalpha << outputRepresentatives << ")" << endl - << " " + FLAG_OUT_CSV + " - output a CSV table instead of a default TSV (default: " << std::boolalpha << outputCSV <<")" - -#ifndef NO_LEIDEN - << endl << endl - << " " + PARAM_LEIDEN_RESOLUTION + " - resolution parameter for Leiden algorithm (default: " << leidenParams.resolution << ")" << endl - << " " + PARAM_LEIDEN_BETA + " - beta parameter for Leiden algorithm (default: " << leidenParams.beta << ")" << endl - << " " + PARAM_LEIDEN_RESOLUTION + " - number of interations for Leiden algorithm (default: " << leidenParams.numIterations << ")" -#endif - << endl << endl; -} +// ******************************************************************************************* +bool Console::init(int argc, char** argv, Params& params) { + Log::getInstance(Log::LEVEL_NORMAL).enable(); + LOG_NORMAL << "Clusty" << endl + << " version " << VERSION +#ifdef GIT_COMMIT + << "-" << TOSTRING(GIT_COMMIT) +#endif + << " (" << DATE << ")" << endl << endl; -bool Console::parse(int argc, char** argv) { + if (!params.parse(argc, argv)) { + params.printUsage(); + return false; + } - vector args; - for (int i = 1; i < argc; ++i) { - args.emplace_back(argv[i]); + if (params.verbose) { + Log::getInstance(Log::LEVEL_VERBOSE).enable(); } - if (args.size() < 2) { - return false; + return true; +} + +// ******************************************************************************************* +std::unique_ptr Console::loadGraph(const Params& params) { + + unique_ptr graph; + + if (params.numericIds) { + if (needDistances(params.algo)) { + graph = make_unique>(); + } + else { + graph = make_unique>(); + } + } + else { + if (needDistances(params.algo)) { + graph = make_unique>(); + } + else { + graph = make_unique>(); + } } - std::string tmp; - findOption(args, PARAM_ALGO, tmp); - if (tmp.length()) { - algo = str2algo(tmp); + LOG_NORMAL << "Loading pairwise distances from " << params.distancesFile << "... "; + auto t = std::chrono::high_resolution_clock::now(); + + vector filebuf(128ULL << 20); // 128MB buffer + ifstream ifs; + ifs.rdbuf()->pubsetbuf(filebuf.data(), filebuf.size()); + ifs.open(params.distancesFile, ios_base::binary); + + if (!ifs) { + throw std::runtime_error("Unable to open distance file"); } - findOption(args, PARAM_FILE_OBJECTS, objectsFile); + map transforms{ + { DistanceSpecification::Distance, [](double d) { return d; } }, + { DistanceSpecification::Similarity, [](double d) { return 1.0 - d; } }, + { DistanceSpecification::PercentSimilarity, [](double d) { return 1.0 - d * 0.01; } }, + }; - findOption(args, PARAM_ID_COLUMNS, idColumns.first, idColumns.second); - numericIds = findSwitch(args, FLAG_NUMERIC_IDS); + size_t n_total_dists = graph->load(ifs, params.idColumns, params.distanceColumn, + transforms[params.distanceSpecification], params.columns2filters); + + auto dt = std::chrono::high_resolution_clock::now() - t; - findOption(args, PARAM_DISTANCE_COLUMN, distanceColumn); + ifs.close(); + LOG_NORMAL << endl + << " input graph: " << graph->getNumInputVertices() << " nodes, " << n_total_dists << " edges" << endl + << " filtered graph: " << graph->getNumVertices() << " nodes, " << graph->getNumEdges() << " edges" << endl + << " time [s]: " << chrono::duration(dt).count() << endl; - bool use_similarity = findSwitch(args, FLAG_SIMILARITY); - bool use_percent_similarity = findSwitch(args, FLAG_PERCENT_SIMILARITY); + return graph; +} - if (use_percent_similarity) { - distanceSpecification = DistanceSpecification::PercentSimilarity; - } else if (use_similarity) { - distanceSpecification = DistanceSpecification::Similarity; +// ******************************************************************************************* +void Console::loadObjects( + const Params& params, + const Graph& graph, + std::vector& objects, + std::vector& names) { + + names.clear(); + objects.resize(graph.getNumVertices()); + std::iota(objects.begin(), objects.end(), 0); + ifstream ifs; + + if (!params.objectsFile.empty()) { + LOG_NORMAL << "Loading objects from " << params.objectsFile << "... "; + ifs.open(params.objectsFile); + + if (ifs) { + auto t = std::chrono::high_resolution_clock::now(); + string token; + token.reserve(16 << 10); // 16k buffer + + // omit header + getline(ifs, token); + auto is_sep = [](char c) {return c == ',' || c == '\t' || c == '\r' || c == '\t'; }; + + while (getline(ifs, token)) { + auto it = find_if(token.begin(), token.end(), is_sep); + if (it != token.begin()) { + names.emplace_back(token.begin(), it); + } + } + + ifs.close(); + LOG_NORMAL << endl; + + graph.reorderObjects(names, objects); + + auto dt = std::chrono::high_resolution_clock::now() - t; + LOG_NORMAL << " total objects: " << names.size() << endl + << " time [s]: " << chrono::duration(dt).count() << endl; + } + else { + throw std::runtime_error("Unable to open objects file"); + } } +} + - string column; - double value; - while (findOption(args, PARAM_MIN, column, value)) { - columns2filters[column].min = std::max(value, columns2filters[column].min); +// ******************************************************************************************* +void Console::doClustering( + const Params& params, + Graph& graph, + const std::vector& objects, + std::vector& assignments) +{ + assignments.clear(); + + LOG_NORMAL << "Clustering (algorithm: " << Params::algo2str(params.algo) << ")... "; + + auto t = std::chrono::high_resolution_clock::now(); + double threshold = std::nexttoward(std::numeric_limits::max(), 0.0); + int n_clusters = 0; + + if (needDistances(params.algo)) { + auto clustering = createClusteringAlgo(params.algo); + IMatrix& mat = graph.getMatrix(); + SparseMatrix& distances = static_cast&>(mat); + n_clusters = (*clustering)(distances, objects, threshold, assignments); } - while (findOption(args, PARAM_MAX, column, value)) { - columns2filters[column].max = std::min(value, columns2filters[column].max); + else { + auto clustering = createClusteringAlgo(params.algo); + IMatrix& mat = graph.getMatrix(); + SparseMatrix& distances = static_cast&>(mat); + n_clusters = (*clustering)(distances, objects, threshold, assignments); } - outputRepresentatives = findSwitch(args, FLAG_OUT_REPRESENTATIVES); - outputCSV = findSwitch(args, FLAG_OUT_CSV); + auto dt = std::chrono::high_resolution_clock::now() - t; + + LOG_NORMAL << endl + << " objects: " << graph.getNumVertices() << ", clusters: " << n_clusters << endl + << " time [s]: " << chrono::duration(dt).count() << endl; +} - // leiden parameters - findOption(args, PARAM_LEIDEN_RESOLUTION, leidenParams.resolution); - findOption(args, PARAM_LEIDEN_BETA, leidenParams.beta); - findOption(args, PARAM_LEIDEN_ITERATIONS, leidenParams.numIterations); +// ******************************************************************************************* +void Console::saveAssignments( + const Params& params, + const Graph& graph, + const std::vector& names, + const std::vector& assignments) { - verbose = findSwitch(args, FLAG_VERBOSE); + LOG_NORMAL << "Saving clusters... "; + auto t = std::chrono::high_resolution_clock::now(); - if (args.size() == 2) { - distancesFile = args[0]; - output = args[1]; - return true; - } + char sep = params.outputCSV ? ',' : '\t'; - return false; -} + ofstream ofs(params.output); + int n_total_clusters = 0; + if (params.outputRepresentatives) { + n_total_clusters = graph.saveRepresentatives(ofs, names, assignments, sep); + } + else { + n_total_clusters = graph.saveAssignments(ofs, names, assignments, sep); + } + auto dt = std::chrono::high_resolution_clock::now() - t; + LOG_NORMAL << endl + << " total clusters (including singletons): " << n_total_clusters << endl + << " time [s]: " << chrono::duration(dt).count() << endl; +} \ No newline at end of file diff --git a/src/console.h b/src/console.h index 747bdb8..ca7f3df 100644 --- a/src/console.h +++ b/src/console.h @@ -1,5 +1,3 @@ -#pragma once - // ******************************************************************************************* // This file is a part of Clusty software distributed under GNU GPL 3 license. // The homepage of the Clusty project is https://github.com/refresh-bio/Clusty @@ -7,147 +5,79 @@ // Copyright(C) 2024-2024, A.Gudys, K.Siminski, S.Deorowicz // // ******************************************************************************************* - -#include "distances.h" +#pragma once +#include "params.h" +#include "graph.h" + +#include "linkage_heaptrix.h" +#include "uclust.h" +#include "set_cover.h" +#include "single_bfs.h" +#include "cd_hit.h" #include "leiden.h" +#include +#include #include -#include -#include -#include - -enum class Algo { - SingleLinkage, - CompleteLinkage, - UClust, - SetCover, - CdHit, - Leiden -}; - -enum class DistanceSpecification { - Distance, - Similarity, - PercentSimilarity -}; class Console { - const std::string PARAM_ALGO{ "--algo" }; - - const std::string PARAM_FILE_OBJECTS{"--objects-file"}; - - const std::string PARAM_ID_COLUMNS{"--id-cols"}; - const std::string FLAG_NUMERIC_IDS{"--numeric-ids"}; - const std::string PARAM_DISTANCE_COLUMN{"--distance-col"}; - const std::string FLAG_SIMILARITY{"--similarity"}; - const std::string FLAG_PERCENT_SIMILARITY{"--percent-similarity"}; - - const std::string PARAM_MAX{"--max"}; - const std::string PARAM_MIN{"--min"}; - - const std::string FLAG_OUT_REPRESENTATIVES{"--out-representatives"}; - const std::string FLAG_OUT_CSV{"--out-csv"}; - - const std::string PARAM_LEIDEN_RESOLUTION{"--leiden-resolution"}; - const std::string PARAM_LEIDEN_BETA{"--leiden-beta"}; - const std::string PARAM_LEIDEN_ITERATIONS{"--leiden-iterations"}; - - const std::string FLAG_VERBOSE{ "-v" }; + Params params; public: - static Algo str2algo(const std::string& str) - { - if (str == "single") { return Algo::SingleLinkage; } - else if (str == "complete") { return Algo::CompleteLinkage; } - else if (str == "uclust") { return Algo::UClust; } - else if (str == "set-cover") { return Algo::SetCover; } - else if (str == "cd-hit") { return Algo::CdHit; } - else if (str == "leiden") { return Algo::Leiden; } - - else { throw std::runtime_error("Unkown clustering algorithm"); } - } - - static std::string algo2str(Algo algo) { - switch (algo) { - case Algo::SingleLinkage: return "single"; - case Algo::CompleteLinkage: return "complete"; - case Algo::UClust: return "uclust"; - case Algo::SetCover: return "set-cover"; - case Algo::Leiden: return "leiden"; - case Algo::CdHit: return "cd-hit"; - default: throw std::runtime_error("Unkown clustering algorithm"); - } - } - - -public: - - std::string objectsFile; - std::string distancesFile; - std::string output; - - Algo algo{ Algo::SingleLinkage }; - - std::pair idColumns; - bool numericIds{ false }; - - std::string distanceColumn; - DistanceSpecification distanceSpecification{ DistanceSpecification::Distance }; + bool init(int argc, char** argv, Params& params); - std::map columns2filters; - bool outputRepresentatives{ false }; - bool outputCSV{ false }; - - LeidenParams leidenParams; - - bool verbose{ false }; - - void printUsage() const; - bool parse(int argc, char** argv); - - bool findSwitch(std::vector& params, const std::string& name) { - auto it = find(params.begin(), params.end(), name); // verbose mode - if (it != params.end()) { - params.erase(it); - return true; - } - - return false; + std::unique_ptr loadGraph(const Params& params); + + void loadObjects( + const Params& params, + const Graph& graph, + std::vector& objects, + std::vector& names); + + void doClustering( + const Params& params, + Graph& graph, + const std::vector& objects, + std::vector& assignments); + + void saveAssignments( + const Params& params, + const Graph& graph, + const std::vector& names, + const std::vector& assignments); + +protected: + + bool needDistances(Algo algo) const { + return (algo == Algo::CompleteLinkage || algo == Algo::Leiden || algo == Algo::UClust); } - template - bool findOption(std::vector& params, const std::string& name, T& v) { - if (params.size() < 2) { - return false; - } + template + std::unique_ptr> createClusteringAlgo(Algo algo) { - auto stop = std::prev(params.end()); - auto it = find(params.begin(), stop, name); // verbose mode - if (it != stop) { - std::istringstream iss(*std::next(it)); - if (iss >> v) { - params.erase(it, it + 2); - return true; - } + std::unique_ptr> clustering; + + switch (algo) + { + + case Algo::SingleLinkage: + clustering = std::make_unique>(); break; + case Algo::CompleteLinkage: + clustering = std::make_unique>(); break; + case Algo::UClust: + clustering = std::make_unique>(); break; + case Algo::SetCover: + clustering = std::make_unique>(); break; + case Algo::CdHit: + clustering = std::make_unique>(); break; + case Algo::Leiden: + clustering = std::make_unique>(params.leidenParams); break; + + default: + throw std::runtime_error("Unkown clustering algorithm"); } - return false; - } - template - bool findOption(std::vector& params, const std::string& name, T& value1, U& value2) { - if (params.size() < 3) { - return false; - } - - auto stop = std::prev(params.end(), 2); - auto it = find(params.begin(), stop, name); // verbose mode - if (it != stop) { - if (std::istringstream(*std::next(it)) >> value1 - && std::istringstream(*std::next(it, 2)) >> value2) { - params.erase(it, it + 3); - return true; - } - } - return false; + return clustering; } + }; diff --git a/src/distances.cpp b/src/distances.cpp deleted file mode 100644 index e0ea0c6..0000000 --- a/src/distances.cpp +++ /dev/null @@ -1,735 +0,0 @@ -// ******************************************************************************************* -// This file is a part of Clusty software distributed under GNU GPL 3 license. -// The homepage of the Clusty project is https://github.com/refresh-bio/Clusty -// -// Copyright(C) 2024-2024, A.Gudys, K.Siminski, S.Deorowicz -// -// ******************************************************************************************* - -#include "distances.h" -#include "log.h" - -#include -#include -#include - -using namespace std; - -const double dist_t::MAX = std::numeric_limits::max(); - -/*********************************************************************************************************************/ -void SparseMatrix::processHeader( - std::ifstream& ifs, - const std::pair& idColumns, - const std::string& distanceColumn, - std::map& columns2filters, - int idColumnsOut[2], - int& distanceColumnOut, - std::vector& filters) { - - std::string line; - std::getline(ifs, line); - - std::replace(line.begin(), line.end(), ',', ' '); - std::istringstream iss(line); - std::vector columns; - std::copy(std::istream_iterator(iss), std::istream_iterator(), std::back_inserter(columns)); - - if (columns.size() < 3) { - throw std::runtime_error("Error loading distances: at least three columns are required"); - } - - // default column identifiers - int col_ids[]{ 0, 1 }; - int col_distance = 2; // by default use 3rd column as the one with distance - - int n_columns = (int)columns.size(); - - if (!idColumns.first.empty()) { - col_ids[0] = std::find(columns.begin(), columns.end(), idColumns.first) - columns.begin(); - col_ids[1] = std::find(columns.begin(), columns.end(), idColumns.second) - columns.begin(); - - if (col_ids[0] == n_columns || col_ids[1] == n_columns) { - throw std::runtime_error("Error loading distances: id columns not found"); - } - - if (col_ids[0] > col_ids[1]) { - std::swap(col_ids[0], col_ids[1]); - } - - } - - if (!distanceColumn.empty()) { - col_distance = std::find(columns.begin(), columns.end(), distanceColumn) - columns.begin(); - if (col_distance == n_columns) { - throw std::runtime_error("Error loading distances: " + distanceColumn + " column not found"); - } - } - - filters.resize(columns.size()); - - for (const auto& f : columns2filters) { - int col = std::find(columns.begin(), columns.end(), f.first) - columns.begin(); - if (col == n_columns) { - throw std::runtime_error("Error loading distances: " + f.first + " column not found"); - } - filters[col] = f.second; - filters[col].enabled = true; - } - - idColumnsOut[0] = col_ids[0]; - idColumnsOut[1] = col_ids[1]; - distanceColumnOut = col_distance; -} - -/*********************************************************************************************************************/ -size_t SparseMatrixNamed::load( - std::ifstream& ifs, - const std::pair& idColumns, - const std::string& distanceColumn, - distance_transformation_t transform, - std::map& columns2filters) { - - size_t n_total_distances = 0; - const size_t N = 512ULL << 20; // 128MB buffer - char* buf = new char[N]; - char* buf_end = buf + N; - - // get header - int col_ids[]{ 0, 1 }; - int col_distance = 2; // by default use 3rd column as the one with distance - std::vector filters; - processHeader(ifs, idColumns, distanceColumn, columns2filters, col_ids, col_distance, filters); - int n_columns = filters.size(); - - auto is_sep = [](char c) {return c == ',' || c == '\t' || c == '\r' || c == '\t'; }; - auto is_newline = [](char c) {return c == '\r' || c == '\n'; }; - - namesBuffer = new char[1LL << 30]; // 1 GB buffer for names - char* raw_ptr = namesBuffer; - - // assume space for 8M objects - distances.reserve(8LL << 20); - - bool continueReading = true; - char* place = buf; - - while (continueReading) { - size_t n_wanted = buf_end - place; - ifs.read(place, n_wanted); - size_t n_read = ifs.gcount(); - char* block_end = place + n_read; - int offset = 0; - - // no more data - if (n_read < n_wanted) { - continueReading = false; - } - else { - // find last newline - while (!is_newline(*(block_end - 1))) { - --block_end; - ++offset; - } - } - - //LOG_DEBUG << "portion: " << n_read << ", offset: " << offset << ", carryOn: " << carryOn << endl; - - // pass through buffer - char* line = buf; - while (true) { - char* line_end = std::find_if(line, block_end, is_newline); - - // no more lines - if (line_end == block_end) { - break; - } - - ++n_total_distances; - - char* p = line; - decltype(names2ids)::iterator its[2]; - double d = std::numeric_limits::max(); - - int k = 0; - bool carryOn = true; - - for (int c = 0; c < n_columns; ++c) { - char* q = std::find_if(p, line_end, is_sep); // support both tsv and csv files - *q = 0; - - if (k < 2 && c == col_ids[k]) { - char* name = p; - - // store name in hashtable - its[k] = names2ids.find(name); - - if (its[k] == names2ids.end()) { - - char* localName = raw_ptr; - while (*raw_ptr++ = *name++) {} - auto it_and_flag = names2ids.insert({ localName, {-1, names2ids.size()} }); // -1 indicate singleton - its[k] = it_and_flag.first; - } - - ++k; - } - else if (c == col_distance || filters[c].enabled) { - double value = Conversions::strtod(p, &p); - - if (c == col_distance) { - d = transform(value); // convert similarity to distance if neccessary - } - - // check distance condition - if (value < filters[c].min || value > filters[c].max) { - carryOn = false; - break; - } - } - - p = q + 1; - } - - // move to the next line - line = std::find_if(line_end, block_end, [](char c) { return c != '\r' && c != '\n' && c != 0; }); - - // do not consider rows not fulfilling conditions - if (carryOn == false) { - continue; - } - - // translate seq names to numerical ids - for (int k = 0; k < 2; ++k) { - auto it = its[k]; - - // if name not mapped to numerical ids - if (it->second.first == -1) { - ids2names.push_back(it->first); - it->second.first = ids2names.size() - 1; - } - } - - uint32_t i = std::min(its[0]->second.first, its[1]->second.first); - uint32_t j = std::max(its[0]->second.first, its[1]->second.first); - - // omit diagonal elements - they are assumed to have 0 distance - if (i == j) { - continue; - } - - if (distances.size() <= j) { - distances.resize(j + 1); - } - - auto& Di = distances[i]; - auto& Dj = distances[j]; - - // extend capacity by factor 1.5 with 16 as an initial state - if (Di.capacity() == Di.size()) { - Di.reserve(Di.capacity() == 0 ? 16 : size_t(Di.capacity() * 1.5)); - } - - if (Dj.capacity() == Dj.size()) { - Dj.reserve(Dj.capacity() == 0 ? 16 : size_t(Dj.capacity() * 1.5)); - } - - Di.emplace_back(i, j, d); - Dj.emplace_back(j, i, d); - } - - // copy remaining part after consuming all the lines - if (continueReading && offset > 0) { - memcpy(buf, block_end, offset); - place = buf + offset; - } - else { - place = buf; - } - } - - // if neccessary, sort distances in rows according to the second id - n_elements = 0; - - for (auto& row : distances) { - std::sort(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return (a.u.ids == b.u.ids) ? (a.d < b.d) : (a.u.ids < b.u.ids); }); - auto newEnd = std::unique(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return a.u.ids == b.u.ids; }); - - row.erase(newEnd, row.end()); - - n_elements += row.size(); - } - - delete[] buf; - - return n_total_distances; -} - -/*********************************************************************************************************************/ -int SparseMatrixNamed::saveAssignments( - std::ofstream& ofs, - const std::vector& globalNames, - const std::vector& assignments, - char separator) { - - int n_clusters = *std::max_element(assignments.begin(), assignments.end()) + 1; - - vector names; - if (globalNames.empty()) { - names.resize(names2ids.size()); - vector>> entries(names2ids.size()); - copy(names2ids.begin(), names2ids.end(), entries.begin()); - sort(entries.begin(), entries.end(), [](const auto& a, const auto& b) { return a.second.second < b.second.second; }); - - transform(entries.begin(), entries.end(), names.begin(), [](const auto& entry) { return string(entry.first); }); - } - else { - names = globalNames; - } - - std::vector globalAssignments(names.size()); - - int singleton_id = n_clusters; - - for (size_t i = 0; i < names.size(); ++i) { - - int id = get_id(names[i].c_str()); - if (id == -1) { - // not in matrix - globalAssignments[i] = singleton_id++; - } - else { - globalAssignments[i] = assignments[id]; - } - } - - ofs << "object" << separator << "cluster" << endl; - - for (size_t i = 0; i < names.size(); ++i) { - ofs << names[i] << separator << globalAssignments[i] << endl; - } - - return singleton_id; -} - -/*********************************************************************************************************************/ -int SparseMatrixNamed::saveRepresentatives( - std::ofstream& ofs, - const std::vector& globalNames, - const std::vector& assignments, - char separator -) { - int n_clusters = *std::max_element(assignments.begin(), assignments.end()) + 1; - - std::vector names; - if (globalNames.empty()) { - names.resize(names2ids.size()); - vector>> entries(names2ids.size()); - copy(names2ids.begin(), names2ids.end(), entries.begin()); - sort(entries.begin(), entries.end(), [](const auto& a, const auto& b) { return a.second.second < b.second.second; }); - - transform(entries.begin(), entries.end(), names.begin(), [](const auto& entry) { return string(entry.first); }); - } - else { - names = globalNames; - } - - std::unordered_map names2globalIds; - - for (size_t i = 0; i < names.size(); ++i) { - names2globalIds[names[i]] = i; - } - - std::vector lowestClusterMembers(n_clusters, std::numeric_limits::max()); - - for (size_t i = 0; i < assignments.size(); ++i) { - // translate element ids - string name = get_name(i); - int global_id = names2globalIds[name]; - int cluster_id = assignments[i]; - - if (global_id < lowestClusterMembers[cluster_id]) { - lowestClusterMembers[cluster_id] = global_id; - } - } - - std::vector representatives(names.size()); - - int n_singletons = 0; - for (size_t i = 0; i < names.size(); ++i) { - - int id = get_id(names[i].c_str()); - if (id == -1) { - // not in matrix - own representative - representatives[i] = &names[i]; - ++n_singletons; - // LOG_DEBUG << names[i] << endl; - } - else { - int cluster_id = assignments[id]; - int lowest_member = lowestClusterMembers[cluster_id]; - representatives[i] = &names[lowest_member]; - } - } - - ofs << "object" << separator << "cluster" << endl; - - for (size_t i = 0; i < names.size(); ++i) { - ofs << names[i] << separator << *representatives[i] << endl; - } - - return n_clusters + n_singletons; -} - - -/*********************************************************************************************************************/ -size_t SparseMatrixNumbered::load( - std::ifstream& ifs, - const std::pair& idColumns, - const std::string& distanceColumn, - distance_transformation_t transform, - std::map& columns2filters) { - - size_t n_total_distances = 0; - const size_t N = 512ULL << 20; - char* buf = new char[N]; - char* buf_end = buf + N - 1; - - // get header - int col_ids[]{ 0, 1 }; - int col_distance = 2; // by default use 3rd column as the one with distance - std::vector filters; - processHeader(ifs, idColumns, distanceColumn, columns2filters, col_ids, col_distance, filters); - int n_columns = filters.size(); - - auto is_sep = [](char c) {return c == ',' || c == '\t' || c == '\r' || c == '\t'; }; - auto is_newline = [](char c) {return c == '\r' || c == '\n'; }; - - // assume space for 8M objects - distances.reserve(8LL << 20); - global2local.reserve(8LL << 20); - local2global.reserve(8LL << 20); - - bool continueReading = true; - char* place = buf; - - while (continueReading) { - size_t n_wanted = buf_end - place; - ifs.read(place, n_wanted); - size_t n_read = ifs.gcount(); - char* block_end = place + n_read; - int offset = 0; - - // no more data - if (n_read < n_wanted) { - continueReading = false; - } - else { - // find last newline - while (!is_newline(*(block_end - 1))) { - --block_end; - ++offset; - } - } - - //LOG_DEBUG << "portion: " << n_read << ", offset: " << offset << ", carryOn: " << continueReading << endl; - - // pass through buffer - char* line = buf; - while (true) { - char* line_end = std::find_if(line, block_end, is_newline); - - // no more lines - if (line_end == block_end) { - break; - } - - ++n_total_distances; - - char* p = line; - double d = std::numeric_limits::max(); - int k = 0; - bool carryOn = true; - - int global_ids[2]; - - for (int c = 0; c < n_columns; ++c) { - char* q = std::find_if(p, line_end, is_sep); // support both tsv and csv files - *q = 0; - - if (k < 2 && c == col_ids[k]) { - global_ids[k] = Conversions::strtol(p, nullptr); - - if (global_ids[k] + 1 > (int)global2local.size()) { - global2local.resize(global_ids[k] + 1, -1); - } - - ++k; - } - else if (c == col_distance || filters[c].enabled) { - double value = Conversions::strtod(p, &p); - - if (c == col_distance) { - d = transform(value); // convert similarity to distance if neccessary - } - - // check distance condition - if (value < filters[c].min || value > filters[c].max) { - carryOn = false; - break; - } - } - - p = q + 1; - } - - // move to the next line - line = std::find_if(line_end, block_end, [](char c) { return c != '\r' && c != '\n' && c != 0; }); - - // do not consider rows not fulfilling conditions - if (carryOn == false) { - continue; - } - - int pair_ids[2]; - - - for (int k = 0; k < 2; ++k) { - - int gid = global_ids[k]; - int lid = global2local[gid]; - - if (lid == -1) { - lid = local2global.size(); - local2global.push_back(gid); - global2local[gid] = lid; - } - - pair_ids[k] = lid; - } - - uint32_t i = std::min(pair_ids[0], pair_ids[1]); - uint32_t j = std::max(pair_ids[0], pair_ids[1]); - - // omit diagonal elements - they are assumed to have 0 distance - if (i == j) { - continue; - } - - if (distances.size() <= j) { - distances.resize(j + 1); - } - - auto& Di = distances[i]; - auto& Dj = distances[j]; - - // extend capacity by factor 1.5 with 16 as an initial state - if (Di.capacity() == Di.size()) { - Di.reserve(Di.capacity() == 0 ? 16 : size_t(Di.capacity() * 1.5)); - } - - if (Dj.capacity() == Dj.size()) { - Dj.reserve(Dj.capacity() == 0 ? 16 : size_t(Dj.capacity() * 1.5)); - } - - Di.emplace_back(i, j, d); - Dj.emplace_back(j, i, d); - } - - // copy remaining part after consuming all the lines - if (continueReading && offset > 0) { - memcpy(buf, block_end, offset); - place = buf + offset; - } - else { - place = buf; - } - } - - // if neccessary, sort distances in rows according to the second id - n_elements = 0; - - for (auto& row : distances) { - std::sort(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return (a.u.ids == b.u.ids) ? (a.d < b.d) : (a.u.ids < b.u.ids); }); - auto newEnd = std::unique(row.begin(), row.end(), [](const dist_t& a, const dist_t& b) { return a.u.ids == b.u.ids; }); - - row.erase(newEnd, row.end()); - n_elements += row.size(); - } - - delete[] buf; - - // Print distance histogram in the verbose mode - if (Log::getInstance(Log::LEVEL_VERBOSE).isEnabled()) { - - std::vector histo_bounds{ 0 }; - double width = 0.001; - - while (histo_bounds.back() < 0.05) - { - histo_bounds.push_back(histo_bounds.back() + width); - } - histo_bounds.push_back(std::numeric_limits::max()); - std::vector histo(histo_bounds.size()); - - for (auto& row : distances) { - for (const auto& e : row) { - for (size_t i = 0; i < histo_bounds.size(); ++i) { - if (e.d < histo_bounds[i]) { - ++histo[i]; - break; - } - } - } - } - - LOG_VERBOSE << endl << "Distance histogram" << endl; - for (size_t i = 0; i < histo_bounds.size(); ++i) { - LOG_VERBOSE << " d < " << histo_bounds[i] << ": " << histo[i] << endl; - } - LOG_VERBOSE << endl; - } - - return n_total_distances; -} - - -/*********************************************************************************************************************/ -int SparseMatrixNumbered::saveAssignments( - std::ofstream& ofs, - const std::vector& globalNames, - const std::vector& assignments, - char separator) { - - int n_clusters = *std::max_element(assignments.begin(), assignments.end()) + 1; - int singleton_id = n_clusters; - - ofs << "object" << separator << "cluster" << endl; - - if (globalNames.empty()) { - for (size_t local_id = 0; local_id < local2global.size(); ++local_id) { - ofs << local2global[local_id] << "," << assignments[local_id] << endl; - } - } - else { - std::vector globalAssignments(globalNames.size()); - - - for (size_t i = 0; i < globalNames.size(); ++i) { - - int local_id = get_local_id(i); - if (local_id == -1) { - // not in matrix - globalAssignments[i] = singleton_id++; - } - else { - globalAssignments[i] = assignments[local_id]; - } - } - - for (size_t i = 0; i < globalNames.size(); ++i) { - ofs << globalNames[i] << separator << globalAssignments[i] << endl; - } - } - - return singleton_id; // return total number of clusters -} - - -/*********************************************************************************************************************/ -int SparseMatrixNumbered::saveRepresentatives( - std::ofstream& ofs, - const std::vector& globalNames, - const std::vector& assignments, - char separator -) { - - int n_clusters = *std::max_element(assignments.begin(), assignments.end()) + 1; - - std::vector names; - if (globalNames.empty()) { - names.resize(global2local.size()); - int i = 0; - std::generate(names.begin(), names.end(), [&i]() { return std::to_string(i++); }); - } - else { - names = globalNames; - } - - std::unordered_map names2globalIds; - - for (size_t i = 0; i < names.size(); ++i) { - names2globalIds[names[i]] = i; - } - - std::vector lowestClusterMembers(n_clusters, std::numeric_limits::max()); - - for (int local_id = 0; local_id < (int)assignments.size(); ++local_id) { - // translate element ids - int global_id = local2global[local_id]; - int cluster_id = assignments[local_id]; - - if (global_id < lowestClusterMembers[cluster_id]) { - lowestClusterMembers[cluster_id] = global_id; - } - } - - std::vector representatives(names.size()); - - int n_singletons = 0; - for (size_t i = 0; i < names.size(); ++i) { - - int local_id = get_local_id(i); - if (local_id == -1) { - // not in matrix - own representative - representatives[i] = names[i]; - ++n_singletons; - // LOG_DEBUG << names[i] << endl; - } - else { - int cluster_id = assignments[local_id]; - int lowest_member = lowestClusterMembers[cluster_id]; - representatives[i] = names[lowest_member]; - } - } - - ofs << "object" << separator << "cluster" << endl; - - for (size_t i = 0; i < names.size(); ++i) { - ofs << names[i] << separator << representatives[i] << endl; - } - - return n_clusters + n_singletons; -} - -/*********************************************************************************************************************/ -void SparseMatrixNamed::print(std::ostream& out) { - - out.precision(10); - - std::vector names(names2ids.size()); - int i = 0; - for (auto q : names2ids) { - names[i] = q.first; - ++i; - } - - std::sort(names.begin(), names.end(), [](const char* a, const char* b) { - return strcmp(a, b) < 0; - }); - - for (auto name : names) { - - int i = names2ids[name].first; - std::vector& row = distances[i]; - - std::sort(row.begin(), row.end(), [this](const dist_t& a, const dist_t& b) { - return strcmp(ids2names[a.u.s.hi], ids2names[b.u.s.hi]) < 0; - }); - - for (auto& p : row) { - out << ids2names[p.u.s.lo] << "," << ids2names[p.u.s.hi] << "," << std::fixed << p.d << std::endl; - } - } -} - diff --git a/src/distances.h b/src/distances.h index 84e93c6..4521281 100644 --- a/src/distances.h +++ b/src/distances.h @@ -7,300 +7,58 @@ // Copyright(C) 2024-2024, A.Gudys, K.Siminski, S.Deorowicz // // ******************************************************************************************* +#include "conversion.h" -#include -#include #include #include #include -#include -#include -#include -#include -#include +#include #include -#include "conversion.h" +/*********************************************************************************************************************/ using distance_transformation_t = std::function; -struct ColumnFilter { - double min{ std::numeric_limits::lowest() }; - double max{ std::numeric_limits::max() }; - bool enabled{ false }; -}; - -struct dist_t { - static const double MAX; - - union { - uint64_t ids; ///< index (id) - struct { - uint32_t lo; ///< numer wiersza - uint32_t hi; ///< numer kolumny - } s; - } u ; +/*********************************************************************************************************************/ +#pragma pack(push, 4) +class dist_t { - double d; ///< distance std::numeric_limits::max() - - dist_t() : d(std::numeric_limits::max()) { u.ids = 0; } - dist_t(uint64_t ids, double d) : d(d) { u.ids = ids; } - dist_t(uint64_t i, uint64_t j, double d) : d(d) { u.s.lo = i; u.s.hi = j;} - - dist_t(const std::pair& rhs) : d(rhs.second) { u.ids = rhs.first;} - - static uint64_t pack(uint64_t i, uint64_t j) { - if (i >= j) { - std::swap(i, j); - } - return (i << 32ULL) | j; - } - - static void unpack(uint64_t packed_ids, uint64_t& lo, uint64_t& hi) { - hi = packed_ids & 0xffffffff; - lo = packed_ids >> 32ULL; - } - - // this is to preserve consistency with similarity variant - // - increasingly by distance - // - decreasingly by id - bool operator<(const dist_t& rhs) const { - return (d == rhs.d) - ? (u.ids > rhs.u.ids) - : (d < rhs.d); - } - - bool operator<=(const dist_t& rhs) const { - return (d == rhs.d) - ? (u.ids >= rhs.u.ids) - : (d <= rhs.d); - } -}; - - + double d; ///< distance std::numeric_limits::max() + uint32_t id; // object identifier -class StringHasher { - std::hash hasher; public: - StringHasher() {} + dist_t() : d(std::numeric_limits::max()), id(0) {} + dist_t(uint32_t id, double d) : d(d), id(id) {} - size_t operator()(const char* s) const { - size_t hs = hasher(*s); - ++s; - while (*s) { - hs ^= hasher(*s); - ++s; - } - return hs; - } -}; - -class StringEqual { - -public: - StringEqual() {} + double get_d() const { return d; } + uint32_t get_id() const { return id; } - bool operator()(const char* a, const char* b) const { - return (std::strcmp(a, b) == 0); + bool operator<(const dist_t& rhs) const { + return (id == rhs.id) ? (d < rhs.d) : (id < rhs.id); } }; +#pragma pack(pop) - -class SparseMatrix -{ -protected: - - size_t n_elements{ 0 }; - - std::vector> distances; - - /** a vector of distances */ - //std::vector distances; - - /** Each row holds a pointer to the first distance in the row. - The last points address right after the collection. - */ - //std::vector rows; - +/*********************************************************************************************************************/ +class mini_dist_t { + uint32_t id; // object identifier public: + mini_dist_t() : id(0) {} + mini_dist_t(uint32_t id, double d) : id(id) {} - SparseMatrix() {} - - virtual ~SparseMatrix() {} - - size_t num_objects() const { return distances.size(); } - - size_t num_elements() const { return n_elements; } - - virtual size_t num_input_objects() const = 0; - - const dist_t* begin(int row_id) const { return distances[row_id].data(); } - const dist_t* end(int row_id) const { return distances[row_id].data() + distances[row_id].size(); } - - void clear(int row_id) { std::vector().swap(distances[row_id]); } - - size_t get_num_neighbours(int i) { return distances[i].size(); } + double get_d() const { return 0.0; } // mini_dist_t is always below threshold + uint32_t get_id() const { return id; } - dist_t get(uint64_t i, uint64_t j) const - { - dist_t v {i, j, dist_t::MAX }; - auto it = std::lower_bound(begin(i), end(i), v, [](const dist_t& a, const dist_t& b) { return a.u.s.hi < b.u.s.hi; }); - - return (it == end(i) || it->u.s.hi != j) ? v : *it; - } - - void extract_row(uint32_t row, uint32_t count, dist_t* out) const - { - const dist_t* cur = this->begin(row); - const dist_t* end = this->end(row); - - for (uint32_t i = 0; i < count; ++i) - { - if (i == row) { - out[i].u.s.lo = out[i].u.s.hi = row; - out[i].d = 0.0; // diagonal element - } - else if (cur == end || i < cur->u.s.hi) - { - // before current or after end - out[i].u.s.lo = row; - out[i].u.s.hi = i; - out[i].d = std::numeric_limits::max(); - } - else - { - out[i] = *cur; - ++cur; - } - } + bool operator<(const mini_dist_t& rhs) const { + return (id < rhs.id); } - - virtual size_t load( - std::ifstream& ifs, - const std::pair& idColumns, - const std::string& distanceColumn, - distance_transformation_t transform, - std::map& columns2filters) = 0; - - virtual int saveAssignments( - std::ofstream& ofs, - const std::vector& externalNames, - const std::vector& assignments, - char separator) = 0; - - virtual int saveRepresentatives( - std::ofstream& ofs, - const std::vector& externalNames, - const std::vector& assignments, - char separator) = 0; - - virtual void print(std::ostream& out) = 0; - -protected: - - void processHeader( - std::ifstream& ifs, - const std::pair& idColumns, - const std::string& distanceColumn, - std::map& columns2filters, - int idColumnsOut[2], - int& distanceColumnOut, - std::vector& filters); - }; - - -class SparseMatrixNamed : public SparseMatrix { - - using ids_pair_t = std::pair; - - std::unordered_map names2ids; - - std::vector ids2names; - - char* namesBuffer{ nullptr }; - +/*********************************************************************************************************************/ +class IMatrix { public: - ~SparseMatrixNamed() { - delete[] namesBuffer; - } - - size_t num_input_objects() const override { return names2ids.size(); } - - int get_id(const char* name) const { - auto it = names2ids.find((char*)name); - if (it == names2ids.end()) { - return -1; - } - else { - return it->second.first; - } - } - - const char* get_name(int id) const { return ids2names[id]; } - - size_t load( - std::ifstream& ifs, - const std::pair& idColumns, - const std::string& distanceColumn, - distance_transformation_t transform, - std::map& columns2filters) override; - - int saveAssignments( - std::ofstream& ofs, - const std::vector& globalNames, - const std::vector& assignments, - char separator) override; - - int saveRepresentatives( - std::ofstream& ofs, - const std::vector& externalNames, - const std::vector& assignments, - char separator) override; - - void print(std::ostream& out) override; - + virtual ~IMatrix() {} }; -class SparseMatrixNumbered : public SparseMatrix { - - std::vector global2local; - std::vector local2global; - -public: - - int get_local_id(int global_id) { - if ((size_t)global_id > global2local.size() - 1) { - return -1; - } - else { - return global2local[global_id]; - } - } - - size_t num_input_objects() const override { return global2local.size(); } - - size_t load( - std::ifstream& ifs, - const std::pair& idColumns, - const std::string& distanceColumn, - distance_transformation_t transform, - std::map& columns2filters) override; - - int saveAssignments( - std::ofstream& ofs, - const std::vector& globalNames, - const std::vector& assignments, - char separator); - - int saveRepresentatives( - std::ofstream& ofs, - const std::vector& externalNames, - const std::vector& assignments, - char separator); - - void print(std::ostream& out) override {} -}; diff --git a/src/graph.cpp b/src/graph.cpp new file mode 100644 index 0000000..18e8aa3 --- /dev/null +++ b/src/graph.cpp @@ -0,0 +1,83 @@ +// ******************************************************************************************* +// This file is a part of Clusty software distributed under GNU GPL 3 license. +// The homepage of the Clusty project is https://github.com/refresh-bio/Clusty +// +// Copyright(C) 2024-2024, A.Gudys, K.Siminski, S.Deorowicz +// +// ******************************************************************************************* + +#include "graph.h" +#include "log.h" + +#include +#include +#include +#include +#include + +using namespace std; + +/*********************************************************************************************************************/ +void Graph::processHeader( + std::ifstream& ifs, + const std::pair& idColumns, + const std::string& distanceColumn, + const std::map& columns2filters, + int idColumnsOut[2], + int& distanceColumnOut, + std::vector& filters) { + + std::string line; + std::getline(ifs, line); + + std::replace(line.begin(), line.end(), ',', ' '); + std::istringstream iss(line); + std::vector columns; + std::copy(std::istream_iterator(iss), std::istream_iterator(), std::back_inserter(columns)); + + if (columns.size() < 3) { + throw std::runtime_error("Error loading distances: at least three columns are required"); + } + + // default column identifiers + int col_ids[]{ 0, 1 }; + int col_distance = 2; // by default use 3rd column as the one with distance + + int n_columns = (int)columns.size(); + + if (!idColumns.first.empty()) { + col_ids[0] = std::find(columns.begin(), columns.end(), idColumns.first) - columns.begin(); + col_ids[1] = std::find(columns.begin(), columns.end(), idColumns.second) - columns.begin(); + + if (col_ids[0] == n_columns || col_ids[1] == n_columns) { + throw std::runtime_error("Error loading distances: id columns not found"); + } + + if (col_ids[0] > col_ids[1]) { + std::swap(col_ids[0], col_ids[1]); + } + } + + if (!distanceColumn.empty()) { + col_distance = std::find(columns.begin(), columns.end(), distanceColumn) - columns.begin(); + if (col_distance == n_columns) { + throw std::runtime_error("Error loading distances: " + distanceColumn + " column not found"); + } + } + + filters.resize(columns.size()); + + for (const auto& f : columns2filters) { + int col = std::find(columns.begin(), columns.end(), f.first) - columns.begin(); + if (col == n_columns) { + throw std::runtime_error("Error loading distances: " + f.first + " column not found"); + } + filters[col] = f.second; + filters[col].enabled = true; + } + + idColumnsOut[0] = col_ids[0]; + idColumnsOut[1] = col_ids[1]; + distanceColumnOut = col_distance; +} + diff --git a/src/graph.h b/src/graph.h new file mode 100644 index 0000000..6f65ee1 --- /dev/null +++ b/src/graph.h @@ -0,0 +1,73 @@ +// ******************************************************************************************* +// This file is a part of Clusty software distributed under GNU GPL 3 license. +// The homepage of the Clusty project is https://github.com/refresh-bio/Clusty +// +// Copyright(C) 2024-2024, A.Gudys, K.Siminski, S.Deorowicz +// +// ******************************************************************************************* +#pragma once +#include "distances.h" + +#include +#include +#include +#include +#include + +// *******************************************************************************************/ +struct ColumnFilter { + double min{ std::numeric_limits::lowest() }; + double max{ std::numeric_limits::max() }; + bool enabled{ false }; +}; + +// *******************************************************************************************/ +class Graph { + +public: + virtual IMatrix& getMatrix() = 0; + + virtual size_t getNumVertices() const = 0; + + virtual size_t getNumInputVertices() const = 0; + + virtual size_t getNumEdges() const = 0; + + virtual size_t load( + std::ifstream& ifs, + const std::pair& idColumns, + const std::string& distanceColumn, + distance_transformation_t transform, + const std::map& columns2filters) = 0; + + virtual int saveAssignments( + std::ofstream& ofs, + const std::vector& externalNames, + const std::vector& assignments, + char separator) const = 0; + + virtual int saveRepresentatives( + std::ofstream& ofs, + const std::vector& externalNames, + const std::vector& assignments, + char separator) const = 0; + + virtual void reorderObjects( + const std::vector& externalNames, + std::vector& objects) const = 0; + + virtual void print(std::ostream& out) const = 0; + + virtual ~Graph() {} + +protected: + + void processHeader( + std::ifstream& ifs, + const std::pair& idColumns, + const std::string& distanceColumn, + const std::map& columns2filters, + int idColumnsOut[2], + int& distanceColumnOut, + std::vector& filters); +}; \ No newline at end of file diff --git a/src/graph_named.h b/src/graph_named.h new file mode 100644 index 0000000..22b3c85 --- /dev/null +++ b/src/graph_named.h @@ -0,0 +1,456 @@ +// ******************************************************************************************* +// This file is a part of Clusty software distributed under GNU GPL 3 license. +// The homepage of the Clusty project is https://github.com/refresh-bio/Clusty +// +// Copyright(C) 2024-2024, A.Gudys, K.Siminski, S.Deorowicz +// +// ******************************************************************************************* +#pragma once +#include "sparse_matrix.h" +#include "graph.h" + +#include +#include +#include +#include + + + +// ******************************************************************************************* +class StringHasher { + std::hash hasher; +public: + StringHasher() {} + + size_t operator()(const char* s) const { + size_t hs = hasher(*s); + ++s; + while (*s) { + hs ^= hasher(*s); + ++s; + } + return hs; + } +}; + +// ******************************************************************************************* +class StringEqual { + +public: + StringEqual() {} + + bool operator()(const char* a, const char* b) const { + return (std::strcmp(a, b) == 0); + } +}; + +/*********************************************************************************************************************/ +template +class GraphNamed : public Graph { + + using ids_pair_t = std::pair; + + SparseMatrix matrix; + + std::unordered_map names2ids; + + std::vector ids2names; + + char* namesBuffer{ nullptr }; + +public: + ~GraphNamed() { + delete[] namesBuffer; + } + + IMatrix& getMatrix() override { return matrix; } + + size_t getNumVertices() const override { return matrix.num_objects(); } + + size_t getNumInputVertices() const override { return this->names2ids.size(); } + + size_t getNumEdges() const override { return matrix.num_elements(); } + + void reorderObjects( + const std::vector& externalNames, + std::vector& objects) const override { + + int obj_id = 0; + for (const std::string& name : externalNames) { + int local_id = get_id(name.c_str()); + if (local_id != -1) { + objects[obj_id++] = local_id; + } + } + } + + size_t load( + std::ifstream& ifs, + const std::pair& idColumns, + const std::string& distanceColumn, + distance_transformation_t transform, + const std::map& columns2filters) override; + + int saveAssignments( + std::ofstream& ofs, + const std::vector& globalNames, + const std::vector& assignments, + char separator) const override; + + int saveRepresentatives( + std::ofstream& ofs, + const std::vector& externalNames, + const std::vector& assignments, + char separator) const override; + + void print(std::ostream& out) const override; + +protected: + + int get_id(const char* name) const { + auto it = names2ids.find((char*)name); + if (it == names2ids.end()) { + return -1; + } + else { + return it->second.first; + } + } +}; + + + +/*********************************************************************************************************************/ +template +size_t GraphNamed::load( + std::ifstream& ifs, + const std::pair& idColumns, + const std::string& distanceColumn, + distance_transformation_t transform, + const std::map& columns2filters) { + + size_t n_total_distances = 0; + const size_t N = 512ULL << 20; // 128MB buffer + char* buf = new char[N]; + char* buf_end = buf + N; + + // get header + int col_ids[]{ 0, 1 }; + int col_distance = 2; // by default use 3rd column as the one with distance + std::vector filters; + processHeader(ifs, idColumns, distanceColumn, columns2filters, col_ids, col_distance, filters); + int n_columns = filters.size(); + + auto is_sep = [](char c) {return c == ',' || c == '\t' || c == '\r' || c == '\t'; }; + auto is_newline = [](char c) {return c == '\r' || c == '\n'; }; + + namesBuffer = new char[1LL << 30]; // 1 GB buffer for names + char* raw_ptr = namesBuffer; + + // assume space for 8M objects + matrix.distances.reserve(8LL << 20); + + bool continueReading = true; + char* place = buf; + + while (continueReading) { + size_t n_wanted = buf_end - place; + ifs.read(place, n_wanted); + size_t n_read = ifs.gcount(); + char* block_end = place + n_read; + int offset = 0; + + // no more data + if (n_read < n_wanted) { + continueReading = false; + } + else { + // find last newline + while (!is_newline(*(block_end - 1))) { + --block_end; + ++offset; + } + } + + //LOG_DEBUG << "portion: " << n_read << ", offset: " << offset << ", carryOn: " << carryOn << endl; + + // pass through buffer + char* line = buf; + while (true) { + char* line_end = std::find_if(line, block_end, is_newline); + + // no more lines + if (line_end == block_end) { + break; + } + + ++n_total_distances; + + char* p = line; + decltype(names2ids.begin()) its[2]; + double d = std::numeric_limits::max(); + + int k = 0; + bool carryOn = true; + + for (int c = 0; c < n_columns; ++c) { + char* q = std::find_if(p, line_end, is_sep); // support both tsv and csv files + *q = 0; + + if (k < 2 && c == col_ids[k]) { + char* name = p; + + // store name in hashtable + its[k] = names2ids.find(name); + + if (its[k] == names2ids.end()) { + + char* localName = raw_ptr; + while (*raw_ptr++ = *name++) {} + auto it_and_flag = names2ids.insert({ localName, {-1, names2ids.size()} }); // -1 indicate singleton + its[k] = it_and_flag.first; + } + + ++k; + } + else if (c == col_distance || filters[c].enabled) { + double value = Conversions::strtod(p, &p); + + if (c == col_distance) { + d = transform(value); // convert similarity to distance if neccessary + } + + // check distance condition + if (value < filters[c].min || value > filters[c].max) { + carryOn = false; + break; + } + } + + p = q + 1; + } + + // move to the next line + line = std::find_if(line_end, block_end, [](char c) { return c != '\r' && c != '\n' && c != 0; }); + + // do not consider rows not fulfilling conditions + if (carryOn == false) { + continue; + } + + // translate seq names to numerical ids + for (int k = 0; k < 2; ++k) { + auto it = its[k]; + + // if name not mapped to numerical ids + if (it->second.first == -1) { + ids2names.push_back(it->first); + it->second.first = ids2names.size() - 1; + } + } + + uint32_t i = std::min(its[0]->second.first, its[1]->second.first); + uint32_t j = std::max(its[0]->second.first, its[1]->second.first); + + // omit diagonal elements - they are assumed to have 0 distance + if (i == j) { + continue; + } + + if (matrix.distances.size() <= j) { + matrix.distances.resize(j + 1); + } + + auto& Di = matrix.distances[i]; + auto& Dj = matrix.distances[j]; + + // extend capacity by factor 1.5 with 16 as an initial state + if (Di.capacity() == Di.size()) { + Di.reserve(Di.capacity() == 0 ? 16 : size_t(Di.capacity() * 1.5)); + } + + if (Dj.capacity() == Dj.size()) { + Dj.reserve(Dj.capacity() == 0 ? 16 : size_t(Dj.capacity() * 1.5)); + } + + Di.emplace_back(j, d); + Dj.emplace_back(i, d); + } + + // copy remaining part after consuming all the lines + if (continueReading && offset > 0) { + memcpy(buf, block_end, offset); + place = buf + offset; + } + else { + place = buf; + } + } + + // if neccessary, sort distances in rows according to the second id + matrix.n_elements = 0; + + for (auto& row : matrix.distances) { + std::sort(row.begin(), row.end()); + auto newEnd = std::unique(row.begin(), row.end(), [](const Distance& a, const Distance& b) { return a.get_id() == b.get_id(); }); + + row.erase(newEnd, row.end()); + + matrix.n_elements += row.size(); + } + + delete[] buf; + + return n_total_distances; +} + + + +/*********************************************************************************************************************/ +template +int GraphNamed::saveRepresentatives( + std::ofstream& ofs, + const std::vector& globalNames, + const std::vector& assignments, + char separator) const +{ + int n_clusters = *std::max_element(assignments.begin(), assignments.end()) + 1; + + std::vector names; + if (globalNames.empty()) { + names.resize(names2ids.size()); + std::vector>> entries(names2ids.size()); + std::copy(names2ids.begin(), names2ids.end(), entries.begin()); + std::sort(entries.begin(), entries.end(), [](const auto& a, const auto& b) { return a.second.second < b.second.second; }); + + std::transform(entries.begin(), entries.end(), names.begin(), [](const auto& entry) { return std::string(entry.first); }); + } + else { + names = globalNames; + } + + std::unordered_map names2globalIds; + + for (size_t i = 0; i < names.size(); ++i) { + names2globalIds[names[i]] = i; + } + + std::vector lowestClusterMembers(n_clusters, std::numeric_limits::max()); + + for (size_t i = 0; i < assignments.size(); ++i) { + // translate element ids + std::string name = ids2names[i]; + int global_id = names2globalIds[name]; + int cluster_id = assignments[i]; + + if (global_id < lowestClusterMembers[cluster_id]) { + lowestClusterMembers[cluster_id] = global_id; + } + } + + std::vector representatives(names.size()); + + int n_singletons = 0; + for (size_t i = 0; i < names.size(); ++i) { + + int id = get_id(names[i].c_str()); + if (id == -1) { + // not in matrix - own representative + representatives[i] = &names[i]; + ++n_singletons; + // LOG_DEBUG << names[i] << endl; + } + else { + int cluster_id = assignments[id]; + int lowest_member = lowestClusterMembers[cluster_id]; + representatives[i] = &names[lowest_member]; + } + } + + ofs << "object" << separator << "cluster" << std::endl; + + for (size_t i = 0; i < names.size(); ++i) { + ofs << names[i] << separator << *representatives[i] << std::endl; + } + + return n_clusters + n_singletons; +} + + + +/*********************************************************************************************************************/ +template +int GraphNamed::saveAssignments( + std::ofstream& ofs, + const std::vector& globalNames, + const std::vector& assignments, + char separator) const { + + int n_clusters = *std::max_element(assignments.begin(), assignments.end()) + 1; + + std::vector names; + if (globalNames.empty()) { + names.resize(names2ids.size()); + std::vector>> entries(names2ids.size()); + std::copy(names2ids.begin(), names2ids.end(), entries.begin()); + std::sort(entries.begin(), entries.end(), [](const auto& a, const auto& b) { return a.second.second < b.second.second; }); + + std::transform(entries.begin(), entries.end(), names.begin(), [](const auto& entry) { return std::string(entry.first); }); + } + else { + names = globalNames; + } + + std::vector globalAssignments(names.size()); + + int singleton_id = n_clusters; + + for (size_t i = 0; i < names.size(); ++i) { + + int id = get_id(names[i].c_str()); + if (id == -1) { + // not in matrix + globalAssignments[i] = singleton_id++; + } + else { + globalAssignments[i] = assignments[id]; + } + } + + ofs << "object" << separator << "cluster" << std::endl; + + for (size_t i = 0; i < names.size(); ++i) { + ofs << names[i] << separator << globalAssignments[i] << std::endl; + } + + return singleton_id; +} + +/*********************************************************************************************************************/ +template +void GraphNamed::print(std::ostream& out) const { + + out.precision(10); + + std::vector names(names2ids.size()); + int i = 0; + for (auto q : names2ids) { + names[i] = q.first; + ++i; + } + + std::sort(names.begin(), names.end(), [](const char* a, const char* b) { + return strcmp(a, b) < 0; + }); + + for (auto name : names) { + + int i = names2ids.at(name).first; + const std::vector& row = matrix.distances[i]; + + for (auto& p : row) { + out << ids2names[i] << "," << ids2names[p.get_id()] << "," << std::fixed << p.get_d() << std::endl; + } + } +} + diff --git a/src/graph_numbered.h b/src/graph_numbered.h new file mode 100644 index 0000000..d3f97b2 --- /dev/null +++ b/src/graph_numbered.h @@ -0,0 +1,407 @@ +// ******************************************************************************************* +// This file is a part of Clusty software distributed under GNU GPL 3 license. +// The homepage of the Clusty project is https://github.com/refresh-bio/Clusty +// +// Copyright(C) 2024-2024, A.Gudys, K.Siminski, S.Deorowicz +// +// ******************************************************************************************* +#pragma once +#include "sparse_matrix.h" +#include "graph.h" +#include "log.h" + +#include +#include +#include +#include +#include + + +template +class GraphNumbered : public Graph { + + std::vector global2local; + std::vector local2global; + + SparseMatrix matrix; + +public: + + IMatrix& getMatrix() override { return matrix; } + + size_t getNumVertices() const override { return matrix.num_objects(); } + + size_t getNumInputVertices() const override { return global2local.size(); } + + size_t getNumEdges() const override { return matrix.num_elements(); } + + void reorderObjects( + const std::vector& externalNames, + std::vector& objects) const override { + + int obj_id = 0; + for (size_t i = 0; i < externalNames.size(); ++i) { + int local_id = get_local_id(i); + if (local_id != -1) { + objects[obj_id++] = local_id; + } + } + } + + size_t load( + std::ifstream& ifs, + const std::pair& idColumns, + const std::string& distanceColumn, + distance_transformation_t transform, + const std::map& columns2filters) override; + + int saveAssignments( + std::ofstream& ofs, + const std::vector& globalNames, + const std::vector& assignments, + char separator) const override; + + int saveRepresentatives( + std::ofstream& ofs, + const std::vector& externalNames, + const std::vector& assignments, + char separator) const override; + + void print(std::ostream& out) const override {} + +protected: + int get_local_id(int global_id) const { + if ((size_t)global_id > global2local.size() - 1) { + return -1; + } + else { + return global2local[global_id]; + } + } +}; + + + +/*********************************************************************************************************************/ +template +size_t GraphNumbered::load( + std::ifstream& ifs, + const std::pair& idColumns, + const std::string& distanceColumn, + distance_transformation_t transform, + const std::map& columns2filters) { + + size_t n_total_distances = 0; + const size_t N = 512ULL << 20; + char* buf = new char[N]; + char* buf_end = buf + N - 1; + + // get header + int col_ids[]{ 0, 1 }; + int col_distance = 2; // by default use 3rd column as the one with distance + std::vector filters; + processHeader(ifs, idColumns, distanceColumn, columns2filters, col_ids, col_distance, filters); + int n_columns = filters.size(); + + auto is_sep = [](char c) {return c == ',' || c == '\t' || c == '\r' || c == '\t'; }; + auto is_newline = [](char c) {return c == '\r' || c == '\n'; }; + + // assume space for 8M objects + matrix.distances.reserve(8LL << 20); + global2local.reserve(8LL << 20); + local2global.reserve(8LL << 20); + + bool continueReading = true; + char* place = buf; + + while (continueReading) { + size_t n_wanted = buf_end - place; + ifs.read(place, n_wanted); + size_t n_read = ifs.gcount(); + char* block_end = place + n_read; + int offset = 0; + + // no more data + if (n_read < n_wanted) { + continueReading = false; + } + else { + // find last newline + while (!is_newline(*(block_end - 1))) { + --block_end; + ++offset; + } + } + + //LOG_DEBUG << "portion: " << n_read << ", offset: " << offset << ", carryOn: " << continueReading << endl; + + // pass through buffer + char* line = buf; + while (true) { + char* line_end = std::find_if(line, block_end, is_newline); + + // no more lines + if (line_end == block_end) { + break; + } + + ++n_total_distances; + + char* p = line; + double d = std::numeric_limits::max(); + int k = 0; + bool carryOn = true; + + int global_ids[2]; + + for (int c = 0; c < n_columns; ++c) { + char* q = std::find_if(p, line_end, is_sep); // support both tsv and csv files + *q = 0; + + if (k < 2 && c == col_ids[k]) { + global_ids[k] = Conversions::strtol(p, nullptr); + + if (global_ids[k] + 1 > (int)global2local.size()) { + global2local.resize(global_ids[k] + 1, -1); + } + + ++k; + } + else if (c == col_distance || filters[c].enabled) { + double value = Conversions::strtod(p, &p); + + if (c == col_distance) { + d = transform(value); // convert similarity to distance if neccessary + } + + // check distance condition + if (value < filters[c].min || value > filters[c].max) { + carryOn = false; + break; + } + } + + p = q + 1; + } + + // move to the next line + line = std::find_if(line_end, block_end, [](char c) { return c != '\r' && c != '\n' && c != 0; }); + + // do not consider rows not fulfilling conditions + if (carryOn == false) { + continue; + } + + int pair_ids[2]; + + + for (int k = 0; k < 2; ++k) { + + int gid = global_ids[k]; + int lid = global2local[gid]; + + if (lid == -1) { + lid = local2global.size(); + local2global.push_back(gid); + global2local[gid] = lid; + } + + pair_ids[k] = lid; + } + + uint32_t i = std::min(pair_ids[0], pair_ids[1]); + uint32_t j = std::max(pair_ids[0], pair_ids[1]); + + // omit diagonal elements - they are assumed to have 0 distance + if (i == j) { + continue; + } + + if (matrix.distances.size() <= j) { + matrix.distances.resize(j + 1); + } + + auto& Di = matrix.distances[i]; + auto& Dj = matrix.distances[j]; + + // extend capacity by factor 1.5 with 16 as an initial state + if (Di.capacity() == Di.size()) { + Di.reserve(Di.capacity() == 0 ? 16 : size_t(Di.capacity() * 1.5)); + } + + if (Dj.capacity() == Dj.size()) { + Dj.reserve(Dj.capacity() == 0 ? 16 : size_t(Dj.capacity() * 1.5)); + } + + Di.emplace_back(j, d); + Dj.emplace_back(i, d); + } + + // copy remaining part after consuming all the lines + if (continueReading && offset > 0) { + memcpy(buf, block_end, offset); + place = buf + offset; + } + else { + place = buf; + } + } + + // if neccessary, sort distances in rows according to the second id + matrix.n_elements = 0; + + for (auto& row : matrix.distances) { + std::sort(row.begin(), row.end()); + auto newEnd = std::unique(row.begin(), row.end(), [](const Distance& a, const Distance& b) { return a.get_id() == b.get_id(); }); + + row.erase(newEnd, row.end()); + matrix.n_elements += row.size(); + } + + delete[] buf; + + // Print distance histogram in the verbose mode + if (Log::getInstance(Log::LEVEL_VERBOSE).isEnabled()) { + + std::vector histo_bounds{ 0 }; + double width = 0.001; + + while (histo_bounds.back() < 0.05) + { + histo_bounds.push_back(histo_bounds.back() + width); + } + histo_bounds.push_back(std::numeric_limits::max()); + std::vector histo(histo_bounds.size()); + + for (auto& row : matrix.distances) { + for (const auto& e : row) { + for (size_t i = 0; i < histo_bounds.size(); ++i) { + if (e.get_d() < histo_bounds[i]) { + ++histo[i]; + break; + } + } + } + } + + LOG_VERBOSE << std::endl << "Distance histogram" << std::endl; + for (size_t i = 0; i < histo_bounds.size(); ++i) { + LOG_VERBOSE << " d < " << histo_bounds[i] << ": " << histo[i] << std::endl; + } + LOG_VERBOSE << std::endl; + } + + return n_total_distances; +} + + +/*********************************************************************************************************************/ +template +int GraphNumbered::saveRepresentatives( + std::ofstream& ofs, + const std::vector& globalNames, + const std::vector& assignments, + char separator) const +{ + + int n_clusters = *std::max_element(assignments.begin(), assignments.end()) + 1; + + std::vector names; + if (globalNames.empty()) { + names.resize(global2local.size()); + int i = 0; + std::generate(names.begin(), names.end(), [&i]() { return std::to_string(i++); }); + } + else { + names = globalNames; + } + + std::unordered_map names2globalIds; + + for (size_t i = 0; i < names.size(); ++i) { + names2globalIds[names[i]] = i; + } + + std::vector lowestClusterMembers(n_clusters, std::numeric_limits::max()); + + for (int local_id = 0; local_id < (int)assignments.size(); ++local_id) { + // translate element ids + int global_id = local2global[local_id]; + int cluster_id = assignments[local_id]; + + if (global_id < lowestClusterMembers[cluster_id]) { + lowestClusterMembers[cluster_id] = global_id; + } + } + + std::vector representatives(names.size()); + + int n_singletons = 0; + for (size_t i = 0; i < names.size(); ++i) { + + int local_id = get_local_id(i); + if (local_id == -1) { + // not in matrix - own representative + representatives[i] = names[i]; + ++n_singletons; + // LOG_DEBUG << names[i] << endl; + } + else { + int cluster_id = assignments[local_id]; + int lowest_member = lowestClusterMembers[cluster_id]; + representatives[i] = names[lowest_member]; + } + } + + ofs << "object" << separator << "cluster" << std::endl; + + for (size_t i = 0; i < names.size(); ++i) { + ofs << names[i] << separator << representatives[i] << std::endl; + } + + return n_clusters + n_singletons; +} + + +/*********************************************************************************************************************/ +template +int GraphNumbered::saveAssignments( + std::ofstream& ofs, + const std::vector& globalNames, + const std::vector& assignments, + char separator) const { + + int n_clusters = *std::max_element(assignments.begin(), assignments.end()) + 1; + int singleton_id = n_clusters; + + ofs << "object" << separator << "cluster" << std::endl; + + if (globalNames.empty()) { + for (size_t local_id = 0; local_id < local2global.size(); ++local_id) { + ofs << local2global[local_id] << "," << assignments[local_id] << std::endl; + } + } + else { + std::vector globalAssignments(globalNames.size()); + + + for (size_t i = 0; i < globalNames.size(); ++i) { + + int local_id = get_local_id(i); + if (local_id == -1) { + // not in matrix + globalAssignments[i] = singleton_id++; + } + else { + globalAssignments[i] = assignments[local_id]; + } + } + + for (size_t i = 0; i < globalNames.size(); ++i) { + ofs << globalNames[i] << separator << globalAssignments[i] << std::endl; + } + } + + return singleton_id; // return total number of clusters +} + diff --git a/src/leiden.h b/src/leiden.h index a3caf90..937ed59 100644 --- a/src/leiden.h +++ b/src/leiden.h @@ -11,6 +11,7 @@ #include "clustering.h" #include + #ifndef NO_LEIDEN #include #endif @@ -22,8 +23,9 @@ struct LeidenParams { int numIterations{ 2 }; }; -template -class Leiden : public IClustering { + +template +class Leiden : public IClustering { private: LeidenParams params; @@ -38,7 +40,7 @@ class Leiden : public IClustering { } int operator()( - const DistanceMatrix& distances, + SparseMatrix& distances, const std::vector& objects, double threshold, std::vector& assignments) override { @@ -51,7 +53,7 @@ class Leiden : public IClustering { Leiden(const LeidenParams& params) : params(params) {} int operator()( - const DistanceMatrix& distances, + SparseMatrix& distances, const std::vector& objects, double threshold, std::vector& assignments) override { @@ -80,7 +82,7 @@ class Leiden : public IClustering { } - void load_graph(const DistanceMatrix& matrix, igraph_t& g, igraph_vector_t& edge_weights) { + void load_graph(SparseMatrix& matrix, igraph_t& g, igraph_vector_t& edge_weights) { igraph_vector_int_t edges; igraph_vector_int_init(&edges, 0); @@ -88,13 +90,13 @@ class Leiden : public IClustering { igraph_vector_init(&edge_weights, 0); for (int i = 0; i < matrix.num_objects(); ++i) { - for (const dist_t* edge = matrix.begin(i); edge < matrix.end(i); ++edge) { - igraph_vector_int_push_back(&edges, edge->u.s.lo); - igraph_vector_int_push_back(&edges, edge->u.s.hi); - igraph_vector_push_back(&edge_weights, 1.0 - edge->d); + for (const Distance* edge = matrix.begin(i); edge < matrix.end(i); ++edge) { + igraph_vector_int_push_back(&edges, i); + igraph_vector_int_push_back(&edges, edge->get_id()); + igraph_vector_push_back(&edge_weights, 1.0 - edge->get_d()); } - matrix.clear(i); + matrix.clear_row(i); } /* diff --git a/src/linkage_heaptrix.h b/src/linkage_heaptrix.h index a24f71f..e55f72d 100644 --- a/src/linkage_heaptrix.h +++ b/src/linkage_heaptrix.h @@ -42,12 +42,12 @@ namespace linkage_algorithm_heaptrix std::vector groups; }; - std::ostream & operator << (std::ostream & sos, const group & g) + inline std::ostream & operator << (std::ostream & sos, const group & g) { return sos << g.id << ": d(" << g.left << ", " << g.right << ") == " << g.distance; } - std::ostream & operator << (std::ostream & sos, const dendrogram & d) + inline std::ostream & operator << (std::ostream & sos, const dendrogram & d) { sos << "--------------------" << std::endl; for (const auto g : d.groups) @@ -59,8 +59,8 @@ namespace linkage_algorithm_heaptrix } - template - class linkage : public HierarchicalClustering + template + class linkage : public HierarchicalClustering { const std::size_t MAX_SIZE_T = std::numeric_limits::max(); // Index of the location in the heap. MAX means no index set. const double MAX_DOUBLE = std::numeric_limits::max(); @@ -68,14 +68,15 @@ namespace linkage_algorithm_heaptrix const std::string MAX_LABEL {"--"}; /*** An element of a matrix and a heap simultaneously. */ + #pragma pack(push, 8) struct element { double _value; ///< stored value - std::size_t _row; ///< index of a row in the matrix - std::size_t _column; ///< index of a column in the matrix - std::size_t _index_heap; ///< index of an element in the heap + int32_t _row; ///< index of a row in the matrix + int32_t _column; ///< index of a column in the matrix + std::size_t _index_heap; ///< index of an element in the heap - element (const std::size_t row, const std::size_t column, const std::size_t index_heap, const double value) + element (const int32_t row, const int32_t column, const std::size_t index_heap, const double value) { _row = row; _column = column; @@ -111,6 +112,7 @@ namespace linkage_algorithm_heaptrix sos << "{(" << _row << ", " << _column << ") [" << _index_heap << "] " << _value << "}"; } }; + #pragma pack(pop) class matrix_row_ht { @@ -766,7 +768,7 @@ namespace linkage_algorithm_heaptrix } public: - void read_matrix (DistanceMatrix & m) + void read_matrix (SparseMatrix & m) { _matrix._rows.clear(); @@ -774,13 +776,13 @@ namespace linkage_algorithm_heaptrix for (size_t i = 0; i < m.num_objects(); ++i) { - for (const dist_t* p = m.begin(i); p < m.end(i); ++p) { - const dist_t& edge = *p; - if (edge.d != MAX_DOUBLE and edge.d != INF_DOUBLE and edge.u.s.lo != edge.u.s.hi) - add_value(edge.u.s.lo, edge.u.s.hi, edge.d); + for (const Distance* p = m.begin(i); p < m.end(i); ++p) { + const Distance& edge = *p; + if (edge.get_d() != MAX_DOUBLE and edge.get_d() != INF_DOUBLE and i != edge.get_id()) + add_value(i, edge.get_id(), edge.get_d()); } - m.clear(i); + m.clear_row(i); } _heap.make_heap(); @@ -988,7 +990,7 @@ namespace linkage_algorithm_heaptrix */ public: - dendrogram do_clustering (DistanceMatrix & m) + dendrogram do_clustering (SparseMatrix & m) { ksi::clock stopwatch; LOG_VERBOSE << "Loading data into heap "; @@ -1021,7 +1023,7 @@ namespace linkage_algorithm_heaptrix public: int run ( - DistanceMatrix& matrix, + SparseMatrix& matrix, const std::vector& objects, double threshold, std::vector& assignments @@ -1064,18 +1066,19 @@ namespace linkage_algorithm_heaptrix }; - template > - class complete_linkage : public linkage + template > + class complete_linkage : public linkage { }; - template - class single_linkage : public linkage> + template + class single_linkage : public linkage> { }; } -std::ostream & operator << (std::ostream & sos, const std::vector> & dist) + +inline std::ostream & operator << (std::ostream & sos, const std::vector> & dist) { for (const auto& [w, k, d] : dist) sos << "[" << w << ", " << k << ", " << d << "] "; @@ -1083,12 +1086,12 @@ std::ostream & operator << (std::ostream & sos, const std::vector -class SingleLinkage : public linkage_algorithm_heaptrix::single_linkage +template +class SingleLinkage : public linkage_algorithm_heaptrix::single_linkage { int operator() ( - const DistanceMatrix& distances, + SparseMatrix& distances, const std::vector& objects, double threshold, std::vector& assignments @@ -1099,12 +1102,12 @@ class SingleLinkage : public linkage_algorithm_heaptrix::single_linkage -class CompleteLinkage : public linkage_algorithm_heaptrix::complete_linkage +template +class CompleteLinkage : public linkage_algorithm_heaptrix::complete_linkage { int operator() ( - const DistanceMatrix& distances, + SparseMatrix& distances, const std::vector& objects, double threshold, std::vector& assignments diff --git a/src/main.cpp b/src/main.cpp index cdfd931..1794659 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -5,200 +5,35 @@ // Copyright(C) 2024-2024, A.Gudys, K.Siminski, S.Deorowicz // // ******************************************************************************************* - -#include -#include -#include -#include -#include -#include -#include - -#include "linkage_heaptrix.h" -#include "uclust.h" -#include "set_cover.h" -#include "single_bfs.h" -#include "cd_hit.h" -#include "leiden.h" - #include "console.h" #include "log.h" -#define VAL(str) #str -#define TOSTRING(str) VAL(str) -#include "version.h" +#include +#include using namespace std; -#define debug(x) std::cerr << __FILE__ << " (" << __LINE__ << ") " << #x << " == " << (x) << std::endl - - int main(int argc, char** argv) { - - Log::getInstance(Log::LEVEL_NORMAL).enable(); - try { - LOG_NORMAL << "Clusty" << endl - << " version " << VERSION -#ifdef GIT_COMMIT - << "-" << TOSTRING(GIT_COMMIT) -#endif - << " (" << DATE << ")" << endl << endl; - - Console console; - - if (!console.parse(argc, argv)) { - console.printUsage(); - return 0; - } - - if (console.verbose) { - Log::getInstance(Log::LEVEL_VERBOSE).enable(); - } - - shared_ptr dists; - std::shared_ptr> clustering; - - switch (console.algo) - { - case Algo::SingleLinkage: - clustering = make_shared>(); break; - case Algo::CompleteLinkage: - clustering = make_shared>(); break; - case Algo::UClust: - clustering = make_shared>(); break; - case Algo::SetCover: - clustering = make_shared>(); break; - case Algo::CdHit: - clustering = make_shared>(); break; - case Algo::Leiden: - clustering = make_shared>(console.leidenParams); break; - default: - throw std::runtime_error("Unkown clustering algorithm"); - } - LOG_NORMAL << "Loading pairwise distances from " << console.distancesFile << "... "; - auto t = std::chrono::high_resolution_clock::now(); - - vector filebuf(128ULL << 20); // 128MB buffer - ifstream ifs; - ifs.rdbuf()->pubsetbuf(filebuf.data(), filebuf.size()); - ifs.open(console.distancesFile, ios_base::binary); - - if (!ifs) { - throw std::runtime_error("Unable to open distance file"); - } - - if (console.numericIds) { - dists = make_shared(); - } - else { - dists = make_shared(); - } - - map transforms { - { DistanceSpecification::Distance, [](double d) { return d; } }, - { DistanceSpecification::Similarity, [](double d) { return 1.0 - d; } }, - { DistanceSpecification::PercentSimilarity, [](double d) { return 1.0 - d * 0.01; } }, - }; - - size_t n_total_dists = dists->load(ifs, console.idColumns, console.distanceColumn, - transforms[console.distanceSpecification], console.columns2filters); - auto dt = std::chrono::high_resolution_clock::now() - t; - - ifs.close(); - LOG_NORMAL << endl - << " input graph: " << dists->num_input_objects() << " nodes, " << n_total_dists << " edges" << endl - << " filtered graph: " << dists->num_objects() << " nodes, " << dists->num_elements() << " edges" << endl - << " time [s]: " << chrono::duration(dt).count() << endl; - - vector names; - vector objects(dists->num_objects()); - std::iota(objects.begin(), objects.end(), 0); - - if (!console.objectsFile.empty()) { - LOG_NORMAL << "Loading objects from " << console.objectsFile << "... "; - ifs.open(console.objectsFile); - - if (ifs) { - t = std::chrono::high_resolution_clock::now(); - string token; - token.reserve(16 << 10); // 16k buffer - - // omit header - getline(ifs, token); - auto is_sep = [](char c) {return c == ',' || c == '\t' || c == '\r' || c == '\t'; }; + Console console; + Params params; - while (getline(ifs, token)) { - auto it = find_if(token.begin(), token.end(), is_sep); - if (it != token.begin()) { - names.emplace_back(token.begin(), it); - } - } + std::vector objects; + std::vector names; + std::vector assignments; - ifs.close(); - LOG_NORMAL << endl; - - // reorder sequences according to the objects file - if (auto m = dynamic_pointer_cast(dists)) { - int obj_id = 0; - for (const string& name : names) { - int local_id = m->get_id(name.c_str()); - if (local_id != -1) { - objects[obj_id++] = local_id; - } - } - } - else { - auto q = dynamic_pointer_cast(dists); - int obj_id = 0; - for (size_t i = 0; i < names.size(); ++i) { - int local_id = q->get_local_id(i); - if (local_id != -1) { - objects[obj_id++] = local_id; - } - } - } - - dt = std::chrono::high_resolution_clock::now() - t; - LOG_NORMAL << " total objects: " << names.size() << endl - << " time [s]: " << chrono::duration(dt).count() << endl; + if (!console.init(argc, argv, params)) { + return 0; } - else { - throw std::runtime_error("Unable to open objects file"); - } - } - LOG_NORMAL << "Clustering (algorithm: " << Console::algo2str(console.algo) << ")... "; - vector assignments; - t = std::chrono::high_resolution_clock::now(); + std::unique_ptr graph = console.loadGraph(params); - double threshold = std::nexttoward(std::numeric_limits::max(), 0.0); - int n_clusters = (*clustering)(*dists, objects, threshold, assignments); - dt = std::chrono::high_resolution_clock::now() - t; - LOG_NORMAL << endl - << " objects: " << dists->num_objects() << ", clusters: " << n_clusters << endl - << " time [s]: " << chrono::duration(dt).count() << endl; - - LOG_NORMAL << "Saving clusters... "; - t = std::chrono::high_resolution_clock::now(); + console.loadObjects(params, *graph, objects, names); + console.doClustering(params, *graph, objects, assignments); + console.saveAssignments(params, *graph, names, assignments); - char sep = console.outputCSV ? ',' : '\t'; - - ofstream ofs(console.output); - int n_total_clusters = 0; - if (console.outputRepresentatives) { - n_total_clusters = dists->saveRepresentatives(ofs, names, assignments, sep); - } - else { - n_total_clusters = dists->saveAssignments(ofs, names, assignments, sep); - } - - dt = std::chrono::high_resolution_clock::now() - t; - LOG_NORMAL << endl - << " total clusters (including singletons): " << n_total_clusters << endl - << " time [s]: " << chrono::duration(dt).count() << endl; } catch (const std::string & message) { @@ -208,7 +43,7 @@ int main(int argc, char** argv) catch (const std::exception & exc) { LOG_NORMAL << exc.what() << std::endl; - return -1; + return -1; } catch (...) { diff --git a/src/params.cpp b/src/params.cpp new file mode 100644 index 0000000..15e3b8e --- /dev/null +++ b/src/params.cpp @@ -0,0 +1,117 @@ +// ******************************************************************************************* +// This file is a part of Clusty software distributed under GNU GPL 3 license. +// The homepage of the Clusty project is https://github.com/refresh-bio/Clusty +// +// Copyright(C) 2024-2024, A.Gudys, K.Siminski, S.Deorowicz +// +// ******************************************************************************************* + +#include "params.h" +#include "distances.h" +#include "log.h" +#include "leiden.h" + +#include +#include +#include + +using namespace std; + +void Params::printUsage() const { + LOG_NORMAL << "Usage:" << endl + << "clusty [options] " << endl << endl + << "Parameters:" << endl + << " - input TSV/CSV table with pairwise distances" << endl + << " - output TSV/CSV table with assignments" << endl << endl + << "Options:" << endl + << " " + PARAM_FILE_OBJECTS + " - optional TSV/CSV file with object names in the first column sorted decreasingly w.r.t. representativness" << endl + << " " + PARAM_ALGO + " - clustering algorithm:" << endl + << " * single - single linkage (connected component, MMseqs mode 1)" << endl + << " * complete - complete linkage" << endl + << " * uclust - UCLUST" << endl + << " * set-cover - greedy set cover (MMseqs mode 0)" << endl + << " * cd-hit - CD-HIT (greedy incremental; MMseqs mode 2)" << endl + << " * leiden - Leiden algorithm" << endl << endl + + << " " + PARAM_ID_COLUMNS + " - names of columns with sequence identifiers (default: two first columns)" << endl + << " " + PARAM_DISTANCE_COLUMN + " - name of the column with pairwise distances (or similarities; default: third column)" << endl + << " " + FLAG_SIMILARITY + " - use similarity instead of distances (has to be in [0,1] interval; default: false)" << endl + << " " + FLAG_PERCENT_SIMILARITY + " - use percent similarity (has to be in [0,100] interval; default: false)" << endl + << " " + PARAM_MIN + " - accept pairwise connections with values greater or equal given threshold in a specified column" << endl + << " " + PARAM_MAX + " - accept pairwise connections with values lower or equal given threshold in a specified column" << endl + << " " + FLAG_NUMERIC_IDS + " - use when sequences in the distances file are represented by numbers (can be mapped to string ids by the object file)" << endl + << " " + FLAG_OUT_REPRESENTATIVES + " - output a representative object for each cluster instead of a cluster numerical identifier (default: " << std::boolalpha << outputRepresentatives << ")" << endl + << " " + FLAG_OUT_CSV + " - output a CSV table instead of a default TSV (default: " << std::boolalpha << outputCSV << ")" + +#ifndef NO_LEIDEN + << endl << endl + << " " + PARAM_LEIDEN_RESOLUTION + " - resolution parameter for Leiden algorithm (default: " << leidenParams.resolution << ")" << endl + << " " + PARAM_LEIDEN_BETA + " - beta parameter for Leiden algorithm (default: " << leidenParams.beta << ")" << endl + << " " + PARAM_LEIDEN_RESOLUTION + " - number of interations for Leiden algorithm (default: " << leidenParams.numIterations << ")" +#endif + << endl << endl; +} + + +bool Params::parse(int argc, char** argv) { + + vector args; + for (int i = 1; i < argc; ++i) { + args.emplace_back(argv[i]); + } + + if (args.size() < 2) { + return false; + } + + std::string tmp; + findOption(args, PARAM_ALGO, tmp); + if (tmp.length()) { + algo = str2algo(tmp); + } + + findOption(args, PARAM_FILE_OBJECTS, objectsFile); + + findOption(args, PARAM_ID_COLUMNS, idColumns.first, idColumns.second); + numericIds = findSwitch(args, FLAG_NUMERIC_IDS); + + findOption(args, PARAM_DISTANCE_COLUMN, distanceColumn); + + bool use_similarity = findSwitch(args, FLAG_SIMILARITY); + bool use_percent_similarity = findSwitch(args, FLAG_PERCENT_SIMILARITY); + + if (use_percent_similarity) { + distanceSpecification = DistanceSpecification::PercentSimilarity; + } + else if (use_similarity) { + distanceSpecification = DistanceSpecification::Similarity; + } + + string column; + double value; + while (findOption(args, PARAM_MIN, column, value)) { + columns2filters[column].min = std::max(value, columns2filters[column].min); + } + while (findOption(args, PARAM_MAX, column, value)) { + columns2filters[column].max = std::min(value, columns2filters[column].max); + } + + outputRepresentatives = findSwitch(args, FLAG_OUT_REPRESENTATIVES); + outputCSV = findSwitch(args, FLAG_OUT_CSV); + + // leiden parameters + findOption(args, PARAM_LEIDEN_RESOLUTION, leidenParams.resolution); + findOption(args, PARAM_LEIDEN_BETA, leidenParams.beta); + findOption(args, PARAM_LEIDEN_ITERATIONS, leidenParams.numIterations); + + verbose = findSwitch(args, FLAG_VERBOSE); + + if (args.size() == 2) { + distancesFile = args[0]; + output = args[1]; + return true; + } + + return false; +} + diff --git a/src/params.h b/src/params.h new file mode 100644 index 0000000..e862112 --- /dev/null +++ b/src/params.h @@ -0,0 +1,151 @@ +// ******************************************************************************************* +// This file is a part of Clusty software distributed under GNU GPL 3 license. +// The homepage of the Clusty project is https://github.com/refresh-bio/Clusty +// +// Copyright(C) 2024-2024, A.Gudys, K.Siminski, S.Deorowicz +// +// ******************************************************************************************* +#pragma once +#include "graph.h" +#include "leiden.h" + +#include +#include +#include +#include + +enum class Algo { + SingleLinkage, + CompleteLinkage, + UClust, + SetCover, + CdHit, + Leiden +}; + +enum class DistanceSpecification { + Distance, + Similarity, + PercentSimilarity +}; + +class Params { + const std::string PARAM_ALGO{ "--algo" }; + + const std::string PARAM_FILE_OBJECTS{ "--objects-file" }; + + const std::string PARAM_ID_COLUMNS{ "--id-cols" }; + const std::string FLAG_NUMERIC_IDS{ "--numeric-ids" }; + const std::string PARAM_DISTANCE_COLUMN{ "--distance-col" }; + const std::string FLAG_SIMILARITY{ "--similarity" }; + const std::string FLAG_PERCENT_SIMILARITY{ "--percent-similarity" }; + + const std::string PARAM_MAX{ "--max" }; + const std::string PARAM_MIN{ "--min" }; + + const std::string FLAG_OUT_REPRESENTATIVES{ "--out-representatives" }; + const std::string FLAG_OUT_CSV{ "--out-csv" }; + + const std::string PARAM_LEIDEN_RESOLUTION{ "--leiden-resolution" }; + const std::string PARAM_LEIDEN_BETA{ "--leiden-beta" }; + const std::string PARAM_LEIDEN_ITERATIONS{ "--leiden-iterations" }; + + const std::string FLAG_VERBOSE{ "-v" }; + +public: + static Algo str2algo(const std::string& str) + { + if (str == "single") { return Algo::SingleLinkage; } + else if (str == "complete") { return Algo::CompleteLinkage; } + else if (str == "uclust") { return Algo::UClust; } + else if (str == "set-cover") { return Algo::SetCover; } + else if (str == "cd-hit") { return Algo::CdHit; } + else if (str == "leiden") { return Algo::Leiden; } + + else { throw std::runtime_error("Unkown clustering algorithm"); } + } + + static std::string algo2str(Algo algo) { + switch (algo) { + case Algo::SingleLinkage: return "single"; + case Algo::CompleteLinkage: return "complete"; + case Algo::UClust: return "uclust"; + case Algo::SetCover: return "set-cover"; + case Algo::Leiden: return "leiden"; + case Algo::CdHit: return "cd-hit"; + default: throw std::runtime_error("Unkown clustering algorithm"); + } + } + + +public: + + std::string objectsFile; + std::string distancesFile; + std::string output; + + Algo algo{ Algo::SingleLinkage }; + + std::pair idColumns; + bool numericIds{ false }; + + std::string distanceColumn; + DistanceSpecification distanceSpecification{ DistanceSpecification::Distance }; + + std::map columns2filters; + bool outputRepresentatives{ false }; + bool outputCSV{ false }; + + LeidenParams leidenParams; + + bool verbose{ false }; + + void printUsage() const; + bool parse(int argc, char** argv); + + bool findSwitch(std::vector& params, const std::string& name) { + auto it = find(params.begin(), params.end(), name); // verbose mode + if (it != params.end()) { + params.erase(it); + return true; + } + + return false; + } + + template + bool findOption(std::vector& params, const std::string& name, T& v) { + if (params.size() < 2) { + return false; + } + + auto stop = std::prev(params.end()); + auto it = find(params.begin(), stop, name); // verbose mode + if (it != stop) { + std::istringstream iss(*std::next(it)); + if (iss >> v) { + params.erase(it, it + 2); + return true; + } + } + return false; + } + + template + bool findOption(std::vector& params, const std::string& name, T& value1, U& value2) { + if (params.size() < 3) { + return false; + } + + auto stop = std::prev(params.end(), 2); + auto it = find(params.begin(), stop, name); // verbose mode + if (it != stop) { + if (std::istringstream(*std::next(it)) >> value1 + && std::istringstream(*std::next(it, 2)) >> value2) { + params.erase(it, it + 3); + return true; + } + } + return false; + } +}; diff --git a/src/set_cover.h b/src/set_cover.h index fe338cd..3a57b7b 100644 --- a/src/set_cover.h +++ b/src/set_cover.h @@ -18,12 +18,12 @@ /** MMseqs0 clustering algorithm */ -template -class SetCover : public IClustering +template +class SetCover : public IClustering { public: int operator()( - const DistanceMatrix& distances, + SparseMatrix& distances, const std::vector& objects, double threshold, std::vector& assignments) override @@ -37,7 +37,7 @@ class SetCover : public IClustering for (int i = 0; i < nObjects; ++i) { int obj = objects[i]; obj2connections[i].first = obj; - obj2connections[i].second = distances.get_num_neighbours(obj); + obj2connections[i].second = distances.num_neighbours(obj); } std::stable_sort(obj2connections.begin(), obj2connections.end(), [](const auto& a, const auto& b) { return a.second > b.second; }); @@ -53,9 +53,9 @@ class SetCover : public IClustering assignments[obj] = cluster_number; // seed of a new cluster ... // ... and its neighbours: - for (const dist_t* edge = distances.begin(obj); edge < distances.end(obj); ++edge) { - auto other = edge->u.s.hi; - if (edge->d <= threshold && assignments[other] == NO_ASSIGNMENT) { + for (const Distance* edge = distances.begin(obj); edge < distances.end(obj); ++edge) { + auto other = edge->get_id(); + if (edge->get_d() <= threshold && assignments[other] == NO_ASSIGNMENT) { assignments[other] = cluster_number; } } diff --git a/src/single_bfs.h b/src/single_bfs.h index c7e8b94..b3aa32c 100644 --- a/src/single_bfs.h +++ b/src/single_bfs.h @@ -8,37 +8,25 @@ // // ******************************************************************************************* +#include "distances.h" +#include "clustering.h" + #include #include #include #include -#include "distances.h" -#include "clustering.h" - -#define debug(x) std::cerr << __FILE__ << " (" << __LINE__ << ") " << #x << " == " << (x) << std::endl - -std::ostream & operator<< (std::ostream & s, const std::deque & q) -{ - s << "[ "; - for (const auto n : q) - s << n << " "; - return s << "]"; -} - /** MMseqs1 clustering algorithm based on connected component search in an undirected graph */ -template -class SingleLinkageBFS : public IClustering +template +class SingleLinkageBFS : public IClustering { public: - /** @param[out] assignments The row is a complete clustering for a threshold. Each value in a row denotes a group an object has been assgned to. - @return number of elaborated clusters */ int operator() ( - const DistanceMatrix& distances, + SparseMatrix& distances, const std::vector& objects, const double threshold, std::vector& assignments) override @@ -76,9 +64,9 @@ class SingleLinkageBFS : public IClustering if (assignments[node] == NO_ASSIGNMENT) { assignments[node] = cluster_number; // seed of a new cluster - for (const dist_t* edge = distances.begin(node); edge < distances.end(node); ++edge) { - auto other = edge->u.s.hi; - if (edge->d <= threshold && assignments[other] == NO_ASSIGNMENT) { + for (const Distance* edge = distances.begin(node); edge < distances.end(node); ++edge) { + auto other = edge->get_id(); + if (edge->get_d() <= threshold && assignments[other] == NO_ASSIGNMENT) { Q.push_back(other); } } diff --git a/src/sparse_matrix.h b/src/sparse_matrix.h new file mode 100644 index 0000000..b1952e5 --- /dev/null +++ b/src/sparse_matrix.h @@ -0,0 +1,39 @@ +// ******************************************************************************************* +// This file is a part of Clusty software distributed under GNU GPL 3 license. +// The homepage of the Clusty project is https://github.com/refresh-bio/Clusty +// +// Copyright(C) 2024-2024, A.Gudys, K.Siminski, S.Deorowicz +// +// ******************************************************************************************* +#pragma once +#include "distances.h" + +#include + + +/*********************************************************************************************************************/ +template +class SparseMatrix : public IMatrix +{ +public: + + size_t n_elements{ 0 }; + + std::vector> distances; + + SparseMatrix() {} + + virtual ~SparseMatrix() {} + + size_t num_objects() const { return distances.size(); } + + size_t num_elements() const { return n_elements; } + + size_t num_neighbours(int i) const { return distances[i].size(); } + + const Distance* begin(int row_id) const { return distances[row_id].data(); } + const Distance* end(int row_id) const { return distances[row_id].data() + distances[row_id].size(); } + + void clear_row(int row_id) { std::vector().swap(distances[row_id]); } +}; + diff --git a/src/uclust.h b/src/uclust.h index 563cda8..06a269c 100644 --- a/src/uclust.h +++ b/src/uclust.h @@ -17,12 +17,12 @@ #include #include -template -class UClust : public IClustering { +template +class UClust : public IClustering { public: int operator()( - const DistanceMatrix& distances, + SparseMatrix& distances, const std::vector& objects, double threshold, std::vector& assignments) override { @@ -41,20 +41,20 @@ class UClust : public IClustering { for (int i = 1; i < n_objects; ++i) { obj = objects[i]; - dist_t max_edge{ 0, dist_t::MAX }; - const dist_t* closest = &max_edge; + Distance max_edge; + const Distance* closest = &max_edge; // select closest seed among neighbours - for (const dist_t* edge = distances.begin(obj); edge < distances.end(obj); ++edge) { + for (const Distance* edge = distances.begin(obj); edge < distances.end(obj); ++edge) { - if (seeds2clusters.count(edge->u.s.hi) && edge->d < closest->d) { + if (seeds2clusters.count(edge->get_id()) && edge->get_d() < closest->get_d()) { closest = edge; } } // if distance to seed is below threshold - if (closest->d <= threshold) { - assignments[obj] = seeds2clusters[closest->u.s.hi]; + if (closest->get_d() <= threshold) { + assignments[obj] = seeds2clusters[closest->get_id()]; } else { int cluster_id = seeds2clusters.size(); diff --git a/src/version.h b/src/version.h index 4c58389..9700723 100644 --- a/src/version.h +++ b/src/version.h @@ -8,12 +8,18 @@ // // ******************************************************************************************* -#define VERSION "1.0.3" -#define DATE "2024-09-12" +#define VERSION "1.1.0" +#define DATE "2024-09-30" /********* Version history ********* + 1.1.0 (2024-09-30) + Memory optimizatons: +* Row identifier removed from dist_t structure. +* Object identifiers in complete linkage stored as 32-bit integers. +* For some clustering algorithms only edges are stored (without distances). + 1.0.3 (2024-09-12) Fixes in building scripts.