Skip to content

Commit

Permalink
Memory optimizations
Browse files Browse the repository at this point in the history
* Row identifier removed from distance structure.
* Object identifiers in complete linkage stored as 32-bit integers.
* For some clustering algorithms only edges are stored (without distances).
  • Loading branch information
agudys authored Sep 30, 2024
1 parent 5d34c00 commit 6ac12c7
Show file tree
Hide file tree
Showing 24 changed files with 1,713 additions and 1,486 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@
/libs/winflexbison
/src/clusty/x64/Debug/clusty.tlog
/src/clusty/x64/Debug
/src
/src/clusty/

/src/clusty.vcxproj.user
3 changes: 2 additions & 1 deletion makefile
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,10 @@ OBJS := \
$(MIMALLOC_OBJ) \
$(MAIN_DIR)/console.o \
$(MAIN_DIR)/conversion.o \
$(MAIN_DIR)/distances.o \
$(MAIN_DIR)/graph.o \
$(MAIN_DIR)/log.o \
$(MAIN_DIR)/main.o \
$(MAIN_DIR)/params.o \

%.o: %.cpp igraph
$(CXX) $(CFLAGS) -c $< -o $@
Expand Down
12 changes: 6 additions & 6 deletions src/cd_hit.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@
#include <vector>
#include <unordered_map>

template <class DistanceMatrix>
class CdHit : public IClustering<DistanceMatrix> {
template <class Distance>
class CdHit : public IClustering<Distance> {
public:

int operator()(
const DistanceMatrix& distances,
SparseMatrix<Distance>& distances,
const std::vector<int>& objects,
double threshold,
std::vector<int>& assignments) override {
Expand All @@ -41,10 +41,10 @@ class CdHit : public IClustering<DistanceMatrix> {
assignments[obj] = cluster_id;

// iterate over connected objects and assign those which are unassigned
for (const dist_t* edge = distances.begin(obj); edge < distances.end(obj); ++edge) {
int other = edge->u.s.hi;
for (const Distance* edge = distances.begin(obj); edge < distances.end(obj); ++edge) {
int other = edge->get_id();

if (edge->d <= threshold && assignments[other] == -1) {
if (edge->get_d() <= threshold && assignments[other] == -1) {
assignments[other] = cluster_id;
}
}
Expand Down
17 changes: 9 additions & 8 deletions src/clustering.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@

#include <vector>
#include <numeric>
#include <algorithm>

#include "distances.h"
#include "sparse_matrix.h"

struct node_t {
int first = -1;
Expand All @@ -23,12 +24,12 @@ struct node_t {
: first(first), second(second), distance(distance) {}
};

template <class DistanceMatrix>
template <class Distance>
class IClustering {
public:

virtual int operator()(
const DistanceMatrix& distances,
SparseMatrix<Distance>& distances,
const std::vector<int>& objects,
double threshold,
std::vector<int>& assignments) = 0;
Expand All @@ -37,12 +38,12 @@ class IClustering {

};

template <class DistanceMatrix>
class HierarchicalClustering : public IClustering<DistanceMatrix> {
template <class Distance>
class HierarchicalClustering : public IClustering<Distance> {
protected:

void makeDendrogram(
const std::vector<dist_t>& lambda,
const std::vector<Distance>& lambda,
const std::vector<int>& pi,
std::vector<node_t>& dendrogram)
{
Expand All @@ -51,7 +52,7 @@ class HierarchicalClustering : public IClustering<DistanceMatrix> {
std::vector<int> elements(n_objects - 1);
std::iota(elements.begin(), elements.end(), 0);

stable_sort(elements.begin(), elements.end(), [&lambda](int x, int y) {
std::stable_sort(elements.begin(), elements.end(), [&lambda](int x, int y) {
return lambda[x] < lambda[y];
});

Expand All @@ -64,7 +65,7 @@ class HierarchicalClustering : public IClustering<DistanceMatrix> {
for (int i = 0; i < n_objects - 1; ++i) {
int j = elements[i];
int next = pi[j];
dendrogram.emplace_back(index[j], index[next], lambda[j].d);
dendrogram.emplace_back(index[j], index[next], lambda[j].get_d());
index[next] = n_objects + i;
}
}
Expand Down
12 changes: 9 additions & 3 deletions src/clusty.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@
<PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
<BufferSecurityCheck>false</BufferSecurityCheck>
<LanguageStandard>stdcpp17</LanguageStandard>
<LanguageStandard>stdcpplatest</LanguageStandard>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
</ClCompile>
<Link>
Expand All @@ -151,7 +151,7 @@
<PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<ConformanceMode>true</ConformanceMode>
<BufferSecurityCheck>false</BufferSecurityCheck>
<LanguageStandard>stdcpp17</LanguageStandard>
<LanguageStandard>stdcpplatest</LanguageStandard>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
Expand All @@ -165,9 +165,10 @@
<ClCompile Include="..\libs\mimalloc\src\static.c" />
<ClCompile Include="console.cpp" />
<ClCompile Include="conversion.cpp" />
<ClCompile Include="distances.cpp" />
<ClCompile Include="graph.cpp" />
<ClCompile Include="log.cpp" />
<ClCompile Include="main.cpp" />
<ClCompile Include="params.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="cd_hit.h" />
Expand All @@ -179,8 +180,13 @@
<ClInclude Include="log.h" />
<ClInclude Include="memory_monotonic.h" />
<ClInclude Include="distances.h" />
<ClInclude Include="graph.h" />
<ClInclude Include="params.h" />
<ClInclude Include="set_cover.h" />
<ClInclude Include="single_bfs.h" />
<ClInclude Include="sparse_matrix.h" />
<ClInclude Include="graph_named.h" />
<ClInclude Include="graph_numbered.h" />
<ClInclude Include="uclust.h" />
<ClInclude Include="utils.h" />
<ClInclude Include="version.h" />
Expand Down
8 changes: 7 additions & 1 deletion src/clusty.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@
<ClCompile Include="main.cpp" />
<ClCompile Include="console.cpp" />
<ClCompile Include="conversion.cpp" />
<ClCompile Include="distances.cpp" />
<ClCompile Include="graph.cpp" />
<ClCompile Include="..\libs\mimalloc\src\static.c">
<Filter>Library Files</Filter>
</ClCompile>
<ClCompile Include="log.cpp" />
<ClCompile Include="params.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="distances.h" />
Expand All @@ -34,5 +35,10 @@
<ClInclude Include="set_cover.h" />
<ClInclude Include="single_bfs.h" />
<ClInclude Include="log.h" />
<ClInclude Include="graph_named.h" />
<ClInclude Include="graph_numbered.h" />
<ClInclude Include="sparse_matrix.h" />
<ClInclude Include="graph.h" />
<ClInclude Include="params.h" />
</ItemGroup>
</Project>
Loading

0 comments on commit 6ac12c7

Please sign in to comment.