Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A sampled step index implementation of the PG-SGD algorithm #454

Draft
wants to merge 31 commits into
base: master
Choose a base branch
from
Draft
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
68d4f11
prepare basic functions for ssi integration
subwaystation Oct 17, 2022
c00d7f3
restructering
subwaystation Oct 19, 2022
7e388e6
forgetti spaghetti
subwaystation Oct 19, 2022
9a9aa7b
refactory
subwaystation Oct 25, 2022
b763c10
missing EOL
subwaystation Oct 25, 2022
4659799
fix the FIXMEs is the plan
subwaystation Oct 25, 2022
4b5f889
re-volt
subwaystation Oct 25, 2022
4efce0f
Merge branch 'master' into pg_sgd_ssi
subwaystation Oct 25, 2022
35f09b1
SSI is the default route now
subwaystation Oct 25, 2022
1804cd2
Merge branch 'pg_sgd_ssi' of github.com:pangenome/odgi into pg_sgd_ssi
subwaystation Oct 25, 2022
3575a2e
we can clean the ssi
subwaystation Oct 27, 2022
2b7896a
CLEANERS
subwaystation Oct 27, 2022
3fbd887
ugly skeletor
subwaystation Oct 27, 2022
985f132
CMAKER
subwaystation Oct 27, 2022
8e27d5c
node-step-bit-vector initialized
subwaystation Oct 27, 2022
af6db6f
ns_bv in place
subwaystation Oct 28, 2022
cf9631a
bug fix
subwaystation Oct 28, 2022
fad31c7
happy
subwaystation Oct 28, 2022
588800c
pos of 1st step is retrieved via ssi!
subwaystation Oct 28, 2022
aea0b91
pos of 2nd step via ssi when we do uniform sampling
subwaystation Oct 28, 2022
e583885
some more tweaks
subwaystation Oct 31, 2022
64d6dfd
hmm
subwaystation Oct 31, 2022
2cd5023
working somehow
subwaystation Oct 31, 2022
7e192b3
cleanup
subwaystation Oct 31, 2022
c824efe
playing around
subwaystation Nov 2, 2022
7327513
better mirror initial implementation
subwaystation Nov 2, 2022
b675cad
cleanup path_linear_sgd_schedule
subwaystation Nov 2, 2022
91c9f34
refactor prepare_weak_connected_components_map
subwaystation Nov 3, 2022
925683a
refactor generate_and_write_snapshot_graphs
subwaystation Nov 3, 2022
2ae8a19
refactor from_layout_to_node_order
subwaystation Nov 3, 2022
b5afae3
take a look in the mirror
subwaystation Nov 7, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
prepare basic functions for ssi integration
subwaystation committed Oct 17, 2022
commit 68d4f11eac06b2c0069b339fb5b4c3c2e70f3684
4 changes: 4 additions & 0 deletions src/algorithms/stepindex.cpp
Original file line number Diff line number Diff line change
@@ -129,6 +129,10 @@ void step_index_t::load(const std::string& name) {
step_mphf->load(stpidx_in);
}

const uint64_t step_index_t::get_sample_rate() {
return this->sample_rate;
}

/// Serialization hook: forwards the given output stream to serialize_and_measure().
/// @param out stream handed through unchanged to serialize_and_measure().
void step_index_t::serialize_members(std::ostream &out) const {
serialize_and_measure(out);
}
1 change: 1 addition & 0 deletions src/algorithms/stepindex.hpp
Original file line number Diff line number Diff line change
@@ -63,6 +63,7 @@ struct step_index_t {
const uint64_t get_path_len(const path_handle_t& path) const;
void save(const std::string& name) const;
void load(const std::string& name);
const uint64_t get_sample_rate();
// map from step to position in its path
boophf_step_t* step_mphf = nullptr;
sdsl::int_vector<64> pos;
32 changes: 22 additions & 10 deletions src/subcommand/sort_main.cpp
Original file line number Diff line number Diff line change
@@ -14,6 +14,7 @@
#include "algorithms/xp.hpp"
#include "algorithms/path_sgd.hpp"
#include "algorithms/groom.hpp"
#include "algorithms/stepindex.hpp"

namespace odgi {

@@ -38,6 +39,7 @@ int main_sort(int argc, char** argv) {
" ending with *.og* is recommended.", {'o', "out"});
args::Group files_io_opts(parser, "[ Files IO Options ]");
args::ValueFlag<std::string> xp_in_file(files_io_opts, "FILE", "Load the succinct variation graph index from this *FILE*. The file name usually ends with *.xp*.", {'X', "path-index"});
args::ValueFlag<std::string> ssi_in_file(files_io_opts, "FILE", "Load the sampled step index from this *FILE*. The file name usually ends with *.ssi*.", {'e', "sampled-step-index"});
// -s/--sort-order: file holding the sort order, one node identifier per line.
args::ValueFlag<std::string> sort_order_in(files_io_opts, "FILE", "*FILE* containing the sort order. Each line contains one node identifier.", {'s', "sort-order"});
args::ValueFlag<std::string> tmp_base(files_io_opts, "PATH", "directory for temporary files", {'C', "temp-dir"});
args::Group topo_sorts_opts(parser, "[ Topological Sort Options ]");
@@ -171,25 +173,25 @@ int main_sort(int argc, char** argv) {
/// path guided linear 1D SGD sort helpers
// TODO beautify this, maybe put into its own file
std::function<uint64_t(const std::vector<path_handle_t> &,
const xp::XP &)> get_sum_path_step_count
= [&](const std::vector<path_handle_t> &path_sgd_use_paths, const xp::XP &path_index) {
graph_t &)> get_sum_path_step_count
= [&](const std::vector<path_handle_t> &path_sgd_use_paths, graph_t &graph) {
uint64_t sum_path_step_count = 0;
for (auto& path : path_sgd_use_paths) {
sum_path_step_count += path_index.get_path_step_count(path);
sum_path_step_count += graph.get_step_count(path);
}
return sum_path_step_count;
};
std::function<uint64_t(const std::vector<path_handle_t> &,
const xp::XP &)> get_max_path_step_count
= [&](const std::vector<path_handle_t> &path_sgd_use_paths, const xp::XP &path_index) {
graph_t &)> get_max_path_step_count
= [&](const std::vector<path_handle_t> &path_sgd_use_paths, graph_t &graph) {
uint64_t max_path_step_count = 0;
for (auto& path : path_sgd_use_paths) {
max_path_step_count = std::max(max_path_step_count, path_index.get_path_step_count(path));
max_path_step_count = std::max(max_path_step_count, graph.get_step_count(path));
}
return max_path_step_count;
};
std::function<uint64_t(const std::vector<path_handle_t> &,
const xp::XP &)> get_max_path_length
const xp::XP &)> get_max_path_length_xp
= [&](const std::vector<path_handle_t> &path_sgd_use_paths, const xp::XP &path_index) {
uint64_t max_path_length = std::numeric_limits<uint64_t>::min();
for (auto &path : path_sgd_use_paths) {
@@ -198,6 +200,16 @@ int main_sort(int argc, char** argv) {
return max_path_length;
};

// Largest value of get_path_len() over the given paths, queried from the
// sampled step index. Yields 0 when path_sgd_use_paths is empty.
std::function<uint64_t(const std::vector<path_handle_t> &,
                       const algorithms::step_index_t &)> get_max_path_length_ssi
    = [&](const std::vector<path_handle_t> &path_sgd_use_paths, const algorithms::step_index_t &sampled_step_index) {
        // 0 is the smallest uint64_t, so any path length can only raise it.
        uint64_t longest = 0;
        for (const path_handle_t &candidate : path_sgd_use_paths) {
            const uint64_t candidate_len = sampled_step_index.get_path_len(candidate);
            if (longest < candidate_len) {
                longest = candidate_len;
            }
        }
        return longest;
    };

// default parameters
std::string path_sgd_seed;
if (p_sgd_seed) {
@@ -364,7 +376,7 @@ int main_sort(int argc, char** argv) {
path_sgd_use_paths.push_back(path);
});
}
uint64_t sum_path_step_count = get_sum_path_step_count(path_sgd_use_paths, path_index);
uint64_t sum_path_step_count = get_sum_path_step_count(path_sgd_use_paths, graph);
if (args::get(p_sgd_min_term_updates_paths)) {
path_sgd_min_term_updates = args::get(p_sgd_min_term_updates_paths) * sum_path_step_count;
} else {
@@ -374,8 +386,8 @@ int main_sort(int argc, char** argv) {
path_sgd_min_term_updates = 1.0 * sum_path_step_count;
}
}
uint64_t max_path_step_count = get_max_path_step_count(path_sgd_use_paths, path_index);
path_sgd_zipf_space = args::get(p_sgd_zipf_space) ? args::get(p_sgd_zipf_space) : get_max_path_length(path_sgd_use_paths, path_index);
uint64_t max_path_step_count = get_max_path_step_count(path_sgd_use_paths, graph);
path_sgd_zipf_space = args::get(p_sgd_zipf_space) ? args::get(p_sgd_zipf_space) : get_max_path_length_xp(path_sgd_use_paths, path_index);
path_sgd_zipf_space_max = args::get(p_sgd_zipf_space_max) ? args::get(p_sgd_zipf_space_max) : 100;

path_sgd_zipf_max_number_of_distributions = args::get(p_sgd_zipf_max_number_of_distributions) ? std::max(
101 changes: 8 additions & 93 deletions src/unittest/stepindex.cpp
Original file line number Diff line number Diff line change
@@ -79,6 +79,7 @@ namespace odgi {

SECTION("The index delivers the correct positions for a given step. Sample rate: 1.") {
step_index_t step_index_1(graph, paths, 1, false, 1);
REQUIRE(step_index_1.get_sample_rate() == 1);
graph.for_each_path_handle([&](const path_handle_t path) {
std::string cur_path = graph.get_path_name(path);
if (cur_path == "target") {
@@ -188,6 +189,7 @@ namespace odgi {

SECTION("The index delivers the correct positions for a given step. Sample rate: 2.") {
step_index_t step_index_2(graph, paths, 1, false, 2);
REQUIRE(step_index_2.get_sample_rate() == 2);
graph.for_each_path_handle([&](const path_handle_t path) {
std::string cur_path = graph.get_path_name(path);
if (cur_path == "target") {
@@ -281,99 +283,7 @@ namespace odgi {

SECTION("The index delivers the correct positions for a given step. Sample rate: 4.") {
    // Build the index with sample rate 4 and compare the position reported for the
    // leading steps of each known path against a table of expected positions.
    // NOTE(review): a SECTION with the same title follows directly in the visible
    // text; presumably one of the two is a leftover duplicate -- confirm.
    step_index_t step_index_4(graph, paths, 1, false, 4);

    // Walks `path` step by step and requires that the first `num_expected` steps
    // sit at the tabulated positions. Steps past the end of the table are not checked.
    auto require_positions = [&](const path_handle_t& path,
                                 const uint64_t* expected,
                                 const uint64_t num_expected) {
        uint64_t step_rank = 0;
        graph.for_each_step_in_path(path, [&](const step_handle_t& step) {
            if (step_rank < num_expected) {
                REQUIRE(step_index_4.get_position(step, graph) == expected[step_rank]);
            }
            ++step_rank;
        });
    };

    // Expected positions, indexed by step rank within the path.
    const uint64_t target_positions[] = {0, 1, 2, 5, 8, 11};
    const uint64_t query1_positions[] = {0, 1, 2, 5};
    const uint64_t query2_positions[] = {0};
    const uint64_t query3_positions[] = {0, 3, 4, 5, 8, 11};

    graph.for_each_path_handle([&](const path_handle_t path) {
        const std::string path_name = graph.get_path_name(path);

        if (path_name == "target") {
            require_positions(path, target_positions,
                              sizeof(target_positions) / sizeof(target_positions[0]));
        }

        if (path_name == "query1") {
            require_positions(path, query1_positions,
                              sizeof(query1_positions) / sizeof(query1_positions[0]));
        }

        if (path_name == "query2") {
            require_positions(path, query2_positions,
                              sizeof(query2_positions) / sizeof(query2_positions[0]));
        }

        if (path_name == "query3") {
            require_positions(path, query3_positions,
                              sizeof(query3_positions) / sizeof(query3_positions[0]));
        }
    });
}

SECTION("The index delivers the correct positions for a given step. Sample rate: 4.") {
step_index_t step_index_4(graph, paths, 1, false, 4);
REQUIRE(step_index_4.get_sample_rate() == 4);
graph.for_each_path_handle([&](const path_handle_t path) {
std::string cur_path = graph.get_path_name(path);
if (cur_path == "target") {
@@ -467,6 +377,7 @@ namespace odgi {

SECTION("The index delivers the correct positions for a given step. Sample rate: 8.") {
step_index_t step_index_8(graph, paths, 1, false, 8);
REQUIRE(step_index_8.get_sample_rate() == 8);
graph.for_each_path_handle([&](const path_handle_t path) {
std::string cur_path = graph.get_path_name(path);
if (cur_path == "target") {
@@ -560,6 +471,7 @@ namespace odgi {

SECTION("The index delivers the correct positions for a given step. Sample rate: 16.") {
step_index_t step_index_16(graph, paths, 1, false, 16);
REQUIRE(step_index_16.get_sample_rate() == 16);
graph.for_each_path_handle([&](const path_handle_t path) {
std::string cur_path = graph.get_path_name(path);
if (cur_path == "target") {
@@ -660,6 +572,9 @@ namespace odgi {
step_index_t step_index_loaded;
step_index_loaded.load(basename + "unittest.stpidx");

REQUIRE(step_index_to_save.get_sample_rate() == 8);
REQUIRE(step_index_loaded.get_sample_rate() == 8);

graph.for_each_path_handle([&](const path_handle_t path) {
std::string cur_path = graph.get_path_name(path);
if (cur_path == "target") {