diff --git a/src/position.hpp b/src/position.hpp
index 9aa1ac4d..69078f8c 100644
--- a/src/position.hpp
+++ b/src/position.hpp
@@ -88,6 +88,25 @@ struct path_range_t {
     std::string data;
 };
 
+/// Ordering functor for path_range_t, usable as the comparator of ordered
+/// containers such as std::set.
+///
+/// Ranges are ordered lexicographically by begin path, end path, begin offset
+/// and end offset, in that order. Only these four values take part in the
+/// comparison, so two ranges that agree on all of them are equivalent for the
+/// container even if other members (e.g. `data`) differ.
+struct path_range_comparator {
+    /// @return true if `lhs` is ordered before `rhs`.
+    bool operator() (const path_range_t& lhs, const path_range_t& rhs) const {
+        // Paths take precedence over offsets.
+        if (lhs.begin.path != rhs.begin.path) return lhs.begin.path < rhs.begin.path;
+        if (lhs.end.path != rhs.end.path) return lhs.end.path < rhs.end.path;
+        // Same pair of paths: fall back to the positions along them.
+        if (lhs.begin.offset != rhs.begin.offset) return lhs.begin.offset < rhs.begin.offset;
+        return lhs.end.offset < rhs.end.offset;
+    }
+};
+
 inline std::string& get_long_path_name(std::tuple path_long_start_end) {
     return std::get<0>(path_long_start_end);
 }
diff --git a/src/subcommand/extract_main.cpp b/src/subcommand/extract_main.cpp
index ce0ecc02..e900a28d 100644
--- a/src/subcommand/extract_main.cpp
+++ b/src/subcommand/extract_main.cpp
@@ -556,5 +556,12 @@ namespace odgi {
                     return std::binary_search(source_paths_from_path_ranges.begin(), source_paths_from_path_ranges.end(), x);
                 }), source_paths->end());
 
+            // We don't cut nodes for the extraction, so close path intervals can generate identical subpaths.
+            // To avoid duplicated subpaths in the final subgraph, we remove duplicated path ranges.
+            // Note: going through the std::set also leaves path_ranges sorted by path_range_comparator.
+            {
+                const std::set<path_range_t, path_range_comparator> unique_path_ranges(path_ranges.begin(), path_ranges.end());
+                path_ranges.assign(unique_path_ranges.begin(), unique_path_ranges.end());
+            }
             if (max_dist_subpaths > 0) {
                 // Iterate multiple times to merge subpaths which became mergeable during the first iteration where new nodes were added