diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 0a56b03..33fc4da 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -34,7 +34,8 @@ add_executable(UnitTests tests/UnitTests.cpp tests/SequenceUtilsTest.cpp tests/PurityScoreTest.cpp - tests/GenomicRegionTest.cpp) + tests/GenomicRegionTest.cpp + tests/IrrFinderTest.cpp) target_link_libraries(UnitTests common reads region) target_include_directories(UnitTests PUBLIC ${CMAKE_SOURCE_DIR}) diff --git a/source/app/ExpansionHunterDenovo.cpp b/source/app/ExpansionHunterDenovo.cpp index ae9e6cc..53155c6 100644 --- a/source/app/ExpansionHunterDenovo.cpp +++ b/source/app/ExpansionHunterDenovo.cpp @@ -128,9 +128,9 @@ int runProfileWorkflow(int argc, char** argv) spdlog::info("Starting {} profile workflow", kProgramVersion); + Interval motifSizeRange(shortestUnitToConsider, longestUnitToConsider); ProfileWorkflowParameters params( - outputPrefix, pathToReads, pathToReference, shortestUnitToConsider, longestUnitToConsider, minMapqOfAnchorRead, - maxMapqOfInrepeatRead); + outputPrefix, pathToReads, pathToReference, motifSizeRange, minMapqOfAnchorRead, maxMapqOfInrepeatRead); return runProfileWorkflow(params); } diff --git a/source/app/Version.hh b/source/app/Version.hh index 9fe18a9..389d916 100644 --- a/source/app/Version.hh +++ b/source/app/Version.hh @@ -24,4 +24,5 @@ #include -const std::string kProgramVersion = "ExpansionHunter Denovo v0.8.1"; +const std::string kProgramVersion = "ExpansionHunter Denovo v0.8.6"; + diff --git a/source/classification/CMakeLists.txt b/source/classification/CMakeLists.txt deleted file mode 100644 index f5d669f..0000000 --- a/source/classification/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_library(classification irr_finder.cc classifiers.cc) -target_link_libraries(classification purity common) -add_subdirectory(unit_tests) \ No newline at end of file diff --git a/source/classification/unit_tests/CMakeLists.txt b/source/classification/unit_tests/CMakeLists.txt deleted file mode 100644 index a832849..0000000 --- a/source/classification/unit_tests/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_executable(irr_finder_test irr_finder_test.cc) -target_link_libraries(irr_finder_test classification gtest gmock_main) -add_test(NAME irr_finder_test COMMAND irr_finder_test) - -add_executable(classifier_test classifier_test.cc) -target_link_libraries(classifier_test classification gtest_main) -add_test(NAME classifier_test COMMAND classifier_test) \ No newline at end of file diff --git a/source/classification/unit_tests/classifier_test.cc b/source/classification/unit_tests/classifier_test.cc deleted file mode 100644 index 248a93e..0000000 --- a/source/classification/unit_tests/classifier_test.cc +++ /dev/null @@ -1,75 +0,0 @@ -// -// ExpansionHunter Denovo -// Copyright 2016-2019 Illumina, Inc. -// All rights reserved. -// -// Author: Egor Dolzhenko , -// Michael Eberle -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// - -#include "classification/classifiers.cc" - -#include "gtest/gtest.h" - -using std::string; - -TEST(ReadClassification, IrrThatIsNotAnchor_ClassifiedIrr) -{ - const int max_irr_mapq = 10; - const int min_anchor_mapq = 50; - Read read; - read.bases = "CCCCC"; - read.quals = "$$$$$"; - read.mapq = 0; - string unit; - ASSERT_EQ(ReadType::kIrrRead, ClassifyRead(max_irr_mapq, min_anchor_mapq, read, unit)); -} - -TEST(ReadClassification, IrrThatIsAnchor_ClassifiedIrr) -{ - const int max_irr_mapq = 60; - const int min_anchor_mapq = 50; - Read read; - read.bases = "CCCCC"; - read.quals = "$$$$$"; - read.mapq = 60; - string unit; - ASSERT_EQ(ReadType::kIrrRead, ClassifyRead(max_irr_mapq, min_anchor_mapq, read, unit)); -} - -TEST(ReadClassification, AnchorThatIsNotIrr_ClassifiedAnchor) -{ - const int max_irr_mapq = 10; - const int min_anchor_mapq = 50; - Read read; - read.bases = "ATCGC"; - read.quals = "$$$$$"; - read.mapq = 60; - string unit; - ASSERT_EQ(ReadType::kAnchorRead, ClassifyRead(max_irr_mapq, min_anchor_mapq, read, unit)); -} - -TEST(ReadClassification, NotAnchorAndNotIrr_ClassifiedOther) -{ - const int max_irr_mapq = 10; - const int min_anchor_mapq = 50; - Read read; - read.bases = "ATCGC"; - read.quals = "$$$$$"; - read.mapq = 0; - string unit; - ASSERT_EQ(ReadType::kOtherRead, ClassifyRead(max_irr_mapq, min_anchor_mapq, read, unit)); -} diff --git a/source/classification/unit_tests/irr_finder_test.cc b/source/classification/unit_tests/irr_finder_test.cc deleted file mode 100644 index 7334c77..0000000 --- a/source/classification/unit_tests/irr_finder_test.cc +++ /dev/null @@ -1,208 +0,0 @@ -// -// ExpansionHunter Denovo -// Copyright 2016-2019 Illumina, Inc. -// All rights reserved. -// -// Author: Egor Dolzhenko , -// Michael Eberle -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// - -#include "classification/irr_finder.h" - -#include -#include - -#include "gmock/gmock.h" - -using std::string; -using std::vector; -using testing::DoubleNear; - -TEST(MaxMatches, CalculatedForVariousOffsets) -{ - const string bases = "ATCGATCG"; - const size_t num_bases = bases.length(); - EXPECT_EQ(num_bases, MaxMatchesAtOffset(0, bases)); - EXPECT_EQ(num_bases - 1, MaxMatchesAtOffset(1, bases)); - EXPECT_EQ(num_bases - 2, MaxMatchesAtOffset(2, bases)); - EXPECT_EQ(0, MaxMatchesAtOffset(num_bases, bases)); - EXPECT_EQ(0, MaxMatchesAtOffset(num_bases + 1, bases)); -} - -TEST(MatchFrequency, CalculatedForValidOffsets) -{ - const string bases = "GGCCCCGGCCCC"; - vector expected_frequencies = { 0.73, 0.40, 0.33, 0.25, 0.57, 1.00 }; - - for (int32_t offset = 1; offset != 7; ++offset) - { - const double expected_frequency = expected_frequencies[offset - 1]; - EXPECT_THAT(MatchFrequencyAtOffset(offset, bases), DoubleNear(expected_frequency, 0.005)); - } -} - -TEST(MatchFrequency, CalculatedForImperfectRepeat) -{ - const string bases = "ATGATCATGTTGATG"; - const double expected_frequency = 8 / 12.0; - EXPECT_THAT(MatchFrequencyAtOffset(3, bases), DoubleNear(expected_frequency, 0.005)); -} - -TEST(MatchFrequency, ExceptionThrownForInvalidOffsets) -{ - const string bases = "GGCCCCGGCCCC"; - const int32_t num_bases = bases.length(); - EXPECT_ANY_THROW(MatchFrequencyAtOffset(-1, bases)); - EXPECT_ANY_THROW(MatchFrequencyAtOffset(0, bases)); - EXPECT_ANY_THROW(MatchFrequencyAtOffset(num_bases / 2 + 1, bases)); -} - -TEST(SmallestFrequentPeriod, CalculatedForTypicalSequences) -{ - { - const double min_frequency = 0.85; - const string bases = "GGCCCCGGCCCC"; - const int32_t period = SmallestFrequentPeriod(min_frequency, bases); - EXPECT_EQ(6, period); - } - - { - const double min_frequency = 0.85; - const string bases = "ATGATCATGATGATGATGATG"; - const int32_t period = SmallestFrequentPeriod(min_frequency, bases); - EXPECT_EQ(3, period); - } -} - -TEST(SmallestFrequentPeriod, ReturnsNegativeOneIfNoFrequentPeriodExists) -{ - const double min_frequency = 0.85; - const string bases = "ATCGGCTA"; - const int32_t period = SmallestFrequentPeriod(min_frequency, bases); - EXPECT_EQ(-1, period); -} - -TEST(ConsensusBase, ExtractedForTypicalSequence) -{ - const string bases = "CGATGACTG"; - const int32_t period = 3; - EXPECT_EQ('C', ExtractConsensusBase(0, period, bases)); - EXPECT_EQ('G', ExtractConsensusBase(1, period, bases)); - EXPECT_EQ('A', ExtractConsensusBase(2, period, bases)); -} - -TEST(ConsensusRepeatUnit, ExtractedForTypicalSequences) -{ - { - const string bases = "CGGCGGCGG"; - const int32_t period = 3; - EXPECT_EQ("CGG", ExtractConsensusRepeatUnit(period, bases)); - } - { - const string bases = "CGGATTATTATTCGG"; - const int32_t period = 3; - EXPECT_EQ("ATT", ExtractConsensusRepeatUnit(period, bases)); - } -} - -TEST(MinimialUnitUnderShift, ComputedForTypicalUnits) -{ - const string repeat_unit = "GGC"; - EXPECT_EQ("CGG", MinimialUnitUnderShift(repeat_unit)); -} - -TEST(CanonicalUnit, ComputedForTypicalUnits) -{ - { - const string repeat_unit = "CGG"; - EXPECT_EQ("CCG", ComputeCanonicalRepeatUnit(repeat_unit)); - } - { - const string repeat_unit = "GCC"; - EXPECT_EQ("CCG", ComputeCanonicalRepeatUnit(repeat_unit)); - } -} - -TEST(ComputeCanonicalRepeatUnitForRead, TypicalInrepeatReadsDetected) -{ - { - const string bases = "CGGCGCCGGCGG"; - const double min_frequency = 0.8; - EXPECT_EQ("CCG", ComputeCanonicalRepeatUnit(min_frequency, bases)); - EXPECT_EQ("", ComputeCanonicalRepeatUnit(min_frequency + 0.05, bases)); - } - - { - const string bases = "ACCCCAACCCCAACCCCAACCCCAACCCCAACCCCA"; - const double min_frequency = 0.8; - EXPECT_EQ("AACCCC", ComputeCanonicalRepeatUnit(min_frequency, bases)); - } -} - -TEST(ComputeCanonicalRepeatUnitForRead, HomopolymerReadsDetected) -{ - const string bases = "CCCCCCC"; - const double min_frequency = 1.0; - EXPECT_EQ("C", ComputeCanonicalRepeatUnit(min_frequency, bases)); -} - -TEST(IsInrepeatRead, DeterminedForTypicalReads) -{ - { - const string irr = "CCCCC"; - const string quals = "$$$$$"; - string unit; - EXPECT_TRUE(IsInrepeatRead(irr, quals, unit)); - EXPECT_EQ("C", unit); - } - { - const string not_irr = "AAAAACCCCC"; - const string quals = "$$$$$$$$$$"; - string unit; - EXPECT_FALSE(IsInrepeatRead(not_irr, quals, unit)); - } - { - const string irr = "TCCACCCACCTCACCCCCCCCCCCCCCCGCCCCCCCCCCACCCCCCCCGCCCCCCCCCCCGGCCCCCCACTCCCCCCCCCCGGTCCTCCCC" - "CCCCCCCACCCTCCCCCCC" - "CGCCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCCC"; - const string quals = "------7----7-----7-777-7-F<--777F777F -#include - -using std::string; -using std::vector; +#include "common/Interval.hh" diff --git a/source/common/Interval.hh b/source/common/Interval.hh new file mode 100644 index 0000000..15143e7 --- /dev/null +++ b/source/common/Interval.hh @@ -0,0 +1,49 @@ +// +// ExpansionHunter Denovo +// Copyright 2016-2019 Illumina, Inc. +// All rights reserved. +// +// Author: Egor Dolzhenko , +// Michael Eberle +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// + +#pragma once + +#include +#include + +class Interval +{ +public: + Interval(int start, int end) + : start_(start) + , end_(end) + { + if (start_ > end_) + { + const auto interval = "(" + std::to_string(start_) + ", " + std::to_string(end_) + ")"; + throw std::runtime_error("Invalid interval endpoints " + interval); + } + } + + int start() const { return start_; } + int end() const { return end_; } + bool contains(int value) const { return start_ <= value && value <= end_; } + +private: + int start_; + int end_; +}; diff --git a/source/profile/ProfileParameters.cpp b/source/profile/ProfileParameters.cpp index 495afeb..a0e3612 100644 --- a/source/profile/ProfileParameters.cpp +++ b/source/profile/ProfileParameters.cpp @@ -31,13 +31,12 @@ namespace fs = boost::filesystem; using std::string; ProfileWorkflowParameters::ProfileWorkflowParameters( - const string& outputPrefix, string pathToReads, string pathToReference, int shortestUnitToConsider, - int longestUnitToConsider, int minMapqOfAnchorRead, int maxMapqOfInrepeatRead) + const string& outputPrefix, string pathToReads, string pathToReference, Interval motifSizeRange, + int minMapqOfAnchorRead, int maxMapqOfInrepeatRead) : profilePath_(outputPrefix + ".str_profile.json") , pathToReads_(std::move(pathToReads)) , pathToReference_(std::move(pathToReference)) - , shortestUnitToConsider_(shortestUnitToConsider) - , longestUnitToConsider_(longestUnitToConsider) + , motifSizeRange_(std::move(motifSizeRange)) , minMapqOfAnchorRead_(minMapqOfAnchorRead) , maxMapqOfInrepeatRead_(maxMapqOfInrepeatRead) { diff --git a/source/profile/ProfileParameters.hh b/source/profile/ProfileParameters.hh index 8fd3a07..0e46138 100644 --- a/source/profile/ProfileParameters.hh +++ b/source/profile/ProfileParameters.hh @@ -24,18 +24,19 @@ #include +#include "common/Interval.hh" + class ProfileWorkflowParameters { public: ProfileWorkflowParameters( - const std::string& outputPrefix, std::string pathToReads, std::string pathToReference, - int shortestUnitToConsider, int longestUnitToConsider, int minMapqOfAnchorRead, int maxMapqOfInrepeatRead); + const std::string& outputPrefix, std::string pathToReads, std::string pathToReference, Interval motifSizeRange, + int minMapqOfAnchorRead, int maxMapqOfInrepeatRead); const std::string& profilePath() const { return profilePath_; } const std::string& pathToReads() const { return pathToReads_; } const std::string& pathToReference() const { return pathToReference_; } - int shortestUnitToConsider() const { return shortestUnitToConsider_; } - int longestUnitToConsider() const { return longestUnitToConsider_; } + const Interval& motifSizeRange() const { return motifSizeRange_; } int minMapqOfAnchorRead() const { return minMapqOfAnchorRead_; } int maxMapqOfInrepeatRead() const { return maxMapqOfInrepeatRead_; } @@ -43,8 +44,7 @@ private: std::string profilePath_; std::string pathToReads_; std::string pathToReference_; - int shortestUnitToConsider_; - int longestUnitToConsider_; + Interval motifSizeRange_; int minMapqOfAnchorRead_; int maxMapqOfInrepeatRead_; }; diff --git a/source/profile/ProfileWorkflow.cpp b/source/profile/ProfileWorkflow.cpp index a866539..94778f1 100644 --- a/source/profile/ProfileWorkflow.cpp +++ b/source/profile/ProfileWorkflow.cpp @@ -58,12 +58,13 @@ int runProfileWorkflow(const ProfileWorkflowParameters& parameters) Read read = readStreamer.decodeRead(); - string repeatUnit; - const ReadType readType - = classifyRead(parameters.maxMapqOfInrepeatRead(), parameters.minMapqOfAnchorRead(), read, repeatUnit); + string motif; + const ReadType readType = classifyRead( + parameters.motifSizeRange(), parameters.maxMapqOfInrepeatRead(), parameters.minMapqOfAnchorRead(), read, + motif); if (readType == ReadType::kIrrRead) { - pairCollector.addIrr(read, repeatUnit); + pairCollector.addIrr(read, motif); } else if (readType == ReadType::kAnchorRead) { @@ -87,7 +88,7 @@ int runProfileWorkflow(const ProfileWorkflowParameters& parameters) for (const auto& kv : irrRegions) { const string unit = kv.first; - if (parameters.shortestUnitToConsider() <= unit.length() && unit.length() <= parameters.longestUnitToConsider()) + if (parameters.motifSizeRange().contains(unit.length())) { units.insert(unit); } diff --git a/source/profile/ReadClassification.cpp b/source/profile/ReadClassification.cpp index 6d30cfc..0b98701 100644 --- a/source/profile/ReadClassification.cpp +++ b/source/profile/ReadClassification.cpp @@ -25,12 +25,12 @@ using std::string; -ReadType classifyRead(int max_irr_mapq, int min_anchor_mapq, const Read& read, string& unit) +ReadType classifyRead(Interval motifSizeRange, int max_irr_mapq, int min_anchor_mapq, const Read& read, string& unit) { const bool is_unmapped = read.flag & 0x4; const bool is_low_mapq = read.mapq <= max_irr_mapq; - const bool is_irr = (is_unmapped || is_low_mapq) && IsInrepeatRead(read.bases, read.quals, unit); + const bool is_irr = (is_unmapped || is_low_mapq) && IsInrepeatRead(read.bases, read.quals, unit, motifSizeRange); if (is_irr) { diff --git a/source/profile/ReadClassification.hh b/source/profile/ReadClassification.hh index 52919b3..b007f3e 100644 --- a/source/profile/ReadClassification.hh +++ b/source/profile/ReadClassification.hh @@ -23,8 +23,10 @@ #include +#include "common/Interval.hh" #include "profile/PairCollector.hh" -ReadType classifyRead(int max_irr_mapq, int min_anchor_mapq, const Read& read, std::string& unit); +ReadType +classifyRead(Interval motifSizeRange, int max_irr_mapq, int min_anchor_mapq, const Read& read, std::string& unit); PairType classifyPair(ReadType read_type, const std::string& read_unit, ReadType mate_type, const std::string& mate_unit); \ No newline at end of file diff --git a/source/purity/CMakeLists.txt b/source/purity/CMakeLists.txt deleted file mode 100644 index 8394ee5..0000000 --- a/source/purity/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_library(purity purity.cc) -target_link_libraries(purity common) -add_subdirectory(unit_tests) \ No newline at end of file diff --git a/source/purity/unit_tests/CMakeLists.txt b/source/purity/unit_tests/CMakeLists.txt deleted file mode 100644 index 34c5a05..0000000 --- a/source/purity/unit_tests/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_executable(purity_test purity_test.cc) -target_link_libraries(purity_test purity gtest gmock_main) -add_test(NAME purity_test COMMAND purity_test) \ No newline at end of file diff --git a/source/reads/IrrFinder.cpp b/source/reads/IrrFinder.cpp index 8885733..031ad29 100644 --- a/source/reads/IrrFinder.cpp +++ b/source/reads/IrrFinder.cpp @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -48,8 +49,15 @@ int32_t MaxMatchesAtOffset(int32_t offset, const std::string& bases) double MatchFrequencyAtOffset(int32_t offset, const string& bases) { - if (offset <= 0 || bases.length() / 2 + 1 <= offset) + if (offset <= 0) + { throw logic_error(to_string(offset) + " is not a valid offset for " + bases); + } + + if (bases.length() / 2 + 1 <= offset) + { + return 0; + } int32_t num_matches = 0; for (size_t position = 0; position != bases.length() - offset; ++position) @@ -61,16 +69,24 @@ double MatchFrequencyAtOffset(int32_t offset, const string& bases) return match_frequency; } -int32_t SmallestFrequentPeriod(double min_frequency, const string& bases) +int SmallestFrequentPeriod(double minFrequency, const string& bases, const Interval& periodSizeRange) { - for (int32_t offset = 1; offset != bases.length() / 2 + 1; ++offset) + const int smallestPeriod = std::max(periodSizeRange.start(), 1); + const int largestPeriod = std::min(periodSizeRange.end(), static_cast(bases.length() / 2 + 1)); + + double maxMatchFrequency = minFrequency; + int bestOffset = -1; + for (int offset = largestPeriod; offset + 1 != smallestPeriod; --offset) { const double match_frequency = MatchFrequencyAtOffset(offset, bases); - if (match_frequency >= min_frequency) - return offset; + if (match_frequency >= maxMatchFrequency) + { + maxMatchFrequency = match_frequency; + bestOffset = offset; + } } - return -1; + return bestOffset; } char ExtractConsensusBase(int32_t offset, int32_t period, const std::string& bases) @@ -97,7 +113,7 @@ char ExtractConsensusBase(int32_t offset, int32_t period, const std::string& bas return consensus_char; } -string ExtractConsensusRepeatUnit(double period, const std::string& bases) +string ExtractConsensusRepeatUnit(double period, const string& bases) { string repeat_unit; for (int32_t offset = 0; offset != period; ++offset) @@ -131,26 +147,31 @@ string ComputeCanonicalRepeatUnit(const string& unit) return minimal_unit; } -string ComputeCanonicalRepeatUnit(double min_frequency, const string& bases) +string ComputeCanonicalRepeatUnit(double minFrequency, const string& bases, const Interval& motifSizeRange) { - const int32_t period = SmallestFrequentPeriod(min_frequency, bases); + const int period = SmallestFrequentPeriod(minFrequency, bases, motifSizeRange); if (period == -1) + { return ""; - string repeat_unit = ExtractConsensusRepeatUnit(period, bases); + } + string motif = ExtractConsensusRepeatUnit(period, bases); const double kPerfectMatchFrequency = 1.0; - const int32_t reduced_period = SmallestFrequentPeriod(kPerfectMatchFrequency, repeat_unit); - if (reduced_period != -1 && reduced_period != period) - repeat_unit = ExtractConsensusRepeatUnit(reduced_period, repeat_unit); - repeat_unit = ComputeCanonicalRepeatUnit(repeat_unit); - return repeat_unit; + const int32_t reducedPeriod = SmallestFrequentPeriod(kPerfectMatchFrequency, motif); + if (reducedPeriod != -1 && reducedPeriod != period) + { + motif = ExtractConsensusRepeatUnit(reducedPeriod, motif); + } + return ComputeCanonicalRepeatUnit(motif); } -bool IsInrepeatRead(const string& bases, const string& quals, string& unit) +bool IsInrepeatRead(const string& bases, const string& quals, string& unit, const Interval& motifSizeRange) { const double min_frequency = 0.8; - unit = ComputeCanonicalRepeatUnit(min_frequency, bases); + unit = ComputeCanonicalRepeatUnit(min_frequency, bases, motifSizeRange); if (unit.empty() || unit == "N") + { return false; + } const vector units = { unit }; const vector> units_shifts = ShiftUnits(units); @@ -159,10 +180,5 @@ bool IsInrepeatRead(const string& bases, const string& quals, string& unit) score /= bases.length(); const double min_score = 0.90; - if (score < min_score) - { - return false; - } - - return true; + return score >= min_score; } \ No newline at end of file diff --git a/source/reads/IrrFinder.hh b/source/reads/IrrFinder.hh index a981732..d449b3b 100644 --- a/source/reads/IrrFinder.hh +++ b/source/reads/IrrFinder.hh @@ -27,12 +27,18 @@ #include -int32_t MaxMatchesAtOffset(int32_t offset, const std::string& bases); -double MatchFrequencyAtOffset(int32_t offset, const std::string& bases); -int32_t SmallestFrequentPeriod(double min_frequency, const std::string& bases); +#include "common/Interval.hh" + +int MaxMatchesAtOffset(int offset, const std::string& bases); +double MatchFrequencyAtOffset(int offset, const std::string& bases); +int SmallestFrequentPeriod( + double minFrequency, const std::string& bases, const Interval& periodSizeRange = Interval(1, 20)); char ExtractConsensusBase(int32_t offset, int32_t period, const std::string& bases); std::string ExtractConsensusRepeatUnit(double period, const std::string& bases); std::string MinimialUnitUnderShift(const std::string& unit); std::string ComputeCanonicalRepeatUnit(const std::string& unit); -std::string ComputeCanonicalRepeatUnit(double min_frequency, const std::string& bases); -bool IsInrepeatRead(const std::string& bases, const std::string& quals, std::string& unit); \ No newline at end of file +std::string ComputeCanonicalRepeatUnit( + double minFrequency, const std::string& bases, const Interval& motifSizeRange = Interval(1, 20)); +bool IsInrepeatRead( + const std::string& bases, const std::string& quals, std::string& unit, + const Interval& motifSizeRange = Interval(1, 20)); \ No newline at end of file diff --git a/source/tests/IrrFinderTest.cpp b/source/tests/IrrFinderTest.cpp new file mode 100644 index 0000000..0c82c56 --- /dev/null +++ b/source/tests/IrrFinderTest.cpp @@ -0,0 +1,222 @@ +// +// ExpansionHunter Denovo +// Copyright 2016-2019 Illumina, Inc. +// All rights reserved. +// +// Author: Egor Dolzhenko +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// + +#include "reads/IrrFinder.hh" + +#include + +#include "thirdparty/catch2/catch.hpp" + +using Catch::Contains; +using std::string; +using std::vector; + +TEST_CASE("Match count calculated for various offsets", "[Determining motif]") +{ + const string bases = "ATCGATCG"; + const size_t num_bases = bases.length(); + REQUIRE(MaxMatchesAtOffset(0, bases) == num_bases); + REQUIRE(MaxMatchesAtOffset(1, bases) == num_bases - 1); + REQUIRE(MaxMatchesAtOffset(2, bases) == num_bases - 2); + REQUIRE(MaxMatchesAtOffset(num_bases, bases) == 0); + REQUIRE(MaxMatchesAtOffset(num_bases + 1, bases) == 0); +} + +TEST_CASE("Match frequency calculated for various offsets", "[Determining motif]") +{ + const string bases = "GGCCCCGGCCCC"; + vector expected_frequencies = { 0.73, 0.40, 0.33, 0.25, 0.57, 1.00 }; + + for (int offset = 1; offset != 7; ++offset) + { + const double expected_frequency = expected_frequencies[offset - 1]; + REQUIRE(MatchFrequencyAtOffset(offset, bases) == Approx(expected_frequency).margin(0.01)); + } +} + +TEST_CASE("Match frequency calculated for imperfect repeat", "[Determining motif]") +{ + const string bases = "ATGATCATGTTGATG"; + const double expected_frequency = 8 / 12.0; + REQUIRE(MatchFrequencyAtOffset(3, bases) == Approx(expected_frequency)); +} + +TEST_CASE("Match frequency calculation throws for on invalid offsets", "[Determining motif]") +{ + const string bases = "GGCCCCGGCCCC"; + const int num_bases = bases.length(); + REQUIRE_THROWS_WITH(MatchFrequencyAtOffset(-1, bases), Contains("not a valid offset")); + REQUIRE_THROWS_WITH(MatchFrequencyAtOffset(0, bases), Contains("not a valid offset")); + // REQUIRE_THROWS_WITH(MatchFrequencyAtOffset(num_bases / 2 + 1, bases), Contains("not a valid offset")); +} + +TEST_CASE("Calculating period for typical sequences", "[Determining motif]") +{ + { + const double min_frequency = 0.85; + const string bases = "GGCCCCGGCCCC"; + const int period = SmallestFrequentPeriod(min_frequency, bases); + REQUIRE(period == 6); + } + + { + const double min_frequency = 0.85; + const string bases = "ATGATCATGATGATGATGATG"; + const int period = SmallestFrequentPeriod(min_frequency, bases); + REQUIRE(period == 6); // 3 might be a better answers + } +} + +TEST_CASE("-1 is used as a special value for no period", "[Determining motif]") +{ + const double min_frequency = 0.85; + const string bases = "ATCGGCTA"; + const int period = SmallestFrequentPeriod(min_frequency, bases); + REQUIRE(period == -1); +} + +TEST_CASE("Consensus base is extracted a sequence", "[Determining motif]") +{ + const string bases = "CGATGACTG"; + const int period = 3; + REQUIRE(ExtractConsensusBase(0, period, bases) == 'C'); + REQUIRE(ExtractConsensusBase(1, period, bases) == 'G'); + REQUIRE(ExtractConsensusBase(2, period, bases) == 'A'); +} + +TEST_CASE("Consensus motif is extracted for a sequence", "[Determining motif]") +{ + { + const string bases = "CGGCGGCGG"; + const int period = 3; + REQUIRE(ExtractConsensusRepeatUnit(period, bases) == "CGG"); + } + { + const string bases = "CGGATTATTATTCGG"; + const int period = 3; + REQUIRE(ExtractConsensusRepeatUnit(period, bases) == "ATT"); + } +} + +TEST_CASE("Minimal motif under shift is computed a motif", "[Determining motif]") +{ + const string repeat_unit = "GGC"; + REQUIRE(MinimialUnitUnderShift(repeat_unit) == "CGG"); +} + +TEST_CASE("Canonical motif computed for a motif", "[Determining motif]") +{ + { + const string repeat_unit = "CGG"; + REQUIRE(ComputeCanonicalRepeatUnit(repeat_unit) == "CCG"); + } + { + const string repeat_unit = "GCC"; + REQUIRE(ComputeCanonicalRepeatUnit(repeat_unit) == "CCG"); + } +} + +TEST_CASE("ComputeCanonicalRepeatUnitForRead, TypicalInrepeatReadsDetected", "[Determining motif]") +{ + { + const string bases = "CGGCGCCGGCGG"; + const double min_frequency = 0.8; + REQUIRE(ComputeCanonicalRepeatUnit(min_frequency, bases) == "CCG"); + REQUIRE(ComputeCanonicalRepeatUnit(min_frequency + 0.05, bases).empty()); + } + + { + const string bases = "ACCCCAACCCCAACCCCAACCCCAACCCCAACCCCA"; + const double min_frequency = 0.8; + REQUIRE(ComputeCanonicalRepeatUnit(min_frequency, bases) == "AACCCC"); + } +} + +TEST_CASE("Compute canonical motif for homopolymer run", "[Determining motif]") +{ + const string bases = "CCCCCCC"; + const double min_frequency = 1.0; + REQUIRE(ComputeCanonicalRepeatUnit(min_frequency, bases) == "C"); +} + +TEST_CASE("Irr check is performed on various reads", "[Determining motif]") +{ + { + const string irr = "CCCCC"; + const string quals = "$$$$$"; + string unit; + REQUIRE(IsInrepeatRead(irr, quals, unit)); + REQUIRE(unit == "C"); + } + { + const string not_irr = "AAAAACCCCC"; + const string quals = "$$$$$$$$$$"; + string unit; + REQUIRE(!IsInrepeatRead(not_irr, quals, unit)); + } + { + const string irr = "TCCACCCACCTCACCCCCCCCCCCCCCCGCCCCCCCCCCACCCCCCCCGCCCCCCCCCCCGGCCCCCCACTCCCCCCCCCCGGTCCTCCCC" + "CCCCCCCACCCTCCCCCCCCGCCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCCC"; + const string quals = "------7----7-----7-777-7-F<--777F777F