Skip to content

Commit

Permalink
fix: underflow in HaplotypeProbabilitiesFromContaminatorSequence (#1697)
Browse files Browse the repository at this point in the history
* fix: underflow in HaplotypeProbabilitiesFromContaminatorSequence, which affects both IdentifyContamination and ExtractFingerprint in deep coverage. This was leading to non-informative sites in RNA and similar sequencing data.
  • Loading branch information
Yossi Farjoun authored Jul 14, 2021
1 parent 7172f0b commit e7cda05
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 15 deletions.
4 changes: 3 additions & 1 deletion src/main/java/picard/fingerprint/FingerprintUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

package picard.fingerprint;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
Expand Down Expand Up @@ -143,7 +144,8 @@ public static VariantContextSet createVCSetFromFingerprint(final Fingerprint fin
return variantContexts;
}

private static VariantContext getVariantContext(final ReferenceSequenceFile reference,
@VisibleForTesting
static VariantContext getVariantContext(final ReferenceSequenceFile reference,
final String sample,
final HaplotypeProbabilities haplotypeProbabilities) {
final Snp snp = haplotypeProbabilities.getRepresentativeSnp();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ public class HaplotypeProbabilitiesFromContaminatorSequence extends HaplotypePro

public double contamination;

// for each model (contGenotype, mainGenotype) there's a likelihood of the data. These need to be collected separately
// for each model (contGenotype, mainGenotype) there's a LogLikelihood of the data. These need to be collected separately
// and only collated once all the data is in.
private final double[][] likelihoodMap = {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
private final double[][] logLikelihoodMap = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
private boolean valuesNeedUpdating = true;

public HaplotypeProbabilitiesFromContaminatorSequence(final HaplotypeBlock haplotypeBlock, final double contamination) {
Expand All @@ -58,7 +58,7 @@ public HaplotypeProbabilitiesFromContaminatorSequence(HaplotypeProbabilitiesFrom

contamination = other.contamination;
for (final Genotype g : Genotype.values()) {
System.arraycopy(other.likelihoodMap[g.v], 0, likelihoodMap[g.v], 0, NUM_GENOTYPES);
System.arraycopy(other.logLikelihoodMap[g.v], 0, logLikelihoodMap[g.v], 0, NUM_GENOTYPES);
}
}

Expand Down Expand Up @@ -92,9 +92,9 @@ public void addToProbs(final Snp snp, final byte base, final byte qual) {
for (final Genotype mainGeno : Genotype.values()) {
// theta is the expected frequency of the alternate allele
final double theta = 0.5 * ((1 - contamination) * mainGeno.v + contamination * contGeno.v);
likelihoodMap[contGeno.v][mainGeno.v] *=
(( altAllele ? theta : (1 - theta)) * (1 - pErr) +
(!altAllele ? theta : (1 - theta)) * pErr );
logLikelihoodMap[contGeno.v][mainGeno.v] +=
Math.log10((( altAllele ? theta : (1 - theta)) * (1 - pErr) +
(!altAllele ? theta : (1 - theta)) * pErr ));
}
}
}
Expand All @@ -108,7 +108,7 @@ private void updateLikelihoods() {
final double[] ll = new double[NUM_GENOTYPES];
for (final Genotype contGeno : Genotype.values()) {
// p(a | g_c) = \sum_g_m { P(g_m) \prod_i P(a_i| g_m, g_c)}
ll[contGeno.v] = Math.log10(Double.MIN_VALUE + MathUtil.sum(MathUtil.multiply(this.getPriorProbablities(), likelihoodMap[contGeno.v])));
ll[contGeno.v] = MathUtil.LOG_10_MATH.sum( MathUtil.sum(MathUtil.LOG_10_MATH.getLogValue(this.getPriorProbablities()), logLikelihoodMap[contGeno.v]));
}
setLogLikelihoods(ll);
}
Expand Down Expand Up @@ -137,7 +137,7 @@ public HaplotypeProbabilitiesFromContaminatorSequence merge(final HaplotypeProba
}

for (final Genotype contGeno : Genotype.values()) {
this.likelihoodMap[contGeno.v] = MathUtil.multiply(this.likelihoodMap[contGeno.v], o.likelihoodMap[contGeno.v]);
this.logLikelihoodMap[contGeno.v] = MathUtil.sum(this.logLikelihoodMap[contGeno.v], o.logLikelihoodMap[contGeno.v]);
}
valuesNeedUpdating = true;
return this;
Expand Down
43 changes: 39 additions & 4 deletions src/test/java/picard/fingerprint/FingerprintCheckerTest.java
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
package picard.fingerprint;

import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import picard.PicardException;
import picard.util.MathUtil;
import picard.vcf.VcfTestUtils;

import java.io.File;
Expand All @@ -27,14 +31,15 @@
import java.util.stream.Collectors;

import static picard.util.TestNGUtil.compareDoubleWithAccuracy;
import static picard.util.TestNGUtil.interaction;

/**
* Created by farjoun on 8/27/15.
*/
public class FingerprintCheckerTest {

private final double maf = 0.4;
private final Snp snp = new Snp("test", "chr1", 1, (byte) 'A', (byte) 'C', maf, Collections.singletonList("dummy"));
private final Snp snp = new Snp("test", "chr1", 1, (byte) 'A', (byte) 'T', maf, Collections.singletonList("dummy"));
private final HaplotypeBlock hb = new HaplotypeBlock(maf);

private static final double DELTA = 1e-6;
Expand Down Expand Up @@ -316,8 +321,38 @@ public void testQueryable(final File vcf, boolean expectedQueryable) {
}
}

// Read-depth (observation-count) values used to stress deep-coverage handling;
// combined with haplotypeProbabilities() by the deepDataProvider() cross product.
Object[][] deepData() {
    return new Object[][]{{10}, {50}, {100}, {500}, {1000}, {5000}, {10000}, {50000}};
}

// The two HaplotypeProbabilities implementations under test: the plain
// sequence-based one, and the contaminator-based one with near-total (0.999) contamination.
Object[][] haplotypeProbabilities() {
    return new Object[][]{{new HaplotypeProbabilitiesFromSequence(hb)}, {new HaplotypeProbabilitiesFromContaminatorSequence(hb, 0.999)}};
}

// Cross product of the probability implementations and the coverage depths,
// so every implementation is exercised at every depth.
@DataProvider
Iterator<Object[]> deepDataProvider(){
    return interaction(haplotypeProbabilities(), deepData());
}

// Regression test for the underflow fixed in this commit: at deep coverage the
// accumulated likelihoods must stay informative (non-flat) and the resulting
// VariantContext must carry non-trivial PL values.
@Test(dataProvider = "deepDataProvider")
void testCanHandleDeepData(final HaplotypeProbabilitiesFromSequence hp, final int counts) throws IOException {

    // Observe both alleles 'counts' times each. With multiplied (non-log) likelihoods
    // this underflowed to zero at high counts, making the site non-informative.
    addObservation(hp, hb, counts, hb.getFirstSnp().getAllele1());
    addObservation(hp, hb, counts, hb.getFirstSnp().getAllele2());

    final double[] logLikelihoods = hp.getLogLikelihoods();
    // At least one genotype must be down-weighted; an all-zero log-likelihood vector
    // would be the underflow symptom this commit fixes.
    Assert.assertTrue( MathUtil.min(logLikelihoods) < 0 );

    final File fasta = new File(TEST_DATA_DIR, "reference.fasta");

    try (final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(fasta)) {
        // Round-trip through FingerprintUtils: informative likelihoods should yield
        // at least one strictly positive phred-scaled PL in the emitted genotype.
        final VariantContext vc = FingerprintUtils.getVariantContext(ref, "test", hp);
        Assert.assertTrue(MathUtil.max(MathUtil.promote(vc.getGenotype(0).getPL())) > 0);
    }
}

@DataProvider()
Object[][] mergeIsDafeProvider() {
Object[][] mergeIsSafeProvider() {
final HaplotypeProbabilitiesFromSequence hp1 = new HaplotypeProbabilitiesFromSequence(hb);
final HaplotypeProbabilitiesFromSequence hp2 = new HaplotypeProbabilitiesFromSequence(hb);

Expand Down Expand Up @@ -353,7 +388,7 @@ private static void addObservation(final HaplotypeProbabilitiesFromSequence hapl
}
}

@Test(dataProvider = "mergeIsDafeProvider")
@Test(dataProvider = "mergeIsSafeProvider")
public void testMergeHaplotypeProbabilitiesIsSafe(final HaplotypeProbabilities hp1,
final HaplotypeProbabilities hp2) {

Expand All @@ -363,7 +398,7 @@ public void testMergeHaplotypeProbabilitiesIsSafe(final HaplotypeProbabilities h
Assert.assertEquals(merged1.getLikelihoods(), merged2.getLikelihoods());
}

@Test(dataProvider = "mergeIsDafeProvider")
@Test(dataProvider = "mergeIsSafeProvider")
public void testMergeFingerprintIsSafe(final HaplotypeProbabilities hp1, final HaplotypeProbabilities hp2) {

final Fingerprint fpA = new Fingerprint("test2", null, "none");
Expand Down
17 changes: 15 additions & 2 deletions src/test/java/picard/util/TestNGUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,24 @@
*/
public class TestNGUtil {

// Not actually the smallest possible value on purpose, since that would be indistinguishable from 0 and then useless.
// This is small enough to be meaningless, but representable.
static final double EPSILON = 1e-300; // a constant near the smallest positive value representable in a double.

// not actually the smallest possible value on purpose, since that would be indistinguishable from 0 and then useless.
// This is small enough to be meaningless, but representable.
// A method that takes two Object[][] (normally from two data providers) and creates a third containing all possible combinations of their rows.
/**
 * Builds the cross product of two data-provider tables: each row of
 * {@code dataprovider1} is concatenated with each row of {@code dataprovider2}.
 *
 * @param dataprovider1 rows contributing the leading arguments of each combined test case
 * @param dataprovider2 rows contributing the trailing arguments of each combined test case
 * @return an iterator over all {@code dataprovider1.length * dataprovider2.length} combined rows,
 *         ordered with the second provider varying fastest
 */
public static Iterator<Object[]> interaction(final Object[][] dataprovider1, final Object[][] dataprovider2) {
    final List<Object[]> combined = new ArrayList<>(dataprovider1.length * dataprovider2.length);
    for (int i = 0; i < dataprovider1.length; i++) {
        for (int j = 0; j < dataprovider2.length; j++) {
            combined.add(concatenate(dataprovider1[i], dataprovider2[j]));
        }
    }
    return combined.iterator();
}

/** Returns a new array holding the elements of {@code head} followed by those of {@code tail}. */
private static Object[] concatenate(final Object[] head, final Object[] tail) {
    final Object[] merged = new Object[head.length + tail.length];
    System.arraycopy(head, 0, merged, 0, head.length);
    System.arraycopy(tail, 0, merged, head.length, tail.length);
    return merged;
}

public static abstract class SingleTestUnitTest<TestClazz extends TestNGParameterizable> {
@DataProvider(name = "testcases")
Expand Down

0 comments on commit e7cda05

Please sign in to comment.