Skip to content

Commit

Permalink
separate alleles in M2 isActive and account for one-third substitution factor
Browse files Browse the repository at this point in the history
  • Loading branch information
davidbenjamin committed Sep 10, 2020
1 parent 3502d44 commit e77cf16
Show file tree
Hide file tree
Showing 3 changed files with 187 additions and 33 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;
import it.unimi.dsi.fastutil.bytes.ByteArrayList;
import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
import org.apache.commons.lang3.mutable.MutableLong;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.special.Gamma;
import org.apache.commons.math3.util.FastMath;
import org.apache.logging.log4j.LogManager;
Expand Down Expand Up @@ -51,6 +54,7 @@
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
* Created by davidben on 9/15/16.
Expand Down Expand Up @@ -105,6 +109,9 @@ public final class Mutect2Engine implements AssemblyRegionEvaluator {

private final Optional<F1R2CountsCollector> f1R2CountsCollector;

private PileupQualBuffer tumorPileupQualBuffer = new PileupQualBuffer();
private PileupQualBuffer normalPileupQualBuffer = new PileupQualBuffer();

/**
* Create and initialize a new HaplotypeCallerEngine given a collection of HaplotypeCaller arguments, a reads header,
* and a reference file
Expand Down Expand Up @@ -358,6 +365,9 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer
if ( forceCallingAllelesPresent && features.getValues(MTAC.alleles, ref).stream().anyMatch(vc -> MTAC.forceCallFiltered || vc.isNotFiltered())) {
return new ActivityProfileState(ref.getInterval(), 1.0);
}
// NOTE(review): removed leftover debugging scaffolding -- an empty "if (ref.getStart() == 2916255)"
// block containing only an unused local ("int f = 90;") that existed solely as a breakpoint anchor
// for a hard-coded genomic position and had no runtime effect.

final byte refBase = ref.getBase();
final SimpleInterval refInterval = ref.getInterval();
Expand All @@ -372,18 +382,22 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer
}
final ReadPileup tumorPileup = pileup.makeFilteredPileup(pe -> isTumorSample(ReadUtils.getSampleName(pe.getRead(), header)));
f1R2CountsCollector.ifPresent(collector -> collector.process(tumorPileup, ref));
final List<Byte> tumorAltQuals = altQuals(tumorPileup, refBase, MTAC.pcrSnvQual);
final double tumorLogOdds = logLikelihoodRatio(tumorPileup.size() - tumorAltQuals.size(), tumorAltQuals);
tumorPileupQualBuffer.accumulateQuals(tumorPileup, refBase, MTAC.pcrSnvQual);
final Pair<Integer, ByteArrayList> bestTumorAltAllele = tumorPileupQualBuffer.likeliestIndexAndQuals();
final double tumorLogOdds = logLikelihoodRatio(tumorPileup.size() - bestTumorAltAllele.getRight().size(), bestTumorAltAllele.getRight());

if (tumorLogOdds < MTAC.getInitialLogOdds()) {
return new ActivityProfileState(refInterval, 0.0);
} else if (hasNormal() && !MTAC.genotypeGermlineSites) {
final ReadPileup normalPileup = pileup.makeFilteredPileup(pe -> isNormalSample(ReadUtils.getSampleName(pe.getRead(), header)));
normalPileupQualBuffer.accumulateQuals(normalPileup, refBase, MTAC.pcrSnvQual);
final Pair<Integer, ByteArrayList> bestNormalAltAllele = normalPileupQualBuffer.likeliestIndexAndQuals();
// Only let the normal sample veto activity when its likeliest alt allele is the same allele the
// tumor supports; evidence for a *different* allele in the normal says nothing about this candidate.
// BUG FIX: the original compared bestNormalAltAllele.getLeft() to itself, which is always true, so
// the germline veto applied regardless of which allele the normal actually supported. Compare via
// intValue() to avoid boxed-Integer identity comparison.
if (bestNormalAltAllele.getLeft().intValue() == bestTumorAltAllele.getLeft().intValue()) {
    final int normalAltCount = bestNormalAltAllele.getRight().size();
    final double normalQualSum = normalPileupQualBuffer.qualSum(bestNormalAltAllele.getLeft());
    // strong alt evidence in the normal (both by fraction and by total quality) => germline, not somatic
    if (normalAltCount > normalPileup.size() * MAX_ALT_FRACTION_IN_NORMAL && normalQualSum > MAX_NORMAL_QUAL_SUM) {
        return new ActivityProfileState(refInterval, 0.0);
    }
}
} else if (!MTAC.genotypeGermlineSites) {
final List<VariantContext> germline = features.getValues(MTAC.germlineResource, refInterval);
Expand Down Expand Up @@ -458,33 +472,10 @@ private static byte indelQual(final int indelLength) {
return (byte) Math.min(INDEL_START_QUAL + (indelLength - 1) * INDEL_CONTINUATION_QUAL, Byte.MAX_VALUE);
}

private static List<Byte> altQuals(final ReadPileup pileup, final byte refBase, final int pcrErrorQual) {
final List<Byte> result = new ArrayList<>();
final int position = pileup.getLocation().getStart();

for (final PileupElement pe : pileup) {
final int indelLength = getCurrentOrFollowingIndelLength(pe);
if (indelLength > 0) {
result.add(indelQual(indelLength));
} else if (isNextToUsefulSoftClip(pe)) {
result.add(indelQual(1));
} else if (pe.getBase() != refBase && pe.getQual() > MINIMUM_BASE_QUALITY) {
final GATKRead read = pe.getRead();
final int mateStart = (!read.isProperlyPaired() || read.mateIsUnmapped()) ? Integer.MAX_VALUE : read.getMateStart();
final boolean overlapsMate = mateStart <= position && position < mateStart + read.getLength();
result.add(overlapsMate ? (byte) FastMath.min(pe.getQual(), pcrErrorQual/2) : pe.getQual());
}
}

return result;
}

public static double logLikelihoodRatio(final int refCount, final List<Byte> altQuals) {
return logLikelihoodRatio(refCount, altQuals, 1);
}



/**
* this implements the isActive() algorithm described in docs/mutect/mutect.pdf
* the multiplicative factor is for the special case where we pass a singleton list
Expand Down Expand Up @@ -552,4 +543,81 @@ private void checkSampleInBamHeader(final String sample) {
private String decodeSampleNameIfNecessary(final String name) {
return samplesList.asListOfSamples().contains(name) ? name : IOUtils.urlDecode(name);
}

/**
 * A reusable container that accumulates base qualities for each type of SNV (one bucket per
 * substitution base) and for indels (all indel lengths share a single bucket).
 *
 * <p>A single buffer per sample is reused across pileups to avoid allocating fresh lists on every
 * call to {@code isActive}; callers must invoke {@link #accumulateQuals}, which clears previous
 * state, before reading results via {@link #likeliestIndexAndQuals} or {@link #qualSum}.
 */
private static class PileupQualBuffer {
    private static final int OTHER_SUBSTITUTION = 4;
    private static final int INDEL = 5;

    // our pileup likelihoods models assume that the qual corresponds to the probability that a ref base is misread
    // as the *particular* alt base, whereas the qual actually means the probability of *any* substitution error.
    // since there are three possible substitutions for each ref base we must divide the error probability by three
    // which corresponds to adding 10*log10(3) = 4.77 ~ 5 to the qual.
    private static final int ONE_THIRD_QUAL_CORRECTION = 5;

    // indices 0-3 are A,C,G,T; 4 is other substitution (just in case it's some exotic protocol); 5 is indel
    // final: the buffers themselves are reused and cleared, never reassigned
    private final List<ByteArrayList> buffers = IntStream.range(0, 6).mapToObj(n -> new ByteArrayList()).collect(Collectors.toList());

    /**
     * Clears all buckets, then walks the pileup once, routing each element's evidence quality to the
     * appropriate bucket: indel qualities (including soft-clip proxies) to the indel bucket, and
     * non-ref base qualities above {@code MINIMUM_BASE_QUALITY} to the bucket of the observed base.
     *
     * @param pileup       the pileup at the current locus
     * @param refBase      the reference base at this position
     * @param pcrErrorQual cap (after halving) applied to quals of reads overlapping their mate,
     *                     since mate-overlapping bases may share a single PCR error
     */
    public void accumulateQuals(final ReadPileup pileup, final byte refBase, final int pcrErrorQual) {
        clear();
        final int position = pileup.getLocation().getStart();

        for (final PileupElement pe : pileup) {
            final int indelLength = getCurrentOrFollowingIndelLength(pe);
            if (indelLength > 0) {
                accumulateIndel(indelQual(indelLength));
            } else if (isNextToUsefulSoftClip(pe)) {
                // a soft clip adjacent to this position is weak indel evidence; treat as a length-1 indel
                accumulateIndel(indelQual(1));
            } else if (pe.getBase() != refBase && pe.getQual() > MINIMUM_BASE_QUALITY) {
                final GATKRead read = pe.getRead();
                final int mateStart = (!read.isProperlyPaired() || read.mateIsUnmapped()) ? Integer.MAX_VALUE : read.getMateStart();
                final boolean overlapsMate = mateStart <= position && position < mateStart + read.getLength();
                // overlapping mates are not independent observations -- cap the qual at half the PCR error qual
                accumulateSubstitution(pe.getBase(), overlapsMate ? (byte) FastMath.min(pe.getQual(), pcrErrorQual / 2) : pe.getQual());
            }
        }
    }

    /**
     * Returns the index (0-3 = A,C,G,T; 4 = other substitution; 5 = indel) of the bucket with the
     * greatest total quality, along with that bucket's quals. Ties, and the all-empty case, resolve
     * to the lowest index (an empty bucket when nothing accumulated).
     */
    public Pair<Integer, ByteArrayList> likeliestIndexAndQuals() {
        int bestIndex = 0;
        long bestSum = 0;
        for (int n = 0; n < buffers.size(); n++) {
            final long sum = qualSum(n);
            if (sum > bestSum) {
                bestSum = sum;
                bestIndex = n;
            }
        }
        return ImmutablePair.of(bestIndex, buffers.get(bestIndex));
    }

    // route a substitution qual to its base's bucket, applying the one-third correction (capped at MAX_QUAL);
    // non-ACGT bases go uncorrected to the OTHER_SUBSTITUTION bucket
    private void accumulateSubstitution(final byte base, final byte qual) {
        final int index = BaseUtils.simpleBaseToBaseIndex(base);
        if (index == -1) { // -1 is the hard-coded value for non-simple bases in BaseUtils
            buffers.get(OTHER_SUBSTITUTION).add(qual);
        } else {
            buffers.get(index).add((byte) FastMath.min(qual + ONE_THIRD_QUAL_CORRECTION, QualityUtils.MAX_QUAL));
        }
    }

    private void accumulateIndel(final byte qual) {
        buffers.get(INDEL).add(qual);
    }

    private void clear() {
        buffers.forEach(ByteArrayList::clear);
    }

    /** Sum of all quals in the given bucket; long accumulator cannot overflow for realistic pileup sizes. */
    public long qualSum(final int index) {
        final ByteArrayList list = buffers.get(index);
        long result = 0;
        for (int n = 0; n < list.size(); n++) {
            result += list.getByte(n);
        }
        return result;
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.apache.commons.math.special.Beta;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
Expand All @@ -17,6 +18,10 @@ public class Mutect2EngineUnitTest extends GATKBaseTest {
* Test the active region log likelihood ratio to lowest order. From the Mutect docs the no-variation likelihood is
* prod_alt(eps_j), where eps_j is the error probability of read j and alt means "over all alt reads".
*
* Note that, as in the docs, this assumes the approximation that ref reads have infinite quality -- that is, we
* don't try to squeeze out the last bit of extra variant likelihood by accounting for the chance that some alt reads
* were actually misread as ref reads.
*
* The likelihood for a variant with allele fraction f is (1-f)^N_ref * prod_alt [f(1-eps_j) + (1-f)eps_j].
*
* To leading order in epsilon the log likelihood ratio is
Expand Down Expand Up @@ -62,4 +67,82 @@ public Object[][] getLeadingOrderData() {
};
}

/**
 * We can also (again using the perfect ref reads approximation) exactly calculate the integral when the alt count is small.
 * Assuming a constant error rate eps for the alt reads we have a variant likelihood
 * int_(0 to 1) (1-f)^N_ref * [f(1-eps_j) + (1-f)eps_j]^N_alt df
 *
 * We can expand the binomial raised to the N_alt power explicitly for small N_alt to obtain the sum of N_alt terms
 * in the integrand, each of which is the normalization constant of a Beta distribution with integer shape parameters.
 *
 * @param numRef    number of ref reads (treated as error-free under the approximation above)
 * @param errorRate constant per-read error probability eps assumed for the alt reads
 */
@Test(dataProvider = "fewAltData")
public void testSmallNumAltExact(final int numRef, final double errorRate) {
    for (final int numAlt : new int[] {0, 1, 2}) {
        final double calculated = Mutect2Engine.logLikelihoodRatio(numRef, numAlt, errorRate);
        final double expected;
        switch (numAlt) {
            case 0:
                expected = Math.log(1.0 / (numRef + 1));
                break;
            case 1:
                expected = Math.log((errorRate / (numRef + 2)) + (1 - errorRate) / ((numRef + 1) * (numRef + 2)))
                        - Math.log(errorRate);
                break;
            case 2:
                expected = Math.log((errorRate * errorRate / (numRef + 3)) + (2 * errorRate * (1 - errorRate) / ((numRef + 2) * (numRef + 3)))
                        + (2 * (1 - errorRate) * (1 - errorRate) / ((numRef + 1) * (numRef + 2) * (numRef + 3)))) - 2 * Math.log(errorRate);
                break;
            default:
                throw new GATKException.ShouldNeverReachHereException("Didn't write this test case");
        }

        // we don't really care about high accuracy if things are obvious:
        if (expected < -3.0) {
            Assert.assertTrue(calculated < -1.0);
            continue;
        }

        // looser tolerance where the magnitudes are larger and the approximations rougher
        final double precision;
        if (expected < -2) {
            precision = 2.0;
        } else if (expected < 0) {
            precision = 1.0;
        } else if (expected < 1) {
            precision = 0.5;
        } else {
            precision = 0.25;
        }

        Assert.assertEquals(calculated, expected, precision);
    }
}

/**
 * Cartesian product of ref-read counts and error rates for {@code testSmallNumAltExact}:
 * for each error rate in {0.0001, 0.001, 0.01}, ref counts {1, 5, 10, 100, 1000}.
 */
@DataProvider(name = "fewAltData")
public Object[][] getFewAltData() {
    final int[] refCounts = {1, 5, 10, 100, 1000};
    final double[] errorRates = {0.0001, 0.001, 0.01};

    final Object[][] rows = new Object[errorRates.length * refCounts.length][];
    int row = 0;
    for (final double errorRate : errorRates) {        // error rate varies slowest, as in the original table
        for (final int refCount : refCounts) {
            rows[row++] = new Object[] {refCount, errorRate};
        }
    }
    return rows;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,8 @@ public void testDreamTumorNormal(final File tumor, final Optional<File> normal,
args -> runOrientationFilter ? args.add(M2FiltersArgumentCollection.ARTIFACT_PRIOR_TABLE_NAME, orientationModel) : args);

final File concordanceSummary = createTempFile("concordance", ".txt");
runConcordance(truth, filteredVcf,concordanceSummary, CHROMOSOME_20, mask);
final File truePositivesFalseNegatives = createTempFile("tpfn", ".vcf");
runConcordance(truth, filteredVcf,concordanceSummary, CHROMOSOME_20, mask, Optional.of (truePositivesFalseNegatives));

final List<ConcordanceSummaryRecord> summaryRecords = new ConcordanceSummaryRecord.Reader(concordanceSummary.toPath()).toList();
summaryRecords.forEach(rec -> {
Expand Down Expand Up @@ -216,7 +217,7 @@ public void testTwoDreamTumorSamples(final List<File> tumors, final List<File> n
runFilterMutectCalls(unfilteredVcf, filteredVcf, b37Reference);

final File concordanceSummary = createTempFile("concordance", ".txt");
runConcordance(truth, filteredVcf, concordanceSummary, CHROMOSOME_20, mask);
runConcordance(truth, filteredVcf, concordanceSummary, CHROMOSOME_20, mask, Optional.empty());

final List<ConcordanceSummaryRecord> summaryRecords = new ConcordanceSummaryRecord.Reader(concordanceSummary.toPath()).toList();
summaryRecords.forEach(rec -> {
Expand Down Expand Up @@ -882,13 +883,15 @@ final private void runFilterMutectCalls(final File unfilteredVcf, final File fil
runCommandLine(argsWithAdditions, FilterMutectCalls.class.getSimpleName());
}

private void runConcordance(final File truth, final File eval, final File summary, final String interval, final File mask) {
/**
 * Runs the Concordance tool comparing {@code eval} calls against {@code truth} over the given
 * interval (minus {@code mask}), writing the summary table to {@code summary}.
 *
 * @param truePositivesFalseNegatives if present, a file to receive the TP/FN output of Concordance
 */
private void runConcordance(final File truth, final File eval, final File summary, final String interval, final File mask, final Optional<File> truePositivesFalseNegatives) {
    final ArgumentsBuilder args = new ArgumentsBuilder();
    args.add(Concordance.TRUTH_VARIANTS_LONG_NAME, truth);
    args.add(Concordance.EVAL_VARIANTS_LONG_NAME, eval);
    args.addInterval(new SimpleInterval(interval));
    args.add(IntervalArgumentCollection.EXCLUDE_INTERVALS_LONG_NAME, mask);
    args.add(Concordance.SUMMARY_LONG_NAME, summary);

    // only request the true-positives/false-negatives VCF when the caller supplied a destination
    if (truePositivesFalseNegatives.isPresent()) {
        args.add(Concordance.TRUE_POSITIVES_AND_FALSE_NEGATIVES_SHORT_NAME, truePositivesFalseNegatives.get());
    }

    runCommandLine(args, Concordance.class.getSimpleName());
}

Expand Down

0 comments on commit e77cf16

Please sign in to comment.