Skip to content

Commit

Permalink
1. Add an option to count reads, not fragments, in strand bias annota…
Browse files Browse the repository at this point in the history
…tions. 2. Add an option to turn off base quality correction before local reassembly
  • Loading branch information
takutosato committed Oct 11, 2018
1 parent ce669d1 commit d3d5f3e
Show file tree
Hide file tree
Showing 14 changed files with 185 additions and 50 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@
import org.apache.commons.math3.distribution.BetaDistribution;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.mutect.SomaticGenotypingEngine;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;

import java.util.*;
import java.util.stream.Collectors;

import static org.broadinstitute.hellbender.tools.walkers.annotator.StrandArtifact.ArtifactState.*;

Expand Down Expand Up @@ -88,13 +90,22 @@ public void annotate(final ReferenceContext ref,
return;
}
final int indexOfMaxTumorLod = MathUtils.maxElementIndex(tumorLods);
final Allele altAlelle = vc.getAlternateAllele(indexOfMaxTumorLod);

final Collection<ReadLikelihoods<Allele>.BestAllele> bestAlleles = likelihoods.bestAllelesBreakingTies(g.getSampleName());
final int numFwdAltReads = (int) bestAlleles.stream().filter(ba -> !ba.read.isReverseStrand() && ba.isInformative() && ba.allele.equals(altAlelle)).count();
final int numRevAltReads = (int) bestAlleles.stream().filter(ba -> ba.read.isReverseStrand() && ba.isInformative() && ba.allele.equals(altAlelle)).count();
final int numFwdReads = (int) bestAlleles.stream().filter(ba -> !ba.read.isReverseStrand() && ba.isInformative()).count();
final int numRevReads = (int) bestAlleles.stream().filter(ba -> ba.read.isReverseStrand() && ba.isInformative()).count();
final Allele altAllele = vc.getAlternateAllele(indexOfMaxTumorLod);

final Collection<ReadLikelihoods<Allele>.BestAllele> informativeBestAlleles = likelihoods.bestAllelesBreakingTies(g.getSampleName()).stream().
filter(ba -> ba.isInformative()).collect(Collectors.toList());
final Map<Strand, List<ReadLikelihoods<Allele>.BestAllele>> altReads = informativeBestAlleles.stream().filter(ba -> ba.allele.equals(altAllele))
.collect(Collectors.groupingBy(ba -> ba.read.isReverseStrand() ? Strand.REV : Strand.FWD));
final Map<Strand, List<ReadLikelihoods<Allele>.BestAllele>> refReads = informativeBestAlleles.stream().filter(ba -> ba.allele.equals(vc.getReference()))
.collect(Collectors.groupingBy(ba -> ba.read.isReverseStrand() ? Strand.REV : Strand.FWD));

final int numFwdAltReads = countReads(altReads, Strand.FWD);
final int numRevAltReads = countReads(altReads, Strand.REV);
final int numFwdRefReads = countReads(refReads, Strand.FWD);
final int numRevRefReads = countReads(refReads, Strand.REV);

final int numFwdReads = numFwdRefReads + numFwdAltReads;
final int numRevReads = numRevRefReads + numRevAltReads;
final int numAltReads = numFwdAltReads + numRevAltReads;
final int numReads = numFwdReads + numRevReads;

Expand Down Expand Up @@ -162,6 +173,10 @@ public void annotate(final ReferenceContext ref,
gb.attribute(MAP_ALLELE_FRACTIONS_KEY, estimatedAlleleFractions.values().stream().mapToDouble(Double::doubleValue).toArray());
}

private enum Strand {
FWD, REV
}

@Override
public List<VCFFormatHeaderLine> getDescriptions() {
return Arrays.asList(new VCFFormatHeaderLine(POSTERIOR_PROBABILITIES_KEY, 3, VCFHeaderLineType.Float, "posterior probabilities of the presence of strand artifact"),
Expand All @@ -177,5 +192,19 @@ private double getIntegrandGivenArtifact(final double f, final double epsilon, f
MathUtils.binomialProbability(nNoArtifact, xNoArtifact, f);
}

/**
*
* Count the number of reads in {@param reads}, including the overlapping mates discarded by {@link SomaticGenotypingEngine}.
* @param strand is the strand of the reads we're counting. For example, if strand = Strand.FWD, the we look among {@param reads}
* for reverse reads with {@link SomaticGenotypingEngine.DISCARDED_MATE_READ_TAG} set to 1
*/
private int countReads(final Map<Strand, List<ReadLikelihoods<Allele>.BestAllele>> reads, final Strand strand){
final Strand strandOfKeptRead = strand == Strand.FWD ? Strand.REV : Strand.FWD;

// check of the key in case there are no forward or reverse in {@link reads}
final int numDiscardedReads = ! reads.containsKey(strandOfKeptRead) ? 0 : (int) reads.get(strandOfKeptRead).stream().filter(ba -> ba.read.hasAttribute(SomaticGenotypingEngine.DISCARDED_MATE_READ_TAG)).count();
final int numReads = ! reads.containsKey(strand) ? 0 : reads.get(strand).size();
return numReads + numDiscardedReads;

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.walkers.mutect.SomaticGenotypingEngine;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
Expand Down Expand Up @@ -170,8 +171,14 @@ private static void updateTable(final int[] table, final Allele allele, final GA
final int offset = matchesRef ? 0 : ARRAY_DIM;

// a normal read with an actual strand
final boolean isFW = !read.isReverseStrand();
table[offset + (isFW ? 0 : 1)]++;
final boolean isForward = !read.isReverseStrand();
table[offset + (isForward ? 0 : 1)]++;

// This read's mate got discarded by SomaticGenotypingEngine::clipOverlappingReads()
if (read.hasAttribute(SomaticGenotypingEngine.DISCARDED_MATE_READ_TAG)){
// Note that if this read is forward, then we increment its mate which we assume is reverse
table[offset + (isForward ? 1 : 0)]++;
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ public abstract class AssemblyBasedCallerArgumentCollection extends StandardCall

public static final String MIN_BASE_QUALITY_SCORE_LONG_NAME = "min-base-quality-score";
public static final String SMITH_WATERMAN_LONG_NAME = "smith-waterman";
public static final String CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME = "correct-overlapping-quality";

@ArgumentCollection
public AssemblyRegionTrimmerArgumentCollection assemblyRegionTrimmerArgs = new AssemblyRegionTrimmerArgumentCollection();
Expand Down Expand Up @@ -123,4 +124,7 @@ public abstract class AssemblyBasedCallerArgumentCollection extends StandardCall
@Argument(fullName = SMITH_WATERMAN_LONG_NAME, doc = "Which Smith-Waterman implementation to use, generally FASTEST_AVAILABLE is the right choice", optional = true)
public SmithWatermanAligner.Implementation smithWatermanImplementation = SmithWatermanAligner.Implementation.JAVA;

@Argument(fullName = CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME)
public boolean doNotCorrectOverlappingBaseQualities = false;

}
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ public static void finalizeRegion(final AssemblyRegion region,
final boolean dontUseSoftClippedBases,
final byte minTailQuality,
final SAMFileHeader readsHeader,
final SampleList samplesList) {
final SampleList samplesList,
final boolean correctOverlappingBaseQualities) {
if ( region.isFinalized() ) {
return;
}
Expand Down Expand Up @@ -98,7 +99,7 @@ public static void finalizeRegion(final AssemblyRegion region,
Collections.sort(readsToUse, new ReadCoordinateComparator(readsHeader)); // TODO: sort may be unnecessary here

// handle overlapping read pairs from the same fragment
cleanOverlappingReadPairs(readsToUse, samplesList, readsHeader);
cleanOverlappingReadPairs(readsToUse, samplesList, readsHeader, correctOverlappingBaseQualities);

region.clearReads();
region.addAll(readsToUse);
Expand All @@ -109,12 +110,14 @@ public static void finalizeRegion(final AssemblyRegion region,
* Clean up reads/bases that overlap within read pairs
*
* @param reads the list of reads to consider
* @param correctOverlappingBaseQualities
*/
private static void cleanOverlappingReadPairs(final List<GATKRead> reads, final SampleList samplesList, final SAMFileHeader readsHeader) {
private static void cleanOverlappingReadPairs(final List<GATKRead> reads, final SampleList samplesList, final SAMFileHeader readsHeader,
final boolean correctOverlappingBaseQualities) {
for ( final List<GATKRead> perSampleReadList : splitReadsBySample(samplesList, readsHeader, reads).values() ) {
final FragmentCollection<GATKRead> fragmentCollection = FragmentCollection.create(perSampleReadList);
for ( final List<GATKRead> overlappingPair : fragmentCollection.getOverlappingPairs() ) {
FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair);
FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair, correctOverlappingBaseQualities);
}
}
}
Expand Down Expand Up @@ -238,7 +241,7 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,
final ReferenceSequenceFile referenceReader,
final ReadThreadingAssembler assemblyEngine,
final SmithWatermanAligner aligner){
finalizeRegion(region, argumentCollection.errorCorrectReads, argumentCollection.dontUseSoftClippedBases, (byte)(argumentCollection.minBaseQualityScore - 1), header, sampleList);
finalizeRegion(region, argumentCollection.errorCorrectReads, argumentCollection.dontUseSoftClippedBases, (byte)(argumentCollection.minBaseQualityScore - 1), header, sampleList, ! argumentCollection.doNotCorrectOverlappingBaseQualities);
if( argumentCollection.debug) {
logger.info("Assembling " + region.getSpan() + " with " + region.size() + " reads: (with overlap region = " + region.getExtendedSpan() + ")");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,7 @@ private List<VariantContext> referenceModelForNoVariation(final AssemblyRegion r
//TODO - why the activeRegion cannot manage its own one-time finalization and filtering?
//TODO - perhaps we can remove the last parameter of this method and the three lines bellow?
if ( needsToBeFinalized ) {
finalizeRegion(region);
AssemblyBasedCallerUtils.finalizeRegion(region, hcArgs.errorCorrectReads, hcArgs.dontUseSoftClippedBases, minTailQuality, readsHeader, samplesList, ! hcArgs.doNotCorrectOverlappingBaseQualities);
}
filterNonPassingReads(region);

Expand Down Expand Up @@ -723,10 +723,6 @@ public void shutdown() {

}

private void finalizeRegion(final AssemblyRegion region) {
AssemblyBasedCallerUtils.finalizeRegion(region, hcArgs.errorCorrectReads, hcArgs.dontUseSoftClippedBases, minTailQuality, readsHeader, samplesList);
}

private Set<GATKRead> filterNonPassingReads( final AssemblyRegion activeRegion ) {
// TODO: can we unify this additional filtering with makeStandardHCReadFilter()?

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
public static final String IGNORE_ITR_ARTIFACTS_LONG_NAME = "ignore-itr-artifacts";
public static final String ARTIFACT_PRIOR_TABLE_NAME = "orientation-bias-artifact-priors";
public static final String GET_AF_FROM_AD_LONG_NAME = "get-af-from-ad";
public static final String ANNOTATE_BASED_ON_READS_LONG_NAME = "annotate-fragments";

public static final double DEFAULT_AF_FOR_TUMOR_ONLY_CALLING = 5e-8;
public static final double DEFAULT_AF_FOR_TUMOR_NORMAL_CALLING = 1e-6;
Expand Down Expand Up @@ -169,4 +170,12 @@ public double getDefaultAlleleFrequency() {
@Argument(fullName = GET_AF_FROM_AD_LONG_NAME, doc="Use allelic depth to calculate tumor allele fraction; recommended for mitochondrial applications", optional = true)
public boolean calculateAFfromAD = false;

/**
* If set to true, count an overlapping read pair as two separate reads instead of one for {@link StrandArtifact} and {@link StrandBiasBySample} annotations,
* which is the correct behavior for these annotations. Note that doing so would break the independence assumption of reads and over-count the alt depth in these annotations.
* On the other hand it could prevent spurious false negatives that could arise if by chance one strand in overlapping pairs is dropped disproportionately
*/
@Argument(fullName = ANNOTATE_BASED_ON_READS_LONG_NAME, doc = "If true, strand-based annotations use the number of reads, not fragments")
public boolean annotateBasedOnReads = false;

}
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ public List<VariantContext> callRegion(final AssemblyRegion originalAssemblyRegi
final AssemblyRegion assemblyActiveRegion = AssemblyBasedCallerUtils.assemblyRegionWithWellMappedReads(originalAssemblyRegion, READ_QUALITY_FILTER_THRESHOLD, header);
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(assemblyActiveRegion, givenAlleles, MTAC, header, samplesList, logger, referenceReader, assemblyEngine, aligner);
final SortedSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents(MTAC.maxMnpDistance);
final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(originalAssemblyRegion,allVariationEvents);
final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(originalAssemblyRegion, allVariationEvents);
if (!trimmingResult.isVariationPresent()) {
return NO_CALLS;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import org.apache.commons.math3.util.FastMath;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.StrandBiasTest;
import org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc.AFCalculator;
import org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc.AFCalculatorProvider;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyBasedCallerGenotypingEngine;
Expand All @@ -37,6 +38,7 @@ public class SomaticGenotypingEngine extends AssemblyBasedCallerGenotypingEngine
public final String tumorSample;
private final String normalSample;
final boolean hasNormal;
public static final String DISCARDED_MATE_READ_TAG = "DM";

// {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy
private static final AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() {
Expand Down Expand Up @@ -226,8 +228,8 @@ private PerAlleleCollection<Double> somaticLog10Odds(final LikelihoodMatrix<Alle
}

private void addGenotypes(final LikelihoodMatrix<Allele> tumorLog10Matrix,
final Optional<LikelihoodMatrix<Allele>> normalLog10Matrix,
final VariantContextBuilder callVcb) {
final Optional<LikelihoodMatrix<Allele>> normalLog10Matrix,
final VariantContextBuilder callVcb) {
final double[] tumorAlleleCounts = getEffectiveCounts(tumorLog10Matrix);
final int[] adArray = Arrays.stream(tumorAlleleCounts).mapToInt(x -> (int) FastMath.round(x)).toArray();
final int dp = (int) MathUtils.sum(adArray);
Expand Down Expand Up @@ -325,6 +327,13 @@ private void filterOverlappingReads(final ReadLikelihoods<Allele> likelihoods, f
if (read.allele.equals(mate.allele)) {
// keep the higher-quality read
readsToDiscard.add(read.likelihood < mate.likelihood ? read.read : mate.read);

// mark the read to indicate that its mate was dropped - so that we can account for it in {@link StrandArtifact}
// and {@link StrandBiasBySample}
if (MTAC.annotateBasedOnReads){
final GATKRead readToKeep = read.likelihood >= mate.likelihood ? read.read : mate.read;
readToKeep.setAttribute(DISCARDED_MATE_READ_TAG, 1);
}
} else if (retainMismatches) {
// keep the alt read
readsToDiscard.add(read.allele.equals(ref) ? read.read : mate.read);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,13 @@ private FragmentUtils() {}
* Assumes that firstRead starts before secondRead (according to their soft clipped starts)
* Sets the qualities of clippedFirstRead and clippedSecondRead to mimic a merged read or
* nothing if the algorithm cannot create a meaningful one
*
* @param clippedFirstRead the left most read
* @param clippedFirstRead the left most read
* @param clippedSecondRead the right most read
* @param correctOverlappingBaseQualities
*
*/
public static void adjustQualsOfOverlappingPairedFragments(final GATKRead clippedFirstRead, final GATKRead clippedSecondRead) {
public static void adjustQualsOfOverlappingPairedFragments(final GATKRead clippedFirstRead, final GATKRead clippedSecondRead,
final boolean correctOverlappingBaseQualities) {
Utils.nonNull(clippedFirstRead);
Utils.nonNull(clippedSecondRead);
Utils.validateArg(clippedFirstRead.getName().equals(clippedSecondRead.getName()), () ->
Expand All @@ -51,6 +52,10 @@ public static void adjustQualsOfOverlappingPairedFragments(final GATKRead clippe
final byte[] secondReadQuals = clippedSecondRead.getBaseQualities();

for ( int i = 0; i < numOverlappingBases; i++ ) {
if (! correctOverlappingBaseQualities) {
break;
}

final int firstReadIndex = firstReadStop + i;
final byte firstReadBase = firstReadBases[firstReadIndex];
final byte secondReadBase = secondReadBases[i];
Expand All @@ -69,17 +74,17 @@ public static void adjustQualsOfOverlappingPairedFragments(final GATKRead clippe
clippedSecondRead.setBaseQualities(secondReadQuals);
}

public static void adjustQualsOfOverlappingPairedFragments( final List<GATKRead> overlappingPair ) {
public static void adjustQualsOfOverlappingPairedFragments(final List<GATKRead> overlappingPair, final boolean correctOverlappingBaseQualities) {
Utils.validateArg( overlappingPair.size() == 2, () -> "Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2.");

final GATKRead firstRead = overlappingPair.get(0);
final GATKRead secondRead = overlappingPair.get(1);

if ( secondRead.getSoftStart() < firstRead.getSoftStart() ) {
adjustQualsOfOverlappingPairedFragments(secondRead, firstRead);
adjustQualsOfOverlappingPairedFragments(secondRead, firstRead, correctOverlappingBaseQualities);
}
else {
adjustQualsOfOverlappingPairedFragments(firstRead, secondRead);
adjustQualsOfOverlappingPairedFragments(firstRead, secondRead, correctOverlappingBaseQualities);
}
}
}
Loading

0 comments on commit d3d5f3e

Please sign in to comment.