From eebcb6e33062eed340dcc4cebfb7f6a42d6fe298 Mon Sep 17 00:00:00 2001 From: Takuto Sato Date: Thu, 11 Oct 2018 14:56:04 -0400 Subject: [PATCH] 1. Add an option to count reads, not fragments, in strand bias annotations. 2. Add an option to turn off base quality correction before local reassembly (#5286) --- .../walkers/annotator/StrandArtifact.java | 43 ++++++++-- .../walkers/annotator/StrandBiasTest.java | 11 ++- ...AssemblyBasedCallerArgumentCollection.java | 4 + .../AssemblyBasedCallerUtils.java | 13 +-- .../HaplotypeCallerEngine.java | 6 +- .../walkers/mutect/M2ArgumentCollection.java | 9 +++ .../tools/walkers/mutect/Mutect2Engine.java | 2 +- .../mutect/SomaticGenotypingEngine.java | 13 ++- .../utils/fragments/FragmentUtils.java | 17 ++-- .../AssemblyBasedCallerUtilsUnitTest.java | 3 +- .../tools/walkers/mutect/M2TestingUtils.java | 22 +++-- .../mutect/Mutect2IntegrationTest.java | 80 ++++++++++++++++++- .../CollectF1R2CountsIntegrationTest.java | 10 +-- .../fragments/FragmentUtilsUnitTest.java | 2 +- 14 files changed, 185 insertions(+), 50 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandArtifact.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandArtifact.java index 68966a08ea3..8fef02e4cab 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandArtifact.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandArtifact.java @@ -9,6 +9,7 @@ import org.apache.commons.math3.distribution.BetaDistribution; import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.tools.walkers.mutect.SomaticGenotypingEngine; import org.broadinstitute.hellbender.utils.*; import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods; import org.broadinstitute.hellbender.utils.help.HelpConstants; @@ -16,6 +17,7 @@ import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import java.util.*; +import java.util.stream.Collectors; import static org.broadinstitute.hellbender.tools.walkers.annotator.StrandArtifact.ArtifactState.*; @@ -88,13 +90,22 @@ public void annotate(final ReferenceContext ref, return; } final int indexOfMaxTumorLod = MathUtils.maxElementIndex(tumorLods); - final Allele altAlelle = vc.getAlternateAllele(indexOfMaxTumorLod); - - final Collection.BestAllele> bestAlleles = likelihoods.bestAllelesBreakingTies(g.getSampleName()); - final int numFwdAltReads = (int) bestAlleles.stream().filter(ba -> !ba.read.isReverseStrand() && ba.isInformative() && ba.allele.equals(altAlelle)).count(); - final int numRevAltReads = (int) bestAlleles.stream().filter(ba -> ba.read.isReverseStrand() && ba.isInformative() && ba.allele.equals(altAlelle)).count(); - final int numFwdReads = (int) bestAlleles.stream().filter(ba -> !ba.read.isReverseStrand() && ba.isInformative()).count(); - final int numRevReads = (int) bestAlleles.stream().filter(ba -> ba.read.isReverseStrand() && ba.isInformative()).count(); + final Allele altAllele = vc.getAlternateAllele(indexOfMaxTumorLod); + + final Collection.BestAllele> informativeBestAlleles = likelihoods.bestAllelesBreakingTies(g.getSampleName()).stream(). + filter(ba -> ba.isInformative()).collect(Collectors.toList()); + final Map.BestAllele>> altReads = informativeBestAlleles.stream().filter(ba -> ba.allele.equals(altAllele)) + .collect(Collectors.groupingBy(ba -> ba.read.isReverseStrand() ? Strand.REV : Strand.FWD)); + final Map.BestAllele>> refReads = informativeBestAlleles.stream().filter(ba -> ba.allele.equals(vc.getReference())) + .collect(Collectors.groupingBy(ba -> ba.read.isReverseStrand() ? Strand.REV : Strand.FWD)); + + final int numFwdAltReads = countReads(altReads, Strand.FWD); + final int numRevAltReads = countReads(altReads, Strand.REV); + final int numFwdRefReads = countReads(refReads, Strand.FWD); + final int numRevRefReads = countReads(refReads, Strand.REV); + + final int numFwdReads = numFwdRefReads + numFwdAltReads; + final int numRevReads = numRevRefReads + numRevAltReads; final int numAltReads = numFwdAltReads + numRevAltReads; final int numReads = numFwdReads + numRevReads; @@ -162,6 +173,10 @@ public void annotate(final ReferenceContext ref, gb.attribute(MAP_ALLELE_FRACTIONS_KEY, estimatedAlleleFractions.values().stream().mapToDouble(Double::doubleValue).toArray()); } + private enum Strand { + FWD, REV + } + @Override public List getDescriptions() { return Arrays.asList(new VCFFormatHeaderLine(POSTERIOR_PROBABILITIES_KEY, 3, VCFHeaderLineType.Float, "posterior probabilities of the presence of strand artifact"), @@ -177,5 +192,19 @@ private double getIntegrandGivenArtifact(final double f, final double epsilon, f MathUtils.binomialProbability(nNoArtifact, xNoArtifact, f); } + /** + * + * Count the number of reads in {@param reads}, including the overlapping mates discarded by {@link SomaticGenotypingEngine}. + * @param strand is the strand of the reads we're counting. For example, if strand = Strand.FWD, the we look among {@param reads} + * for reverse reads with {@link SomaticGenotypingEngine.DISCARDED_MATE_READ_TAG} set to 1 + */ + private int countReads(final Map.BestAllele>> reads, final Strand strand){ + final Strand strandOfKeptRead = strand == Strand.FWD ? Strand.REV : Strand.FWD; + // check of the key in case there are no forward or reverse in {@link reads} + final int numDiscardedReads = ! reads.containsKey(strandOfKeptRead) ? 0 : (int) reads.get(strandOfKeptRead).stream().filter(ba -> ba.read.hasAttribute(SomaticGenotypingEngine.DISCARDED_MATE_READ_TAG)).count(); + final int numReads = ! reads.containsKey(strand) ? 0 : reads.get(strand).size(); + return numReads + numDiscardedReads; + + } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandBiasTest.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandBiasTest.java index 9f8c68ed7f1..2697b2c6666 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandBiasTest.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/StrandBiasTest.java @@ -6,6 +6,7 @@ import htsjdk.variant.variantcontext.VariantContext; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.walkers.mutect.SomaticGenotypingEngine; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods; import org.broadinstitute.hellbender.utils.pileup.PileupElement; @@ -170,8 +171,14 @@ private static void updateTable(final int[] table, final Allele allele, final GA final int offset = matchesRef ? 0 : ARRAY_DIM; // a normal read with an actual strand - final boolean isFW = !read.isReverseStrand(); - table[offset + (isFW ? 0 : 1)]++; + final boolean isForward = !read.isReverseStrand(); + table[offset + (isForward ? 0 : 1)]++; + + // This read's mate got discarded by SomaticGenotypingEngine::clipOverlappingReads() + if (read.hasAttribute(SomaticGenotypingEngine.DISCARDED_MATE_READ_TAG)){ + // Note that if this read is forward, then we increment its mate which we assume is reverse + table[offset + (isForward ? 1 : 0)]++; + } } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerArgumentCollection.java index 6a298e695be..aece2f33e70 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerArgumentCollection.java @@ -24,6 +24,7 @@ public abstract class AssemblyBasedCallerArgumentCollection extends StandardCall public static final String MIN_BASE_QUALITY_SCORE_LONG_NAME = "min-base-quality-score"; public static final String SMITH_WATERMAN_LONG_NAME = "smith-waterman"; + public static final String CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME = "correct-overlapping-quality"; @ArgumentCollection public AssemblyRegionTrimmerArgumentCollection assemblyRegionTrimmerArgs = new AssemblyRegionTrimmerArgumentCollection(); @@ -123,4 +124,7 @@ public abstract class AssemblyBasedCallerArgumentCollection extends StandardCall @Argument(fullName = SMITH_WATERMAN_LONG_NAME, doc = "Which Smith-Waterman implementation to use, generally FASTEST_AVAILABLE is the right choice", optional = true) public SmithWatermanAligner.Implementation smithWatermanImplementation = SmithWatermanAligner.Implementation.JAVA; + @Argument(fullName = CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME) + public boolean doNotCorrectOverlappingBaseQualities = false; + } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtils.java index c390d31045c..ee39300bd38 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtils.java @@ -64,7 +64,8 @@ public static void finalizeRegion(final AssemblyRegion region, final boolean dontUseSoftClippedBases, final byte minTailQuality, final SAMFileHeader readsHeader, - final SampleList samplesList) { + final SampleList samplesList, + final boolean correctOverlappingBaseQualities) { if ( region.isFinalized() ) { return; } @@ -98,7 +99,7 @@ public static void finalizeRegion(final AssemblyRegion region, Collections.sort(readsToUse, new ReadCoordinateComparator(readsHeader)); // TODO: sort may be unnecessary here // handle overlapping read pairs from the same fragment - cleanOverlappingReadPairs(readsToUse, samplesList, readsHeader); + cleanOverlappingReadPairs(readsToUse, samplesList, readsHeader, correctOverlappingBaseQualities); region.clearReads(); region.addAll(readsToUse); @@ -109,12 +110,14 @@ public static void finalizeRegion(final AssemblyRegion region, * Clean up reads/bases that overlap within read pairs * * @param reads the list of reads to consider + * @param correctOverlappingBaseQualities */ - private static void cleanOverlappingReadPairs(final List reads, final SampleList samplesList, final SAMFileHeader readsHeader) { + private static void cleanOverlappingReadPairs(final List reads, final SampleList samplesList, final SAMFileHeader readsHeader, + final boolean correctOverlappingBaseQualities) { for ( final List perSampleReadList : splitReadsBySample(samplesList, readsHeader, reads).values() ) { final FragmentCollection fragmentCollection = FragmentCollection.create(perSampleReadList); for ( final List overlappingPair : fragmentCollection.getOverlappingPairs() ) { - FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair); + FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair, correctOverlappingBaseQualities); } } } @@ -238,7 +241,7 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region, final ReferenceSequenceFile referenceReader, final ReadThreadingAssembler assemblyEngine, final SmithWatermanAligner aligner){ - finalizeRegion(region, argumentCollection.errorCorrectReads, argumentCollection.dontUseSoftClippedBases, (byte)(argumentCollection.minBaseQualityScore - 1), header, sampleList); + finalizeRegion(region, argumentCollection.errorCorrectReads, argumentCollection.dontUseSoftClippedBases, (byte)(argumentCollection.minBaseQualityScore - 1), header, sampleList, ! argumentCollection.doNotCorrectOverlappingBaseQualities); if( argumentCollection.debug) { logger.info("Assembling " + region.getSpan() + " with " + region.size() + " reads: (with overlap region = " + region.getExtendedSpan() + ")"); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java index 960da7dff5d..8f8546a14db 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java @@ -673,7 +673,7 @@ private List referenceModelForNoVariation(final AssemblyRegion r //TODO - why the activeRegion cannot manage its own one-time finalization and filtering? //TODO - perhaps we can remove the last parameter of this method and the three lines bellow? if ( needsToBeFinalized ) { - finalizeRegion(region); + AssemblyBasedCallerUtils.finalizeRegion(region, hcArgs.errorCorrectReads, hcArgs.dontUseSoftClippedBases, minTailQuality, readsHeader, samplesList, ! hcArgs.doNotCorrectOverlappingBaseQualities); } filterNonPassingReads(region); @@ -723,10 +723,6 @@ public void shutdown() { } - private void finalizeRegion(final AssemblyRegion region) { - AssemblyBasedCallerUtils.finalizeRegion(region, hcArgs.errorCorrectReads, hcArgs.dontUseSoftClippedBases, minTailQuality, readsHeader, samplesList); - } - private Set filterNonPassingReads( final AssemblyRegion activeRegion ) { // TODO: can we unify this additional filtering with makeStandardHCReadFilter()? diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java index c7f8fa2ca53..b70cdb0fe7c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java @@ -37,6 +37,7 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection public static final String IGNORE_ITR_ARTIFACTS_LONG_NAME = "ignore-itr-artifacts"; public static final String ARTIFACT_PRIOR_TABLE_NAME = "orientation-bias-artifact-priors"; public static final String GET_AF_FROM_AD_LONG_NAME = "get-af-from-ad"; + public static final String ANNOTATE_BASED_ON_READS_LONG_NAME = "annotate-fragments"; public static final double DEFAULT_AF_FOR_TUMOR_ONLY_CALLING = 5e-8; public static final double DEFAULT_AF_FOR_TUMOR_NORMAL_CALLING = 1e-6; @@ -169,4 +170,12 @@ public double getDefaultAlleleFrequency() { @Argument(fullName = GET_AF_FROM_AD_LONG_NAME, doc="Use allelic depth to calculate tumor allele fraction; recommended for mitochondrial applications", optional = true) public boolean calculateAFfromAD = false; + /** + * If set to true, count an overlapping read pair as two separate reads instead of one for {@link StrandArtifact} and {@link StrandBiasBySample} annotations, + * which is the correct behavior for these annotations. Note that doing so would break the independence assumption of reads and over-count the alt depth in these annotations. + * On the other hand it could prevent spurious false negatives that could arise if by chance one strand in overlapping pairs is dropped disproportionately + */ + @Argument(fullName = ANNOTATE_BASED_ON_READS_LONG_NAME, doc = "If true, strand-based annotations use the number of reads, not fragments") + public boolean annotateBasedOnReads = false; + } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java index d5269ee92b0..5169dc8e2d9 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java @@ -190,7 +190,7 @@ public List callRegion(final AssemblyRegion originalAssemblyRegi final AssemblyRegion assemblyActiveRegion = AssemblyBasedCallerUtils.assemblyRegionWithWellMappedReads(originalAssemblyRegion, READ_QUALITY_FILTER_THRESHOLD, header); final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(assemblyActiveRegion, givenAlleles, MTAC, header, samplesList, logger, referenceReader, assemblyEngine, aligner); final SortedSet allVariationEvents = untrimmedAssemblyResult.getVariationEvents(MTAC.maxMnpDistance); - final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(originalAssemblyRegion,allVariationEvents); + final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(originalAssemblyRegion, allVariationEvents); if (!trimmingResult.isVariationPresent()) { return NO_CALLS; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java index 2e6a7a79fce..e79c1d7135c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java @@ -11,6 +11,7 @@ import org.apache.commons.math3.util.FastMath; import org.broadinstitute.hellbender.engine.FeatureContext; import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.tools.walkers.annotator.StrandBiasTest; import org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc.AFCalculator; import org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc.AFCalculatorProvider; import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyBasedCallerGenotypingEngine; @@ -37,6 +38,7 @@ public class SomaticGenotypingEngine extends AssemblyBasedCallerGenotypingEngine public final String tumorSample; private final String normalSample; final boolean hasNormal; + public static final String DISCARDED_MATE_READ_TAG = "DM"; // {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy private static final AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() { @@ -226,8 +228,8 @@ private PerAlleleCollection somaticLog10Odds(final LikelihoodMatrix tumorLog10Matrix, - final Optional> normalLog10Matrix, - final VariantContextBuilder callVcb) { + final Optional> normalLog10Matrix, + final VariantContextBuilder callVcb) { final double[] tumorAlleleCounts = getEffectiveCounts(tumorLog10Matrix); final int[] adArray = Arrays.stream(tumorAlleleCounts).mapToInt(x -> (int) FastMath.round(x)).toArray(); final int dp = (int) MathUtils.sum(adArray); @@ -325,6 +327,13 @@ private void filterOverlappingReads(final ReadLikelihoods likelihoods, f if (read.allele.equals(mate.allele)) { // keep the higher-quality read readsToDiscard.add(read.likelihood < mate.likelihood ? read.read : mate.read); + + // mark the read to indicate that its mate was dropped - so that we can account for it in {@link StrandArtifact} + // and {@link StrandBiasBySample} + if (MTAC.annotateBasedOnReads){ + final GATKRead readToKeep = read.likelihood >= mate.likelihood ? read.read : mate.read; + readToKeep.setAttribute(DISCARDED_MATE_READ_TAG, 1); + } } else if (retainMismatches) { // keep the alt read readsToDiscard.add(read.allele.equals(ref) ? read.read : mate.read); diff --git a/src/main/java/org/broadinstitute/hellbender/utils/fragments/FragmentUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/fragments/FragmentUtils.java index fac859e2cf1..22a4c5be0b1 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/fragments/FragmentUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/fragments/FragmentUtils.java @@ -25,12 +25,13 @@ private FragmentUtils() {} * Assumes that firstRead starts before secondRead (according to their soft clipped starts) * Sets the qualities of clippedFirstRead and clippedSecondRead to mimic a merged read or * nothing if the algorithm cannot create a meaningful one - * - * @param clippedFirstRead the left most read + * @param clippedFirstRead the left most read * @param clippedSecondRead the right most read + * @param correctOverlappingBaseQualities * */ - public static void adjustQualsOfOverlappingPairedFragments(final GATKRead clippedFirstRead, final GATKRead clippedSecondRead) { + public static void adjustQualsOfOverlappingPairedFragments(final GATKRead clippedFirstRead, final GATKRead clippedSecondRead, + final boolean correctOverlappingBaseQualities) { Utils.nonNull(clippedFirstRead); Utils.nonNull(clippedSecondRead); Utils.validateArg(clippedFirstRead.getName().equals(clippedSecondRead.getName()), () -> @@ -51,6 +52,10 @@ public static void adjustQualsOfOverlappingPairedFragments(final GATKRead clippe final byte[] secondReadQuals = clippedSecondRead.getBaseQualities(); for ( int i = 0; i < numOverlappingBases; i++ ) { + if (! correctOverlappingBaseQualities) { + break; + } + final int firstReadIndex = firstReadStop + i; final byte firstReadBase = firstReadBases[firstReadIndex]; final byte secondReadBase = secondReadBases[i]; @@ -69,17 +74,17 @@ public static void adjustQualsOfOverlappingPairedFragments(final GATKRead clippe clippedSecondRead.setBaseQualities(secondReadQuals); } - public static void adjustQualsOfOverlappingPairedFragments( final List overlappingPair ) { + public static void adjustQualsOfOverlappingPairedFragments(final List overlappingPair, final boolean correctOverlappingBaseQualities) { Utils.validateArg( overlappingPair.size() == 2, () -> "Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); final GATKRead firstRead = overlappingPair.get(0); final GATKRead secondRead = overlappingPair.get(1); if ( secondRead.getSoftStart() < firstRead.getSoftStart() ) { - adjustQualsOfOverlappingPairedFragments(secondRead, firstRead); + adjustQualsOfOverlappingPairedFragments(secondRead, firstRead, correctOverlappingBaseQualities); } else { - adjustQualsOfOverlappingPairedFragments(firstRead, secondRead); + adjustQualsOfOverlappingPairedFragments(firstRead, secondRead, correctOverlappingBaseQualities); } } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java index 793c45d619a..91c52a79263 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java @@ -16,7 +16,6 @@ import org.broadinstitute.hellbender.GATKBaseTest; import org.testng.Assert; -import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; public class AssemblyBasedCallerUtilsUnitTest extends GATKBaseTest { @@ -53,7 +52,7 @@ public void testfinalizeRegion() { activeRegion.addAll(reads); SampleList sampleList = SampleList.singletonSampleList("tumor"); Byte minbq = 9; - AssemblyBasedCallerUtils.finalizeRegion(activeRegion, false, false, minbq, header, sampleList); + AssemblyBasedCallerUtils.finalizeRegion(activeRegion, false, false, minbq, header, sampleList, false); // make sure reads are not changed due to finalizeRegion() Assert.assertTrue(reads.get(0).convertToSAMRecord(header).equals(orgRead0)); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2TestingUtils.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2TestingUtils.java index d1dbc640d38..8d59935faf0 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2TestingUtils.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2TestingUtils.java @@ -13,7 +13,7 @@ import java.util.List; public class M2TestingUtils { - private static final String DEFAULT_READ_GROUP_NAME = "READ_GROUP_1"; + public static final String DEFAULT_READ_GROUP_NAME = "READ_GROUP_1"; /** * * Reference at this position looks like this: @@ -32,25 +32,31 @@ public class M2TestingUtils { public static final byte[] DEFAULT_REF_BASES = "CATCACACTCACTAAGCACACAGAGAATAAT".getBytes(); // Bases for C->T SNP at position 100,000 * public static final byte[] DEFAULT_ALT_BASES = "CATCACACTTACTAAGCACACAGAGAATAAT".getBytes(); + public static final int DEFAULT_START_POSITION = 99_991; + public static final int DEFAULT_END_POSITION = 100_021; public static final int DEFAULT_SNP_POSITION = 100_000; + public static final int DEFAULT_READ_LENGTH = DEFAULT_REF_BASES.length; public static final String DEFAULT_SAMPLE_NAME = "sample1"; public static SAMFileGATKReadWriter getBareBonesSamWriter(final File samFile, final SAMFileHeader samHeader) { return new SAMFileGATKReadWriter(ReadUtils.createCommonSAMWriter(samFile, null, samHeader, true, true, false)); } - // TODO: might have to address read name collisions + public static byte[] getUniformBQArray(final byte quality, final int readLength){ + final byte[] quals = new byte[readLength]; + Arrays.fill(quals, quality); + return quals; + } + public static List createReads(final int numReads, final byte[] bases, final SAMFileHeader samHeader, - final byte baseQuality){ + final byte baseQuality, final String namePrefix){ final int readLength = bases.length; - final byte[] quals = new byte[readLength]; - Arrays.fill(quals, baseQuality); final List reads = new ArrayList<>(numReads); for (int i = 0; i < numReads; i++) { final String cigar = bases.length + "M"; - final GATKRead read = ArtificialReadUtils.createArtificialRead(samHeader, "Read" + i, DEFAULT_CHROM_INDEX, - DEFAULT_ALIGNMENT_START, bases, quals, cigar); + final GATKRead read = ArtificialReadUtils.createArtificialRead(samHeader, namePrefix + i, DEFAULT_CHROM_INDEX, + DEFAULT_ALIGNMENT_START, bases, getUniformBQArray(baseQuality, readLength), cigar); read.setReadGroup(DEFAULT_READ_GROUP_NAME); read.setMappingQuality(DEFAULT_MAPQ); read.setIsFirstOfPair(); @@ -62,7 +68,7 @@ public static List createReads(final int numReads, final byte[] bases, } public static List createReads(final int numReads, final byte[] bases, final SAMFileHeader samHeader){ - return createReads(numReads, bases, samHeader, DEFAULT_BASEQ); + return createReads(numReads, bases, samHeader, DEFAULT_BASEQ, "read"); } public static SAMFileHeader createSamHeader(){ diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java index ce8cc08407c..b13b1c57677 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java @@ -11,18 +11,23 @@ import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.Main; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.AssemblyRegionWalker; import org.broadinstitute.hellbender.engine.FeatureDataSource; import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; import org.broadinstitute.hellbender.tools.exome.orientationbiasvariantfilter.OrientationBiasUtils; +import org.broadinstitute.hellbender.tools.walkers.annotator.StrandBiasBySample; import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyBasedCallerArgumentCollection; import org.broadinstitute.hellbender.tools.walkers.validation.ConcordanceSummaryRecord; +import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils; import org.broadinstitute.hellbender.utils.MathUtils; import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.testng.Assert; +import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -583,11 +588,11 @@ public void testBaseQualityFilter() throws IOException { final byte poorQuality = 10; final byte goodQuality = 30; final int numReads = 20; - final List refReads = M2TestingUtils.createReads(numReads, M2TestingUtils.DEFAULT_REF_BASES, samHeader, poorQuality); - final List alt1Reads = M2TestingUtils.createReads(numReads, M2TestingUtils.DEFAULT_ALT_BASES, samHeader, goodQuality); + final List refReads = M2TestingUtils.createReads(numReads, M2TestingUtils.DEFAULT_REF_BASES, samHeader, poorQuality, "ref"); + final List altReads = M2TestingUtils.createReads(numReads, M2TestingUtils.DEFAULT_ALT_BASES, samHeader, goodQuality, "alt"); refReads.forEach(writer::addRead); - alt1Reads.forEach(writer::addRead); + altReads.forEach(writer::addRead); writer.close(); // closing the writer writes to the file // End creating sam file @@ -612,6 +617,75 @@ public void testBaseQualityFilter() throws IOException { Assert.assertFalse(vc.get().getFilters().contains(GATKVCFConstants.MEDIAN_BASE_QUALITY_FILTER_NAME)); } + public File createSamWithOverlappingReads(final int numAltPairs, final int refDepth) throws IOException { + final byte altQuality = 50; + final byte refQuality = 30; + final File samFile = File.createTempFile("liquid-biopsy", ".bam"); + final SAMFileHeader samHeader = M2TestingUtils.createSamHeader(); + final SAMFileGATKReadWriter writer = M2TestingUtils.getBareBonesSamWriter(samFile, samHeader); + + final List refReads = M2TestingUtils.createReads(refDepth, M2TestingUtils.DEFAULT_REF_BASES, samHeader, refQuality, "ref"); + refReads.forEach(writer::addRead); + for (int i = 0; i < numAltPairs; i++){ + // Create a read pair that completely overlap each other, which is not realistic but is easy to implement + // and captures the essence of the issue + final List overlappingPair = ArtificialReadUtils.createPair(samHeader, "alt" + i, M2TestingUtils.DEFAULT_READ_LENGTH, + M2TestingUtils.DEFAULT_START_POSITION, M2TestingUtils.DEFAULT_START_POSITION, true, false); + overlappingPair.forEach(read -> { + read.setReadGroup(M2TestingUtils.DEFAULT_READ_GROUP_NAME); + read.setMappingQuality(60); + read.setBases(M2TestingUtils.DEFAULT_ALT_BASES); + read.setBaseQualities(M2TestingUtils.getUniformBQArray(altQuality, M2TestingUtils.DEFAULT_READ_LENGTH)); + writer.addRead(read); + }); + } + + writer.close(); // closing the writer writes to the file + return samFile; + } + + + // Test that the strand bias annotaitons can count the number of reads, not fragments, when requested + @Test + public void testReadBasedAnnotations() throws IOException { + // Case 1: with the read correction we lose the variant - blood biopsy-like case + final int numAltPairs = 5; + final int depth = 100; + final int refDepth = depth - 2*numAltPairs; + final File samFileWithOverlappingReads = createSamWithOverlappingReads(numAltPairs, refDepth); + + final File unfilteredVcf = File.createTempFile("unfiltered", ".vcf"); + final File bamout = File.createTempFile("realigned", ".bam"); + final String[] args = makeCommandLineArgs(Arrays.asList( + "-R", hg19_chr1_1M_Reference, + "-I", samFileWithOverlappingReads.getAbsolutePath(), + "-tumor", M2TestingUtils.DEFAULT_SAMPLE_NAME, + "-O", unfilteredVcf.getAbsolutePath(), + "--bamout", bamout.getAbsolutePath(), + "--" + M2ArgumentCollection.ANNOTATE_BASED_ON_READS_LONG_NAME, "true", + "--annotation", StrandBiasBySample.class.getSimpleName(), + "--" + AssemblyRegionWalker.MAX_STARTS_LONG_NAME, String.valueOf(depth)), Mutect2.class.getSimpleName()); + new Main().instanceMain(args); + + final Optional vc = VariantContextTestUtils.streamVcf(unfilteredVcf).findAny(); + Assert.assertTrue(vc.isPresent()); + + // Test case 2: we lose strand artifact. Make sure to reproduce the error and so on + final Genotype g = vc.get().getGenotype(M2TestingUtils.DEFAULT_SAMPLE_NAME); + final int[] contingencyTable = GATKProtectedVariantContextUtils.getAttributeAsIntArray(g, GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY, () -> null, -1); + + final int REF_FWD_INDEX = 0; + final int REF_REV_INDEX = 1; + final int ALT_FWD_INDEX = 2; + final int ALT_REV_INDEX = 3; + Assert.assertEquals(contingencyTable[REF_FWD_INDEX], refDepth/2); + Assert.assertEquals(contingencyTable[REF_REV_INDEX], refDepth/2); + Assert.assertEquals(contingencyTable[ALT_FWD_INDEX], numAltPairs); + Assert.assertEquals(contingencyTable[ALT_REV_INDEX], numAltPairs); + + Assert.assertFalse(vc.get().getFilters().contains(GATKVCFConstants.STRAND_ARTIFACT_FILTER_NAME)); + } + private void doMutect2Test( final String inputBam, final String tumorSample, diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/readorientation/CollectF1R2CountsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/readorientation/CollectF1R2CountsIntegrationTest.java index 8a4ab5c80e8..18f7ee63658 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/readorientation/CollectF1R2CountsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/readorientation/CollectF1R2CountsIntegrationTest.java @@ -2,18 +2,13 @@ import com.google.common.collect.ImmutableMap; import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.Histogram; import htsjdk.samtools.util.IOUtil; import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.tools.walkers.mutect.M2TestingUtils; -import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList; -import org.broadinstitute.hellbender.utils.genotyper.SampleList; -import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; import org.broadinstitute.hellbender.utils.read.GATKRead; -import org.broadinstitute.hellbender.utils.read.ReadUtils; import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter; import org.testng.Assert; import org.testng.annotations.Test; @@ -21,7 +16,6 @@ import java.io.File; import java.io.IOException; import java.io.Reader; -import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Optional; @@ -195,8 +189,8 @@ private File createSyntheticSam(final int refDepth, final int altDepth) throws I // Ref Sequence: "CATCACACTCACTAAGCACACAGAGAATAAT".getBytes(); // SNPs positions: * * * * final byte[] altReadBases = "CATCACACTCTCTGACCAAACAGAGAATAAT".getBytes(); - final List refReads = M2TestingUtils.createReads(refDepth, M2TestingUtils.DEFAULT_REF_BASES, samHeader, (byte)30); - final List alt1Reads = M2TestingUtils.createReads(altDepth, altReadBases, samHeader, (byte)30); + final List refReads = M2TestingUtils.createReads(refDepth, M2TestingUtils.DEFAULT_REF_BASES, samHeader, (byte)30, "ref"); + final List alt1Reads = M2TestingUtils.createReads(altDepth, altReadBases, samHeader, (byte)30, "alt"); refReads.forEach(writer::addRead); alt1Reads.forEach(writer::addRead); writer.close(); // closing the writer writes to the file diff --git a/src/test/java/org/broadinstitute/hellbender/utils/fragments/FragmentUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/fragments/FragmentUtilsUnitTest.java index 89048994f0f..5577f70899d 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/fragments/FragmentUtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/fragments/FragmentUtilsUnitTest.java @@ -60,7 +60,7 @@ public Object[][] createAdjustFragmentsTest() throws Exception { @Test(dataProvider = "AdjustFragmentsTest") public void testAdjustingTwoReads(final GATKRead read1, final GATKRead read2, final int overlapSize) { - FragmentUtils.adjustQualsOfOverlappingPairedFragments(read1, read2); + FragmentUtils.adjustQualsOfOverlappingPairedFragments(read1, read2, true); for ( int i = 0; i < read1.getLength() - overlapSize; i++ ) { Assert.assertEquals(read1.getBaseQualities()[i], HIGH_QUALITY);