From 5c32785bc90e59aa7c5336e9570a7d4fb4a357d0 Mon Sep 17 00:00:00 2001 From: Ilya Soifer Date: Thu, 2 May 2024 08:36:37 +0300 Subject: [PATCH] Bug fix in flow allele filtering (#8775) * Fixed a bug that prevented filtering by SOR in many cases --- .gitignore | 1 + .../haplotypecaller/AlleleFiltering.java | 13 ++- .../AlleleFilteringUnitTest.java | 94 +++++++++++++++++++ 3 files changed, 104 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index e95e5ab6094..a73810708a6 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ funcotator_tmp #Test generated dot files test*.dot +.vscode/ diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java index ebc0f61d3f6..3a350228128 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java @@ -29,6 +29,8 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; +import com.google.common.annotations.VisibleForTesting; + /** * Filtering haplotypes that contribute weak alleles to the genotyping. * @@ -278,7 +280,8 @@ private AlleleLikelihoods subsetHaplotypesByAlleles(final A * @param sorThreshold only variants with SOR above threshold will be considered * @return list of alleles that can be removed */ - private List identifyBadAlleles(final List collectedRPLs, final List collectedSORs, + @VisibleForTesting + List identifyBadAlleles(final List collectedRPLs, final List collectedSORs, final List alleles, final double qualThreshold, final double sorThreshold) { @@ -303,9 +306,11 @@ private List identifyBadAlleles(final List collectedRPLs, final //we then add alleles with high SOR. Note that amongh all allleles with the SOR higher than the SOR_THRESHOLD //we will first filter the one with the lowest QUAL. logger.debug(() -> String.format("SHA:: Have %d candidates with low QUAL", rplCount)); - for (int i = sorIndices.length-1 ; (i >= 0) && (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD) ; i--) { - if (!result.contains(alleles.get(sorIndices[i]))) { - result.add(alleles.get(sorIndices[i])); + for (int i = sorIndices.length-1 ; (i >= 0) ; i--) { + if (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD){ + if (!result.contains(alleles.get(sorIndices[i]))) { + result.add(alleles.get(sorIndices[i])); + } } } logger.debug(() -> String.format("SHA:: Have %d candidates with high SOR", result.size() - rplCount)); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringUnitTest.java index c57dcc343c0..5d91a3ea918 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringUnitTest.java @@ -1,12 +1,15 @@ package org.broadinstitute.hellbender.tools.walkers.haplotypecaller; +import com.google.common.collect.ImmutableList; import htsjdk.samtools.TextCigarCodec; +import htsjdk.variant.variantcontext.Allele; import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AlleleFiltering; import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AlleleFilteringHC; import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerArgumentCollection; import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerGenotypingEngine; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.genotyper.*; +import org.broadinstitute.hellbender.utils.haplotype.Event; import org.broadinstitute.hellbender.utils.haplotype.EventMap; import org.broadinstitute.hellbender.utils.haplotype.Haplotype; import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; @@ -298,5 +301,96 @@ public void testNotFilterLoneWeakAllele(){ } + @Test //check that we filter strong allele with high SOR + public void testFilterDistantHindelSor() { + // create haplotypes + List haplotypeList = new ArrayList<>(); + final byte[] fullReferenceWithPadding = "CAGGCATG".getBytes(); + Haplotype haplotype = new Haplotype(fullReferenceWithPadding, true, 0, TextCigarCodec.decode("8M")); + haplotype.setGenomeLocation(new SimpleInterval("chr", 100, 108)); + haplotype.setEventMap(EventMap.fromHaplotype(haplotype, fullReferenceWithPadding, 0)); + haplotypeList.add(haplotype); + + haplotype = new Haplotype("CAGGCATTG".getBytes(), false, 0, TextCigarCodec.decode("7M1I1M")); + haplotype.setGenomeLocation(new SimpleInterval("chr", 100, 109)); + + haplotype.setEventMap(EventMap.fromHaplotype(haplotype, fullReferenceWithPadding, 0)); + haplotypeList.add(haplotype); + haplotype = new Haplotype("CAGGCATTTG".getBytes(), false, 0, TextCigarCodec.decode("7M2I1M")); + haplotype.setGenomeLocation(new SimpleInterval("chr", 100, 110)); + + haplotype.setEventMap(EventMap.fromHaplotype(haplotype, fullReferenceWithPadding, 0)); + haplotypeList.add(haplotype); + + AlleleList haplotypes = new IndexedAlleleList<>(haplotypeList); + SampleList samples = new IndexedSampleList(Arrays.asList("sm1")); + + List readList = new ArrayList<>(30); + Map> ebs = new HashMap<>(); + ebs.put("sm1", readList); + + for (int i = 0; i < 40; i++) { + readList.add(ArtificialReadUtils.createArtificialRead("20M")); + } + + + double[][] values = { + { 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, + 0, 3, 0, 3, 0, + 3 }, + { 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, + 3, 0, 3, 0, 3, + 0 }, + { 10, 0, 0, 0, 10, 0, 0, 0, 10, 0, 0, 0, 10, 0, 0, 0, 10, 0, 0, 0, 10, 0, 0, 0, 10, 0, 0, 0, 10, 0, 0,0,10,0,0,0,10,0,0,0} + }; + for (int i = 0; i < 40; i++) { + if (i % 4 == 0) { + readList.get(i).setIsReverseStrand(true); + } + } + + AlleleLikelihoods lks = new AlleleLikelihoods<>(samples, haplotypes, ebs); + LikelihoodMatrix lkm = lks.sampleMatrix(0); + for (int i = 0; i < lks.numberOfAlleles(); i++) { + for (int j = 0; j < lkm.evidenceCount(); j++) { + lkm.set(i, j, values[i][j]); + } + } + + HaplotypeCallerArgumentCollection hcArgs = new HaplotypeCallerArgumentCollection(); + HaplotypeCallerGenotypingEngine genotypingEngine = new HaplotypeCallerGenotypingEngine(hcArgs, samples, + !hcArgs.doNotRunPhysicalPhasing, false); + + AlleleFiltering alleleFiltering = new AlleleFilteringHC(hcArgs, null, genotypingEngine); + AlleleLikelihoods filtered_lks = alleleFiltering.filterAlleles(lks, 0, new HashSet<>()); + Assert.assertEquals(filtered_lks.alleles(), haplotypeList.subList(0, 2)); + } + + @Test + public void testIdentifyBadAlleles(){ + Event a = new Event("chr1", 10, Allele.create("A",true), Allele.create("T", false)); + Event b = new Event("chr1", 10, Allele.create("T",true), Allele.create("G", false)); + Event c = new Event("chr1", 10, Allele.create("C", true), Allele.create("G", false)); + + List events = List.of(a,b,c); + List rpls = List.of(10,20,0); + List sors = List.of(0.0,1.0,3.5); + HaplotypeCallerGenotypingEngine ge = new HaplotypeCallerGenotypingEngine(new HaplotypeCallerArgumentCollection(), + SampleList.singletonSampleList("test"), false, false); + AlleleFiltering af = new AlleleFilteringHC(null, null,ge); + List badAlleles = af.identifyBadAlleles(rpls, sors, events, 30, 3); + Assert.assertEquals(badAlleles, List.of(b, a, c)); + rpls = List.of(-100, -200, 0); + sors = List.of(0.0,1.0,3.5); + badAlleles = af.identifyBadAlleles(rpls, sors, events, 30, 3); + Assert.assertEquals(badAlleles, List.of(c)); + + rpls = List.of(-100, -200, -300); + sors = List.of(0.0,1.0,3.5); + badAlleles = af.identifyBadAlleles(rpls, sors, events, 30, 3); + Assert.assertEquals(badAlleles, List.of(c)); + + + } }