From 17e096d875ca924f9c415efb13a8f9b2cd97c659 Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Tue, 3 Aug 2021 10:55:49 -0400 Subject: [PATCH] Fixed bug in calculation of p-values for exact test of Hardy-Weinberg equilibrium and corresponding unit tests. --- .../tools/walkers/annotator/ExcessHet.java | 31 +-- .../GnarlyGenotyperEngine.java | 2 +- .../walkers/annotator/ExcessHetUnitTest.java | 235 ++++++++++-------- 3 files changed, 144 insertions(+), 124 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ExcessHet.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ExcessHet.java index 25ae84c47ca..e0e89f6233a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ExcessHet.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ExcessHet.java @@ -82,16 +82,16 @@ public Map annotate(final ReferenceContext ref, static Pair calculateEH(final VariantContext vc, final GenotypesContext genotypes) { final GenotypeCounts t = GenotypeUtils.computeDiploidGenotypeCounts(vc, genotypes, ROUND_GENOTYPE_COUNTS); // number of samples that have likelihoods - final int sampleCount = (int) genotypes.stream().filter(g->GenotypeUtils.isDiploidWithLikelihoods(g)).count(); + final int sampleCount = (int) genotypes.stream().filter(GenotypeUtils::isDiploidWithLikelihoods).count(); - return calculateEH(vc, t, sampleCount); + return calculateEH(t, sampleCount); } @VisibleForTesting - public static Pair calculateEH(final VariantContext vc, final GenotypeCounts t, final int sampleCount) { - final int refCount = (int)t.getRefs(); - final int hetCount = (int)t.getHets(); - final int homCount = (int)t.getHoms(); + public static Pair calculateEH(final GenotypeCounts t, final int sampleCount) { + final int refCount = (int) t.getRefs(); + final int hetCount = (int) t.getHets(); + final int homCount = (int) t.getHoms(); final double pval = exactTest(hetCount, refCount, homCount); @@ -137,11 +137,6 @@ static double exactTest(final int hetCount, final int refCount, final int homCou final int rareCopies = 2 * obsHomR + hetCount; final int N = hetCount + obsHomC + obsHomR; - //If the probability distribution has only 1 point, then the mid p-value is .5 - if (rareCopies <= 1) { - return .5; - } - final double[] probs = new double[rareCopies + 1]; //Find (something close to the) mode for the midpoint @@ -193,12 +188,12 @@ static double exactTest(final int hetCount, final int refCount, final int homCou currHomC = currHomC - 1; } - final double rightPval = probs[hetCount] / (2.0 * mysum); + final double rightPval = Math.min(1., probs[hetCount] / mysum); //Check if we observed the highest possible number of hets if (hetCount == rareCopies) { return rightPval; } - return rightPval + StatUtils.sum(Arrays.copyOfRange(probs, hetCount + 1, probs.length)) / mysum; + return Math.min(1., rightPval + StatUtils.sum(Arrays.copyOfRange(probs, hetCount + 1, probs.length)) / mysum); } @Override @@ -206,7 +201,6 @@ public List getKeyNames() { return Collections.singletonList(GATKVCFConstants.EXCESS_HET_KEY); } - //@Override public String getRawKeyName() { return GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY; } @@ -218,8 +212,7 @@ public String getRawKeyName() { * @param vc the variant context to annotate * @param likelihoods likelihoods indexed by sample, allele, and read within sample */ - //@Override - public Map annotateRawData(ReferenceContext ref, VariantContext vc, AlleleLikelihoods likelihoods) { + public static Map annotateRawData(ReferenceContext ref, VariantContext vc, AlleleLikelihoods likelihoods) { return null; } @@ -231,8 +224,7 @@ public Map annotateRawData(ReferenceContext ref, VariantContext * @param listOfRawData The raw data for all the variants being combined/merged * @return A key, Object map containing the annotations generated by this combine operation */ - //@Override - public Map combineRawData(List allelesList, List> listOfRawData) { + public static Map combineRawData(List allelesList, List> listOfRawData) { return null; } @@ -243,7 +235,6 @@ public Map combineRawData(List allelesList, List finalizeRawData(VariantContext vc, VariantContext originalVC) { if (vc.hasAttribute(getRawKeyName())) { List counts = vc.getAttributeAsIntList(getRawKeyName(), 0); @@ -251,7 +242,7 @@ public Map finalizeRawData(VariantContext vc, VariantContext ori throw new IllegalStateException("Genotype counts for ExcessHet (" + getRawKeyName() + ") should have three values: homozygous reference, heterozygous with one ref allele, and homozygous variant/heterozygous non-reference"); } final GenotypeCounts t = new GenotypeCounts(counts.get(0), counts.get(1), counts.get(2)); - final Pair sampleCountEH = calculateEH(vc, t, counts.get(0)+counts.get(1)+counts.get(2)); + final Pair sampleCountEH = calculateEH(t, counts.get(0)+counts.get(1)+counts.get(2)); final int sampleCount = sampleCountEH.getLeft(); final double eh = sampleCountEH.getRight(); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java index 60604a7ff2e..5c87cfaccc6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java @@ -225,7 +225,7 @@ else if (variant.hasAttribute(GATKVCFConstants.AS_RAW_QUAL_APPROX_KEY)) { final int refCount = Math.max(numCalledAlleles / 2 - gtCounts.get(1) - gtCounts.get(2), 0); //homRefs don't get counted properly because ref blocks aren't annotated gtCounts.set(0, refCount); - final Pair eh = ExcessHet.calculateEH(variant, new GenotypeCounts(gtCounts.get(0), gtCounts.get(1), gtCounts.get(2)), numCalledAlleles / 2); + final Pair eh = ExcessHet.calculateEH(new GenotypeCounts(gtCounts.get(0), gtCounts.get(1), gtCounts.get(2)), numCalledAlleles / 2); vcfBuilder.attribute(GATKVCFConstants.EXCESS_HET_KEY, String.format("%.4f", eh.getRight())); vcfBuilder.rmAttribute(GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY); if (annotationDBBuilder != null) { diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/annotator/ExcessHetUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/annotator/ExcessHetUnitTest.java index f4889d38764..db71e9b2289 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/annotator/ExcessHetUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/annotator/ExcessHetUnitTest.java @@ -1,6 +1,10 @@ package org.broadinstitute.hellbender.tools.walkers.annotator; -import htsjdk.variant.variantcontext.*; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; @@ -8,19 +12,25 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; -import static org.mockito.ArgumentMatchers.refEq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; public final class ExcessHetUnitTest extends GATKBaseTest { - private static double DELTA_PRECISION = .001; - private Allele Aref= Allele.create("A", true); - private Allele T = Allele.create("T"); - private Allele C = Allele.create("C"); - private int[] hetPLs = {240, 0, 240}; - private int[] homRefPLs= {0, 60, 600}; + private static final double DELTA_PRECISION = 1E-6; + private static final double DELTA_PRECISION_PARSE = 1E-3; + private static final Allele A_REF = Allele.create("A", true); + private static final Allele T = Allele.create("T"); + private static final Allele C = Allele.create("C"); + private static final int[] HET_PL_ARRAY = {240, 0, 240}; + private static final int[] HOM_REF_PL_ARRAY = {0, 60, 600}; @Override public String getToolTestDataDir() { @@ -28,13 +38,13 @@ public String getToolTestDataDir() { } - private Genotype makeG(String sample, Allele a1, Allele a2, int... pls) { + private static Genotype makeG(final String sample, final Allele a1, final Allele a2, final int... pls) { return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).PL(pls).make(); } - private VariantContext makeVC(String source, List alleles, Genotype... genotypes) { - int start = 10; - int stop = start; // alleles.contains(ATC) ? start + 3 : start; + private static VariantContext makeVC(final String source, final List alleles, final Genotype... genotypes) { + final int start = 10; + final int stop = start; // alleles.contains(ATC) ? start + 3 : start; return new VariantContextBuilder(source, "1", start, stop, alleles) .genotypes(Arrays.asList(genotypes)) .unfiltered() @@ -42,125 +52,125 @@ private VariantContext makeVC(String source, List alleles, Genotype... g } @Test - public void testExcessHetForMultiallelicVC_compondHets() { + public void testExcessHetForMultiallelicVCCompoundHets() { //make sure that compound gets (with no ref) don't add to het count - VariantContext test1 = makeVC("1", Arrays.asList(Aref, T, C), - makeG("s1", Aref, T, 2530, 0, 7099, 366, 3056, 14931), + VariantContext test1 = makeVC("1", Arrays.asList(A_REF, T, C), + makeG("s1", A_REF, T, 2530, 0, 7099, 366, 3056, 14931), makeG("s2", T, T, 7099, 2530, 0, 7099, 366, 3056), makeG("s3", T, C, 7099, 2530, 7099, 3056, 0, 14931), - makeG("s4", Aref, T, 2530, 0, 7099, 366, 3056, 14931), + makeG("s4", A_REF, T, 2530, 0, 7099, 366, 3056, 14931), makeG("s5", T, T, 7099, 2530, 0, 7099, 366, 3056), - makeG("s6", Aref, T, 2530, 0, 7099, 366, 3056, 14931), + makeG("s6", A_REF, T, 2530, 0, 7099, 366, 3056, 14931), makeG("s7", T, T, 7099, 2530, 0, 7099, 366, 3056), - makeG("s8", Aref, T, 2530, 0, 7099, 366, 3056, 14931), + makeG("s8", A_REF, T, 2530, 0, 7099, 366, 3056, 14931), makeG("s9", T, T, 7099, 2530, 0, 7099, 366, 3056), - makeG("s10", Aref, T, 2530, 0, 7099, 366, 3056, 14931)); + makeG("s10", A_REF, T, 2530, 0, 7099, 366, 3056, 14931)); final double result = ExcessHet.calculateEH(test1, test1.getGenotypes()).getValue(); - Assert.assertEquals(result, 5.85, DELTA_PRECISION, "Pass"); + final double expected = 2.8389324060524004; // calculated using the python implementation at https://github.com/broadinstitute/gatk/issues/7392#issue-959501711 + Assert.assertEquals(result, expected, DELTA_PRECISION, "Pass"); } @Test - public void testExcessHetForMultiallelicVC_compondHetsRefAltFlip() { + public void testExcessHetForMultiallelicVCCompoundHetsRefAltFlip() { //make sure that compound gets (with no ref) don't add to het count - VariantContext test1 = makeVC("1", Arrays.asList(Aref, T, C), - makeG("s1", Aref, T, 2530, 0, 7099, 366, 3056, 14931), + VariantContext test1 = makeVC("1", Arrays.asList(A_REF, T, C), + makeG("s1", A_REF, T, 2530, 0, 7099, 366, 3056, 14931), makeG("s2", T, T, 7099, 2530, 0, 7099, 366, 3056), makeG("s3", T, C, 7099, 2530, 7099, 3056, 0, 14931), - makeG("s4", T, Aref, 2530, 0, 7099, 366, 3056, 14931), + makeG("s4", T, A_REF, 2530, 0, 7099, 366, 3056, 14931), makeG("s5", T, T, 7099, 2530, 0, 7099, 366, 3056), - makeG("s6", Aref, T, 2530, 0, 7099, 366, 3056, 14931), + makeG("s6", A_REF, T, 2530, 0, 7099, 366, 3056, 14931), makeG("s7", T, T, 7099, 2530, 0, 7099, 366, 3056), - makeG("s8", Aref, T, 2530, 0, 7099, 366, 3056, 14931), + makeG("s8", A_REF, T, 2530, 0, 7099, 366, 3056, 14931), makeG("s9", T, T, 7099, 2530, 0, 7099, 366, 3056), - makeG("s10", Aref, T, 2530, 0, 7099, 366, 3056, 14931)); + makeG("s10", A_REF, T, 2530, 0, 7099, 366, 3056, 14931)); final double result = ExcessHet.calculateEH(test1, test1.getGenotypes()).getValue(); - Assert.assertEquals(result, 5.85, DELTA_PRECISION, "Pass"); + final double expected = 2.8389324060524004; // calculated using the python implementation at https://github.com/broadinstitute/gatk/issues/7392#issue-959501711 + Assert.assertEquals(result, expected, DELTA_PRECISION, "Pass"); } @Test - public void testExcessHetForMultiallelicVC_differentAlts() { + public void testExcessHetForMultiallelicVCDifferentAlts() { //make sure that hets with different alternate alleles all get counted - VariantContext test2 = makeVC("2", Arrays.asList(Aref, T, C), - makeG("s1", Aref, C, 4878, 1623, 11297, 0, 7970, 8847), - makeG("s2", Aref, T, 2530, 0, 7099, 366, 3056, 14931), - makeG("s3", Aref, T, 3382, 0, 6364, 1817, 5867, 12246), - makeG("s4", Aref, T, 2488, 0, 9110, 3131, 9374, 12505), - makeG("s5", Aref, C, 4530, 2006, 18875, 0, 6847, 23949), - makeG("s6", Aref, T, 5325, 0, 18692, 389, 16014, 24570), - makeG("s7", Aref, T, 2936, 0, 29743, 499, 21979, 38630), - makeG("s8", Aref, T, 6902, 0, 8976, 45, 5844, 9061), - makeG("s9", Aref, T, 5732, 0, 10876, 6394, 11408, 17802), - makeG("s10", Aref, T, 2780, 0, 25045, 824, 23330, 30939)); - - final double result2 = ExcessHet.calculateEH(test2, test2.getGenotypes()).getValue(); - final double result = 25.573; - Assert.assertEquals(result2, result, DELTA_PRECISION, "Pass"); + VariantContext test = makeVC("2", Arrays.asList(A_REF, T, C), + makeG("s1", A_REF, C, 4878, 1623, 11297, 0, 7970, 8847), + makeG("s2", A_REF, T, 2530, 0, 7099, 366, 3056, 14931), + makeG("s3", A_REF, T, 3382, 0, 6364, 1817, 5867, 12246), + makeG("s4", A_REF, T, 2488, 0, 9110, 3131, 9374, 12505), + makeG("s5", A_REF, C, 4530, 2006, 18875, 0, 6847, 23949), + makeG("s6", A_REF, T, 5325, 0, 18692, 389, 16014, 24570), + makeG("s7", A_REF, T, 2936, 0, 29743, 499, 21979, 38630), + makeG("s8", A_REF, T, 6902, 0, 8976, 45, 5844, 9061), + makeG("s9", A_REF, T, 5732, 0, 10876, 6394, 11408, 17802), + makeG("s10", A_REF, T, 2780, 0, 25045, 824, 23330, 30939)); + + final double result = ExcessHet.calculateEH(test, test.getGenotypes()).getValue(); + final double expected = 22.562985944843152; // calculated using the python implementation at https://github.com/broadinstitute/gatk/issues/7392#issue-959501711 + Assert.assertEquals(result, expected, DELTA_PRECISION, "Pass"); //test the annotate method - final Map annots = new ExcessHet().annotate(null, test2, null); + final Map annots = new ExcessHet().annotate(null, test, null); Assert.assertEquals(annots.keySet(), Collections.singleton(GATKVCFConstants.EXCESS_HET_KEY), "annots"); Assert.assertEquals(annots.values().size(), 1, "size"); - Assert.assertEquals(Double.parseDouble((String)annots.values().iterator().next()), result, DELTA_PRECISION, "het"); + Assert.assertEquals(Double.parseDouble((String) annots.values().iterator().next()), expected, DELTA_PRECISION_PARSE, "het"); } @Test public void testFounderIDsAndPedigreeFile() { //make sure that hets with different alternate alleles all get counted - VariantContext test2 = makeVC("2", Arrays.asList(Aref, T, C), - makeG("s1", Aref, C, 4878, 1623, 11297, 0, 7970, 8847), - makeG("s2", Aref, T, 2530, 0, 7099, 366, 3056, 14931), - makeG("s3", Aref, T, 3382, 0, 6364, 1817, 5867, 12246), - makeG("s4", Aref, T, 2488, 0, 9110, 3131, 9374, 12505), - makeG("s5", Aref, C, 4530, 2006, 18875, 0, 6847, 23949), - makeG("s6", Aref, T, 5325, 0, 18692, 389, 16014, 24570), - makeG("s7", Aref, T, 2936, 0, 29743, 499, 21979, 38630), - makeG("s8", Aref, T, 6902, 0, 8976, 45, 5844, 9061), - makeG("s9", Aref, T, 5732, 0, 10876, 6394, 11408, 17802), - makeG("s10", Aref, T, 2780, 0, 25045, 824, 23330, 30939)); - - Set founderIDs = new HashSet(); - founderIDs.addAll(Arrays.asList("s1","s2","s3","s4","s5")); - - final double result2 = ExcessHet.calculateEH(test2, test2.getGenotypes(founderIDs)).getValue(); - final double result = 11.972; - Assert.assertEquals(result2, result, DELTA_PRECISION, "Pass"); + final VariantContext test = makeVC("2", Arrays.asList(A_REF, T, C), + makeG("s1", A_REF, C, 4878, 1623, 11297, 0, 7970, 8847), + makeG("s2", A_REF, T, 2530, 0, 7099, 366, 3056, 14931), + makeG("s3", A_REF, T, 3382, 0, 6364, 1817, 5867, 12246), + makeG("s4", A_REF, T, 2488, 0, 9110, 3131, 9374, 12505), + makeG("s5", A_REF, C, 4530, 2006, 18875, 0, 6847, 23949), + makeG("s6", A_REF, T, 5325, 0, 18692, 389, 16014, 24570), + makeG("s7", A_REF, T, 2936, 0, 29743, 499, 21979, 38630), + makeG("s8", A_REF, T, 6902, 0, 8976, 45, 5844, 9061), + makeG("s9", A_REF, T, 5732, 0, 10876, 6394, 11408, 17802), + makeG("s10", A_REF, T, 2780, 0, 25045, 824, 23330, 30939)); + + final Set founderIDs = new HashSet<>(Arrays.asList("s1", "s2", "s3", "s4", "s5")); + + final double result = ExcessHet.calculateEH(test, test.getGenotypes(founderIDs)).getValue(); + final double expected = 8.96250562461638; // calculated using the python implementation at https://github.com/broadinstitute/gatk/issues/7392#issue-959501711 + Assert.assertEquals(result, expected, DELTA_PRECISION, "Pass"); //test the annotate method with FounderIDs - Map annots = new ExcessHet(founderIDs).annotate(null, test2, null); + final Map annots = new ExcessHet(founderIDs).annotate(null, test, null); Assert.assertEquals(annots.keySet(), Collections.singleton(GATKVCFConstants.EXCESS_HET_KEY), "annots"); Assert.assertEquals(annots.values().size(), 1, "size"); - Assert.assertEquals(Double.parseDouble((String)annots.values().iterator().next()), result, DELTA_PRECISION, "het"); + Assert.assertEquals(Double.parseDouble((String) annots.values().iterator().next()), expected, DELTA_PRECISION_PARSE, "het"); //test the annotate method with a Pedigree File - annots = new ExcessHet(getTestFileGATKPath("testPedigree.ped")).annotate(null, test2, null); - Assert.assertEquals(annots.keySet(), Collections.singleton(GATKVCFConstants.EXCESS_HET_KEY), "annots"); - Assert.assertEquals(annots.values().size(), 1, "size"); - Assert.assertEquals(Double.parseDouble((String)annots.values().iterator().next()), result, DELTA_PRECISION, "het"); + final Map annotsPedigree = new ExcessHet(getTestFileGATKPath("testPedigree.ped")).annotate(null, test, null); + Assert.assertEquals(annotsPedigree.keySet(), Collections.singleton(GATKVCFConstants.EXCESS_HET_KEY), "annots"); + Assert.assertEquals(annotsPedigree.values().size(), 1, "size"); + Assert.assertEquals(Double.parseDouble((String) annotsPedigree.values().iterator().next()), expected, DELTA_PRECISION_PARSE, "het"); } @Test public void testSingletonVsCommonAllele() { - final List allGTs = new ArrayList<>(); final int numHomRefGTs = 10000; for (int i = 0; i < numHomRefGTs; i++) { - allGTs.add(makeG("ref" + i, Aref, Aref, homRefPLs)); + allGTs.add(makeG("ref" + i, A_REF, A_REF, HOM_REF_PL_ARRAY)); } - allGTs.add(makeG("het0", Aref, T, hetPLs)); + allGTs.add(makeG("het0", A_REF, T, HET_PL_ARRAY)); int numHetGTs = 1; - final VariantContext singleton = makeVC("singleton", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); + final VariantContext singleton = makeVC("singleton", Arrays.asList(A_REF, T), allGTs.toArray(new Genotype[allGTs.size()])); final double singletonValue = ExcessHet.calculateEH(singleton, singleton.getGenotypes()).getValue(); final int targetNumHetGTs = 20; for (int i = numHetGTs; i < targetNumHetGTs; i++) { - allGTs.add(makeG("het" + i, Aref, T, hetPLs)); + allGTs.add(makeG("het" + i, A_REF, T, HET_PL_ARRAY)); } - final VariantContext common = makeVC("common", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); + final VariantContext common = makeVC("common", Arrays.asList(A_REF, T), allGTs.toArray(new Genotype[allGTs.size()])); final double EHcommon = ExcessHet.calculateEH(common, common.getGenotypes()).getValue(); Assert.assertTrue(Math.abs(singletonValue) < Math.abs(EHcommon), String.format("singleton=%f common=%f", singletonValue, EHcommon)); @@ -168,33 +178,32 @@ public void testSingletonVsCommonAllele() { @Test public void testLargeCohorts() { - final List allGTs = new ArrayList<>(); final int numHomRefGTs = 1000000; for (int i = 0; i < numHomRefGTs; i++) { - allGTs.add(makeG("ref" + i, Aref, Aref, homRefPLs)); + allGTs.add(makeG("ref" + i, A_REF, A_REF, HOM_REF_PL_ARRAY)); } - allGTs.add(makeG("het0", Aref, T, hetPLs)); + allGTs.add(makeG("het0", A_REF, T, HET_PL_ARRAY)); int numHetGTs = 1; - final VariantContext singleton = makeVC("singleton", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); + final VariantContext singleton = makeVC("singleton", Arrays.asList(A_REF, T), allGTs.toArray(new Genotype[allGTs.size()])); final double singletonValue = ExcessHet.calculateEH(singleton, singleton.getGenotypes()).getValue(); for (int i = numHetGTs; i < 100; i++) { - allGTs.add(makeG("het" + i, Aref, T, hetPLs)); + allGTs.add(makeG("het" + i, A_REF, T, HET_PL_ARRAY)); numHetGTs++; } - final VariantContext hundredton = makeVC("hundredton", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); + final VariantContext hundredton = makeVC("hundredton", Arrays.asList(A_REF, T), allGTs.toArray(new Genotype[allGTs.size()])); final double hundredtonValue = ExcessHet.calculateEH(hundredton, hundredton.getGenotypes()).getValue(); Assert.assertTrue(Math.abs(singletonValue) < Math.abs(hundredtonValue), String.format("singleton=%f hundredton=%f", singletonValue, hundredtonValue)); for (int i = numHetGTs; i < numHomRefGTs; i++) - allGTs.add(makeG("het" + i, Aref, T, hetPLs)); + allGTs.add(makeG("het" + i, A_REF, T, HET_PL_ARRAY)); - final VariantContext common = makeVC("common", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); + final VariantContext common = makeVC("common", Arrays.asList(A_REF, T), allGTs.toArray(new Genotype[allGTs.size()])); final double commonValue = ExcessHet.calculateEH(common, common.getGenotypes()).getValue(); Assert.assertTrue(Math.abs(hundredtonValue) < Math.abs(commonValue), String.format("hundredton=%f common=%f", hundredtonValue, commonValue)); @@ -202,52 +211,73 @@ public void testLargeCohorts() { @Test public void testAllHetsForLargeCohorts() { - final int numGTs = 1000000; final List singletonGTs = new ArrayList<>(); for (int i = 0; i < numGTs; i++) { - singletonGTs.add(makeG("ref" + i, Aref, Aref, homRefPLs)); + singletonGTs.add(makeG("ref" + i, A_REF, A_REF, HOM_REF_PL_ARRAY)); } - singletonGTs.add(makeG("het0", Aref, T, hetPLs)); + singletonGTs.add(makeG("het0", A_REF, T, HET_PL_ARRAY)); - final VariantContext singleton = makeVC("singleton", Arrays.asList(Aref, T), singletonGTs.toArray(new Genotype[singletonGTs.size()])); + final VariantContext singleton = makeVC("singleton", Arrays.asList(A_REF, T), singletonGTs.toArray(new Genotype[singletonGTs.size()])); final double singletonValue = ExcessHet.calculateEH(singleton, singleton.getGenotypes()).getValue(); final List allHetGTs = new ArrayList<>(); for (int i = 0; i < numGTs; i++) { - allHetGTs.add(makeG("het" + i, Aref, T, hetPLs)); + allHetGTs.add(makeG("het" + i, A_REF, T, HET_PL_ARRAY)); } - final VariantContext allHet = makeVC("allHet", Arrays.asList(Aref, T), allHetGTs.toArray(new Genotype[allHetGTs.size()])); + final VariantContext allHet = makeVC("allHet", Arrays.asList(A_REF, T), allHetGTs.toArray(new Genotype[allHetGTs.size()])); final double hetsValue = ExcessHet.calculateEH(allHet, allHet.getGenotypes()).getValue(); Assert.assertTrue(Math.abs(singletonValue) < Math.abs(hetsValue), String.format("singleton=%f allHets=%f", singletonValue, hetsValue)); //Since all hets is such an extreme case and the sample size is large here, we know that the p-value should be 0 - Assert.assertEquals(hetsValue, ExcessHet.PHRED_SCALED_MIN_P_VALUE, DELTA_PRECISION, String.format("P-value of 0 should be phred scaled to " + ExcessHet.PHRED_SCALED_MIN_P_VALUE)); + Assert.assertEquals(hetsValue, ExcessHet.PHRED_SCALED_MIN_P_VALUE, DELTA_PRECISION, "P-value of 0 should be phred scaled to " + ExcessHet.PHRED_SCALED_MIN_P_VALUE); } @DataProvider(name = "smallSets") public Object[][] counts() { return new Object[][]{ - {1, 0, 0, 0.5}, - {1, 1, 0, 0.5}, - {1, 1, 1, 0.7}, - {4, 0, 0, 0.114}, - {2, 1, 1, 0.571}, - {0, 2, 2, 0.957}, - {1, 1, 40, 0.982}, - {3, 0, 39, 0.482}, + {1, 0, 0, 1.}, + {1, 1, 0, 1.}, + // The following test cases are taken from Table 1 of Wigginton et al. + // at https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1199378/. + // Results were calculated using the python implementation at + // https://github.com/broadinstitute/gatk/issues/7392#issue-959501711, + // which explicitly calculates and sums the likelihoods rather than using the recurrence relation, + // and checked against the values in the table. See further comments in the issue. + {5, 87, 8, 0.9999999998661845}, + {17, 81, 2, 0.9304235455950692}, + {21, 79, 0, 0.3096036754839548}, + // The following test cases were also generated using the python implementation. + {707, 359, 9, 0.}, + {804, 599, 70, 0.}, + {684, 559, 629, 1.}, + {192, 835, 763, 1.}, + {723, 277, 754, 0.9999982776347127}, // for testing to DELTA_PRECISION = 1E-6 + {537, 845, 72, 0.14763417247333427}, + {847, 431, 448, 0.7957628300988617}, + {659, 147, 910, 0.9664916975760656}, + {633, 938, 84, 0.04808183902489469}, + {881, 690, 292, 0.6731260236412528}, + {554, 371, 184, 0.1930042049715512}, + {552, 438, 207, 0.9367121245319978}, + {886, 812, 216, 0.1488013326311986}, + {537, 845, 72, 0.14763417247333427}, + {8700, 3186, 5918, 0.46111947799744585} }; } @Test(dataProvider = "smallSets") - public void smallSets(int hetCount, int homrefCount, int homvarCount, double expected) { + public void smallSets(final int hetCount, final int homrefCount, final int homvarCount, final double expected) { final double actual = ExcessHet.exactTest(hetCount, homrefCount, homvarCount); Assert.assertEquals(actual, expected, DELTA_PRECISION, "Pass"); + // we also test swapping homvarCount <-> homrefCount, which should be symmetric + final double actualSymmetric = ExcessHet.exactTest(hetCount, homvarCount, homrefCount); + Assert.assertEquals(actualSymmetric, expected, DELTA_PRECISION, "Pass"); } @DataProvider(name = "illegalArgsForExactTest") @@ -271,10 +301,9 @@ public void testLabels(){ } @Test - public void testEmptyIfNoGenotypes() throws Exception { + public void testEmptyIfNoGenotypes() { final ExcessHet ann = new ExcessHet(); final Map annotate = ann.annotate(null, when(mock(VariantContext.class).getGenotypesOrderedByName()).thenReturn(Collections.emptyList()).getMock(), null); Assert.assertTrue(annotate.isEmpty()); } - }