Skip to content

Commit

Permalink
Added regularization to covariance in GMM maximization step to fix co…
Browse files Browse the repository at this point in the history
…nvergence issues in VQSR. (#7709)

* Added regularization to covariance in GMM maximization step to fix convergence issues in VQSR.

* Fixed exact-match tests.

* Changed disabled test for convergence failure to regression test.

* Added note about updating GATK3 results.
  • Loading branch information
samuelklee authored Mar 10, 2022
1 parent 241d13a commit b14f810
Show file tree
Hide file tree
Showing 23 changed files with 66 additions and 55 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ class MultivariateGaussian {
private Matrix cachedSigmaInverse;
final private double[] pVarInGaussian;
int pVarInGaussianIndex;
final private static double EPSILON = 1e-200;
private static final double EPSILON = 1e-200;
private static final double COVARIANCE_REGULARIZATION_EPSILON = 1E-6;

public MultivariateGaussian( final int numVariants, final int numAnnotations ) {
mu = new double[numAnnotations];
Expand Down Expand Up @@ -187,9 +188,9 @@ public void maximizeGaussian(final List<VariantDatum> data, final double[] empir
for( final VariantDatum datum : data ) {
final double prob = pVarInGaussian[datumIndex++];
for( int iii = 0; iii < mu.length; iii++ ) {
double deltaMu = prob * (datum.annotations[iii]-mu[iii]);
for( int jjj = 0; jjj < mu.length; jjj++ ) {
pVarSigma.set(iii, jjj, deltaMu * (datum.annotations[jjj]-mu[jjj]));
final double regCovar = iii == jjj ? COVARIANCE_REGULARIZATION_EPSILON : 0.;
pVarSigma.set(iii, jjj, prob * (datum.annotations[iii]-mu[iii]) * (datum.annotations[jjj]-mu[jjj]) + regCovar);
}
}
sigma.plusEquals( pVarSigma );
Expand Down Expand Up @@ -228,7 +229,8 @@ public void evaluateFinalModelParameters( final List<VariantDatum> data ) {
final double prob = pVarInGaussian[datumIndex++];
for( int iii = 0; iii < mu.length; iii++ ) {
for( int jjj = 0; jjj < mu.length; jjj++ ) {
pVarSigma.set(iii, jjj, prob * (datum.annotations[iii]-mu[iii]) * (datum.annotations[jjj]-mu[jjj]));
final double regCovar = iii == jjj ? COVARIANCE_REGULARIZATION_EPSILON : 0.;
pVarSigma.set(iii, jjj, prob * (datum.annotations[iii]-mu[iii]) * (datum.annotations[jjj]-mu[jjj]) + regCovar);
}
}
sigma.plusEquals( pVarSigma );
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public class GatherTranchesIntegrationTest extends CommandLineProgramTest {
@Test
public void testCombine2Shards() throws Exception {
final File recal1 = new File(testDir + "snpTranches.scattered.txt"); //this is the output of VariantRecalibratorIntegrationTest.testVariantRecalibratorSNPscattered
final File recal2 = new File(testDir + "snpTranches.scattered.2.txt"); //this is the output of running the equivalent commandline as above outside of the integration test :-/
final File recal2 = new File(testDir + "snpTranches.scattered.2.txt"); //this is a copy of the above

final File recal_original = new File(testDir + "expected/snpTranches.gathered.txt");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
* from the command line, in order to ensure that the initial state of the random number generator is the same
* between versions. In all cases, the variants in the expected files are identical to those produced by GATK3,
* though the VCF headers were hand modified to account for small differences in the metadata lines.
*
* UPDATE: The expected results from GATK3 were updated in https://github.com/broadinstitute/gatk/pull/7709.
* However, we left the original comments referencing GATK3 below untouched.
*/
public class VariantRecalibratorIntegrationTest extends CommandLineProgramTest {

Expand Down Expand Up @@ -496,8 +499,14 @@ public void testVariantRecalibratorSNPscattered(final String[] params) throws IO
doSNPTest(params, getLargeVQSRTestDataDir() + "/snpTranches.scattered.txt", getLargeVQSRTestDataDir() + "snpRecal.vcf"); //tranches file isn't in the expected/ directory because it's input to GatherTranchesIntegrationTest
}

//One of the Gaussians has a covariance matrix with a determinant of zero, (can be confirmed that the entries of sigma for the row and column with the index of the constant annotation are zero) which leads to all +Inf LODs if we don't throw
@Test(expectedExceptions={UserException.VQSRPositiveModelFailure.class})
// One of the Gaussians has a covariance matrix with a determinant of zero,
// (can be confirmed that the entries of sigma for the row and column with the index of the constant annotation are zero)
// which leads to all +Inf LODs if we don't throw.
//
// UPDATE: Originally, this test checked for expectedExceptions = {UserException.VQSRPositiveModelFailure.class}.
// However, the convergence failure was fixed in https://github.com/broadinstitute/gatk/pull/7709,
// so we now expect the test to complete and can instead treat it as a regression test.
@Test
public void testAnnotationsWithNoVarianceSpecified() throws IOException {
// use an ArrayList - ArgumentBuilder tokenizes using the "=" in the resource args
List<String> args = new ArrayList<>(alleleSpecificVQSRParamsTooManyAnnotations.length);
Expand All @@ -511,7 +520,7 @@ public void testAnnotationsWithNoVarianceSpecified() throws IOException {
Assert.assertEquals(varRecalTool.instanceMain(args.toArray(new String[args.size()])), true);
}

@Test(expectedExceptions={UserException.VQSRNegativeModelFailure.class})
@Test(expectedExceptions = {UserException.VQSRNegativeModelFailure.class})
public void testNoNegativeTrainingData() throws IOException {
// use an ArrayList - ArgumentBuilder tokenizes using the "=" in the resource args
List<String> args = new ArrayList<>(alleleSpecificVQSRParamsNoNegativeData.length);
Expand Down
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/bothRecal.vcf
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/bothRecalWithAggregate.vcf
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/expected/bothTranches.txt
Git LFS file not shown
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/expected/expected.AS.recal.vcf
Git LFS file not shown
Git LFS file not shown
2 changes: 1 addition & 1 deletion src/test/resources/large/VQSR/expected/indelTranches.txt
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/expected/snpApplyResult.vcf
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/expected/snpSampledRecal.vcf
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
2 changes: 1 addition & 1 deletion src/test/resources/large/VQSR/indelRecal.vcf
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/snpRecal.vcf
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/snpRecal.vcf.idx
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/snpTranches.scattered.2.txt
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/large/VQSR/snpTranches.scattered.txt
Git LFS file not shown
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# Variant quality score tranches file
# Version number 5
targetTruthSensitivity,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,model,accessibleTruthSites,callsAtTruthSites,truthSensitivity
90.00,0,2507,0.0000,3.2205,-1.4017,VQSRTrancheSNP0.00to90.00,SNP,2786,2507,0.8999
97.00,0,2702,0.0000,3.2351,-2.1306,VQSRTrancheSNP90.00to97.00,SNP,2786,2702,0.9698
98.00,0,2730,0.0000,3.2457,-2.4533,VQSRTrancheSNP97.00to98.00,SNP,2786,2730,0.9799
99.00,0,2758,0.0000,3.2496,-3.2164,VQSRTrancheSNP98.00to99.00,SNP,2786,2758,0.9899
99.30,0,2766,0.0000,3.2488,-4.0653,VQSRTrancheSNP99.00to99.30,SNP,2786,2766,0.9928
99.40,0,2769,0.0000,3.2469,-5.1648,VQSRTrancheSNP99.30to99.40,SNP,2786,2769,0.9939
99.50,0,2772,0.0000,3.2450,-5.9262,VQSRTrancheSNP99.40to99.50,SNP,2786,2772,0.9950
99.60,0,2774,0.0000,3.2416,-6.5841,VQSRTrancheSNP99.50to99.60,SNP,2786,2774,0.9957
99.80,0,2780,0.0000,3.2314,-58913.3494,VQSRTrancheSNP99.60to99.80,SNP,2786,2780,0.9978
99.90,0,2783,0.0000,3.2295,-59210.6252,VQSRTrancheSNP99.80to99.90,SNP,2786,2783,0.9989
100.00,0,2786,0.0000,3.2340,-Infinity,VQSRTrancheSNP99.90to100.00,SNP,2786,2786,1.0000
99.95,0,2786,0.0000,3.2340,-Infinity,VQSRTrancheSNP100.00to99.95,SNP,2786,2786,1.0000
90.00,0,2507,0.0000,3.2134,935.3268,VQSRTrancheSNP0.00to90.00,SNP,2786,2507,0.8999
97.00,0,2702,0.0000,3.2484,758.5107,VQSRTrancheSNP90.00to97.00,SNP,2786,2702,0.9698
98.00,0,2730,0.0000,3.2523,710.0883,VQSRTrancheSNP97.00to98.00,SNP,2786,2730,0.9799
99.00,0,2758,0.0000,3.2562,626.3145,VQSRTrancheSNP98.00to99.00,SNP,2786,2758,0.9899
99.30,0,2766,0.0000,3.2423,573.6691,VQSRTrancheSNP99.00to99.30,SNP,2786,2766,0.9928
99.40,0,2769,0.0000,3.2469,511.3402,VQSRTrancheSNP99.30to99.40,SNP,2786,2769,0.9939
99.50,0,2772,0.0000,3.2515,342.2903,VQSRTrancheSNP99.40to99.50,SNP,2786,2772,0.9950
99.60,0,2774,0.0000,3.2481,250.6941,VQSRTrancheSNP99.50to99.60,SNP,2786,2774,0.9957
99.80,0,2780,0.0000,3.2314,-5.7548,VQSRTrancheSNP99.60to99.80,SNP,2786,2780,0.9978
99.90,0,2783,0.0000,3.2295,-12.9282,VQSRTrancheSNP99.80to99.90,SNP,2786,2783,0.9989
99.95,0,2784,0.0000,3.2310,-22753.7269,VQSRTrancheSNP99.90to99.95,SNP,2786,2784,0.9993
100.00,0,2786,0.0000,3.2340,-39326.4087,VQSRTrancheSNP99.95to100.00,SNP,2786,2786,1.0000
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Variant quality score tranches file
# Version number 5
targetTruthSensitivity,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,model,accessibleTruthSites,callsAtTruthSites,truthSensitivity
90.00,0,2507,0.0000,3.2205,-1.4017,VQSRTrancheSNP0.00to90.00,SNP,2786,2507,0.8999
99.00,0,2758,0.0000,3.2496,-3.2164,VQSRTrancheSNP90.00to99.00,SNP,2786,2758,0.9899
99.90,0,2783,0.0000,3.2295,-59210.6252,VQSRTrancheSNP99.00to99.90,SNP,2786,2783,0.9989
100.00,0,2786,0.0000,3.2340,-Infinity,VQSRTrancheSNP99.90to100.00,SNP,2786,2786,1.0000
90.00,0,2507,0.0000,3.2134,935.3268,VQSRTrancheSNP0.00to90.00,SNP,2786,2507,0.8999
99.00,0,2758,0.0000,3.2562,626.3145,VQSRTrancheSNP90.00to99.00,SNP,2786,2758,0.9899
99.90,0,2783,0.0000,3.2295,-12.9282,VQSRTrancheSNP99.00to99.90,SNP,2786,2783,0.9989
100.00,0,2786,0.0000,3.2340,-39326.4087,VQSRTrancheSNP99.90to100.00,SNP,2786,2786,1.0000

0 comments on commit b14f810

Please sign in to comment.