FilterVariantTranches documentation fix and improvement (#5837)
Adds best practices tranche filtering thresholds for single sample CNN filtration
lucidtronix authored Jul 2, 2019
1 parent bc96bfd commit ed33e5b
Showing 6 changed files with 95 additions and 12 deletions.
@@ -59,6 +59,10 @@
* {@link CNNVariantWriteTensors} and {@link CNNVariantTrain}.
* CNNVariantTrain will create a json architecture file and an hd5 weights file, which you can use with this tool.
*
* The advanced argument `info-annotation-keys` is available for models trained with different sets of info field annotations.
* To use it, you must first train your own model with the tools {@link CNNVariantWriteTensors} and {@link CNNVariantTrain}.
* Otherwise, providing this argument with anything other than the standard set of annotations will result in an error.
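*
* For example, a custom model and a matching, non-default annotation list might be supplied as sketched below
* (file names are placeholders, and the architecture and weights arguments refer to the json and hd5 files
* produced by your own {@link CNNVariantTrain} run):
*
* <pre>
* gatk CNNScoreVariants \
*   -V input.vcf.gz \
*   -R reference.fasta \
*   -O annotated.vcf.gz \
*   -architecture my_model.json \
*   -weights my_model.hd5 \
*   --info-annotation-keys MQ --info-annotation-keys DP --info-annotation-keys QD
* </pre>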
*
*
* <h3>1D Model with pre-trained architecture</h3>
*
@@ -140,6 +144,8 @@ public class CNNScoreVariants extends TwoPassVariantWalker {
private static final String ANNOTATION_SEPARATOR = ";"; // If changed make change in defines.py
private static final String ANNOTATION_SET_STRING = "=";// If changed make change in defines.py

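// The standard set of INFO annotations sent to the model; a warning is issued when info-annotation-keys differs from this list.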
private List<String> defaultAnnotationKeys = new ArrayList<>(Arrays.asList("MQ", "DP", "SOR", "FS", "QD", "MQRankSum", "ReadPosRankSum"));

@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc = "Output file")
@@ -164,8 +170,8 @@ public class CNNScoreVariants extends TwoPassVariantWalker {
private boolean filterSymbolicAndSV = false;

@Advanced
@Argument(fullName="info-annotation-keys", shortName="info-annotation-keys", doc="The VCF info fields to send to python.", optional=true)
private List<String> annotationKeys = new ArrayList<>(Arrays.asList("MQ", "DP", "SOR", "FS", "QD", "MQRankSum", "ReadPosRankSum"));
@Argument(fullName="info-annotation-keys", shortName="info-annotation-keys", doc="The VCF info fields to send to python. This should only be changed if a new model has been trained which expects the annotations provided here.", optional=true)
private List<String> annotationKeys = defaultAnnotationKeys;

@Advanced
@Argument(fullName = "inference-batch-size", shortName = "inference-batch-size", doc = "Size of batches for python to do inference on.", minValue = 1, maxValue = 4096, optional = true)
@@ -284,10 +290,14 @@ public void onTraversalStart() {
}

final VCFHeader inputHeader = getHeaderForVariants();
if(inputHeader.getGenotypeSamples().size() > 1) {
if (inputHeader.getGenotypeSamples().size() > 1) {
logger.warn("CNNScoreVariants is a single sample tool but the input VCF has more than 1 sample.");
}

if (!annotationKeys.equals(defaultAnnotationKeys)){
logger.warn("Annotation keys are not the default; you must also provide a trained model that expects these annotations.");
}

// Start the Python process and initialize a stream writer for streaming data to the Python code
pythonExecutor.start(Collections.emptyList(), enableJournal, pythonProfileResults);
pythonExecutor.initStreamWriter(AsynchronousStreamWriter.stringSerializer);
@@ -27,6 +27,22 @@

/**
* Apply tranche filtering to VCF based on scores from an annotation in the INFO field.
* The annotation can come from the {@link CNNScoreVariants} tool (CNNLOD), VQSR (VQSLOD),
* or any other variant scoring tool which adds numeric annotations in a VCF's INFO field.
*
* Tranches are specified in percent sensitivity to the variants in the resource files.
* For example, if you specify INDEL tranches 98.0 and 99.0 using the CNN_2D score,
* the filtered VCF will contain two filter tranches for INDELs: CNN_2D_INDEL_Tranche_98.00_99.00
* and CNN_2D_INDEL_Tranche_99.00_100.00. Variants that scored better than the 98th percentile of variants in the
* resources pass through the filter and will have `PASS` in the filter field. We expect variants in the tranche
* CNN_2D_INDEL_Tranche_99.00_100.00 to be more sensitive, but less precise than CNN_2D_INDEL_Tranche_98.00_99.00,
* because variants in CNN_2D_INDEL_Tranche_99.00_100.00 have lower scores than variants in the tranche
* CNN_2D_INDEL_Tranche_98.00_99.00.
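*
* As a concrete sketch, the two INDEL tranches above could be requested with a command like the following
* (SNPs would be filtered at the default SNP tranche, since no --snp-tranche is given):
* <pre>
* gatk FilterVariantTranches \
*   -V input.vcf.gz \
*   --resource hapmap.vcf \
*   --resource mills.vcf \
*   --info-key CNN_2D \
*   --indel-tranche 98.0 --indel-tranche 99.0 \
*   -O filtered.vcf
* </pre>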
*
* The default tranche filtering threshold for SNPs is 99.95 and for INDELs it is 99.4.
* These thresholds maximize the F1 score (the harmonic mean of sensitivity and precision)
* for whole genome human data but may need to be tweaked for different datasets.
*
*
* <h3>Inputs</h3>
* <ul>
@@ -36,6 +52,9 @@
* <li>tranche List of percent sensitivities to the known sites at which we will filter. Must be between 0 and 100.</li>
* </ul>
*
* If you want to remove existing filters from your VCF, add the argument `--invalidate-previous-filters`.
*
*
* <h3>Outputs</h3>
* <ul>
* <li>A tranche filtered VCF.</li>
@@ -51,10 +70,40 @@
* --resource hapmap.vcf \
* --resource mills.vcf \
* --info-key CNN_1D \
* --tranche 99.9 --tranche 99.0 --tranche 95 \
* --snp-tranche 99.95 \
* --indel-tranche 99.4 \
* -O filtered.vcf
* </pre>
*
* <h4>Apply tranche filters based on the scores in the info field with key CNN_2D
* and remove any existing filters from the VCF.</h4>
* <pre>
* gatk FilterVariantTranches \
* -V input.vcf.gz \
* --resource hapmap.vcf \
* --resource mills.vcf \
* --info-key CNN_2D \
* --snp-tranche 99.95 \
* --indel-tranche 99.4 \
* --invalidate-previous-filters \
* -O filtered.vcf
* </pre>
*
* <h4>Apply several tranche filters based on the scores in the info field with key CNN_2D.</h4>
* This will result in a VCF with the filters CNN_2D_SNP_Tranche_99.90_99.95, CNN_2D_SNP_Tranche_99.95_100.00,
* CNN_2D_INDEL_Tranche_99.00_99.40, and CNN_2D_INDEL_Tranche_99.40_100.00.
* The interpretation is that `PASS` variants are the most confident, variants in the tranche `CNN_2D_INDEL_Tranche_99.00_99.40`
* are intermediate, and variants in the tranche `CNN_2D_INDEL_Tranche_99.40_100.00` are the least confident.
* <pre>
* gatk FilterVariantTranches \
* -V input.vcf.gz \
* --resource hapmap.vcf \
* --resource mills.vcf \
* --info-key CNN_2D \
* --snp-tranche 99.9 --snp-tranche 99.95 \
* --indel-tranche 99.0 --indel-tranche 99.4 \
* -O filtered.vcf
* </pre>
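*
* <h4>Apply the default tranche filters (99.95 for SNPs and 99.4 for INDELs) based on the scores in the info field with key CNN_2D.</h4>
* A minimal sketch: when no --snp-tranche or --indel-tranche arguments are given, the default thresholds above are applied.
* <pre>
* gatk FilterVariantTranches \
*   -V input.vcf.gz \
*   --resource hapmap.vcf \
*   --resource mills.vcf \
*   --info-key CNN_2D \
*   -O filtered.vcf
* </pre>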
*/
@DocumentedFeature
@CommandLineProgramProperties(
@@ -75,15 +124,15 @@ public class FilterVariantTranches extends TwoPassVariantWalker {
"Higher numbers mean more desired sensitivity and thus less stringent filtering." +
"Specified in percents, i.e. 99.9 for 99.9 percent and 1.0 for 1 percent.",
optional=true)
private List<Double> snpTranches = new ArrayList<>(Arrays.asList(99.9, 99.99));
private List<Double> snpTranches = new ArrayList<>(Arrays.asList(99.95));

@Argument(fullName="indel-tranche",
shortName="indel-tranche",
doc="The level(s) of sensitivity to indels in the resource VCFs at which to filter indels. " +
"Higher numbers mean more desired sensitivity and thus less stringent filtering." +
"Specified in percents, i.e. 99.9 for 99.9 percent and 1.0 for 1 percent.",
optional=true)
private List<Double> indelTranches = new ArrayList<>(Arrays.asList(99.0, 99.5));
private List<Double> indelTranches = new ArrayList<>(Arrays.asList(99.4));

@Argument(fullName="resource",
shortName = "resource",
@@ -190,6 +239,10 @@ protected void secondPassApply(VariantContext variant, ReadsContext readsContext
filteredIndels++;
}
}

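// If no tranche filter was applied to this variant, mark it explicitly as PASS.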
if (builder.getFilters() == null || builder.getFilters().size() == 0){
builder.passFilters();
}

vcfWriter.add(builder.make());
}
@@ -16,6 +16,25 @@ public class FilterVariantTranchesIntegrationTest extends CommandLineProgramTes
/**
* Run the tool on a small test VCF.
*/
@Test
public void testTrancheFilteringDefaults() throws IOException {
final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz";
final String indelTruthVCF = largeFileTestDir + "VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf";
final String snpTruthVCF = largeFileTestDir + "VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf";

final ArgumentsBuilder argsBuilder = new ArgumentsBuilder();
argsBuilder.addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF)
.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s")
.addArgument("resource", snpTruthVCF)
.addArgument("resource", indelTruthVCF)
.addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT")
.addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false");

final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(),
Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranche_defaults.vcf"));
spec.executeTest("testTrancheFiltering", this);
}

@Test
public void testTrancheFiltering() throws IOException {
final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz";
@@ -33,7 +52,6 @@ public void testTrancheFiltering() throws IOException {
.addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT")
.addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false");


final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(),
Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_99_99.5.vcf"));
spec.executeTest("testTrancheFiltering", this);
@@ -130,7 +148,6 @@ public void testTrancheFilteringWithOldFilters() throws IOException {
.addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT")
.addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false");


final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(),
Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_99_with_old_filters.vcf"));
spec.executeTest("testTrancheFiltering", this);
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
