FilterVariantTranches documentation fix and improvement (#5837)
Adds best practices tranche filtering thresholds for single sample CNN filtration
lucidtronix authored Jul 2, 2019
1 parent bc96bfd commit ed33e5b
Showing 6 changed files with 95 additions and 12 deletions.
@@ -59,6 +59,10 @@
* {@link CNNVariantWriteTensors} and {@link CNNVariantTrain}.
* CNNVariantTrain will create a json architecture file and an hd5 weights file, which you can use with this tool.
*
* The advanced argument `info-annotation-keys` is available for models trained with different sets of info field annotations.
* To use it, you must first train your own model with the tools {@link CNNVariantWriteTensors} and {@link CNNVariantTrain}.
* Otherwise, providing this argument with anything other than the standard set of annotations will result in an error.
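*
* For example, a custom model and a matching, non-default annotation list might be supplied as sketched below
* (file names are placeholders, and the architecture and weights arguments refer to the json and hd5 files
* produced by your own {@link CNNVariantTrain} run):
*
* <pre>
* gatk CNNScoreVariants \
*   -V input.vcf.gz \
*   -R reference.fasta \
*   -O annotated.vcf.gz \
*   -architecture my_model.json \
*   -weights my_model.hd5 \
*   --info-annotation-keys MQ --info-annotation-keys DP --info-annotation-keys QD
* </pre>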
*
*
* <h3>1D Model with pre-trained architecture</h3>
*
@@ -140,6 +144,8 @@ public class CNNScoreVariants extends TwoPassVariantWalker {
private static final String ANNOTATION_SEPARATOR = ";"; // If changed make change in defines.py
private static final String ANNOTATION_SET_STRING = "=";// If changed make change in defines.py

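// The standard set of INFO annotations sent to the model; a warning is issued when info-annotation-keys differs from this list.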
private List<String> defaultAnnotationKeys = new ArrayList<>(Arrays.asList("MQ", "DP", "SOR", "FS", "QD", "MQRankSum", "ReadPosRankSum"));

@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc = "Output file")
@@ -164,8 +170,8 @@ public class CNNScoreVariants extends TwoPassVariantWalker {
private boolean filterSymbolicAndSV = false;

@Advanced
@Argument(fullName="info-annotation-keys", shortName="info-annotation-keys", doc="The VCF info fields to send to python.", optional=true)
private List<String> annotationKeys = new ArrayList<>(Arrays.asList("MQ", "DP", "SOR", "FS", "QD", "MQRankSum", "ReadPosRankSum"));
@Argument(fullName="info-annotation-keys", shortName="info-annotation-keys", doc="The VCF info fields to send to python. This should only be changed if a new model has been trained which expects the annotations provided here.", optional=true)
private List<String> annotationKeys = defaultAnnotationKeys;

@Advanced
@Argument(fullName = "inference-batch-size", shortName = "inference-batch-size", doc = "Size of batches for python to do inference on.", minValue = 1, maxValue = 4096, optional = true)
@@ -284,10 +290,14 @@ public void onTraversalStart() {
}

final VCFHeader inputHeader = getHeaderForVariants();
if(inputHeader.getGenotypeSamples().size() > 1) {
if (inputHeader.getGenotypeSamples().size() > 1) {
logger.warn("CNNScoreVariants is a single sample tool but the input VCF has more than 1 sample.");
}

if (!annotationKeys.equals(defaultAnnotationKeys)){
logger.warn("Annotation keys are not the default; you must also provide a trained model that expects these annotations.");
}

// Start the Python process and initialize a stream writer for streaming data to the Python code
pythonExecutor.start(Collections.emptyList(), enableJournal, pythonProfileResults);
pythonExecutor.initStreamWriter(AsynchronousStreamWriter.stringSerializer);
@@ -27,6 +27,22 @@

/**
* Apply tranche filtering to VCF based on scores from an annotation in the INFO field.
* The annotation can come from the {@link CNNScoreVariants} tool (CNNLOD), VQSR (VQSLOD),
* or any other variant scoring tool which adds numeric annotations in a VCF's INFO field.
*
* Tranches are specified in percent sensitivity to the variants in the resource files.
* For example, if you specify INDEL tranches 98.0 and 99.0 using the CNN_2D score,
* the filtered VCF will contain two filter tranches for INDELs: CNN_2D_INDEL_Tranche_98.00_99.00
* and CNN_2D_INDEL_Tranche_99.00_100.00. Variants that scored better than the 98th percentile of variants in the
* resources pass through the filter and will have `PASS` in the filter field. We expect variants in the tranche
* CNN_2D_INDEL_Tranche_99.00_100.00 to be more sensitive, but less precise than CNN_2D_INDEL_Tranche_98.00_99.00,
* because variants in CNN_2D_INDEL_Tranche_99.00_100.00 have lower scores than variants in the tranche
* CNN_2D_INDEL_Tranche_98.00_99.00.
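*
* As a concrete sketch, the two INDEL tranches above could be requested with a command like the following
* (SNPs would be filtered at the default SNP tranche, since no --snp-tranche is given):
* <pre>
* gatk FilterVariantTranches \
*   -V input.vcf.gz \
*   --resource hapmap.vcf \
*   --resource mills.vcf \
*   --info-key CNN_2D \
*   --indel-tranche 98.0 --indel-tranche 99.0 \
*   -O filtered.vcf
* </pre>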
*
* The default tranche filtering threshold for SNPs is 99.95 and for INDELs it is 99.4.
* These thresholds maximize the F1 score (the harmonic mean of sensitivity and precision)
* for whole genome human data but may need to be tweaked for different datasets.
*
*
* <h3>Inputs</h3>
* <ul>
@@ -36,6 +52,9 @@
* <li>tranche List of percent sensitivities to the known sites at which we will filter. Must be between 0 and 100.</li>
* </ul>
*
* If you want to remove existing filters from your VCF, add the argument `--invalidate-previous-filters`.
*
*
* <h3>Outputs</h3>
* <ul>
* <li>A tranche filtered VCF.</li>
@@ -51,10 +70,40 @@
* --resource hapmap.vcf \
* --resource mills.vcf \
* --info-key CNN_1D \
* --tranche 99.9 --tranche 99.0 --tranche 95 \
* --snp-tranche 99.95 \
* --indel-tranche 99.4 \
* -O filtered.vcf
* </pre>
*
* <h4>Apply tranche filters based on the scores in the info field with key CNN_2D
* and remove any existing filters from the VCF.</h4>
* <pre>
* gatk FilterVariantTranches \
* -V input.vcf.gz \
* --resource hapmap.vcf \
* --resource mills.vcf \
* --info-key CNN_2D \
* --snp-tranche 99.95 \
* --indel-tranche 99.4 \
* --invalidate-previous-filters \
* -O filtered.vcf
* </pre>
*
* <h4>Apply several tranche filters based on the scores in the info field with key CNN_2D.</h4>
* This will result in a VCF with the filters CNN_2D_SNP_Tranche_99.90_99.95, CNN_2D_SNP_Tranche_99.95_100.00,
* CNN_2D_INDEL_Tranche_99.00_99.40, and CNN_2D_INDEL_Tranche_99.40_100.00.
* The interpretation is that `PASS` variants are the most confident, variants in the tranche `CNN_2D_INDEL_Tranche_99.00_99.40`
* are intermediate, and variants in the tranche `CNN_2D_INDEL_Tranche_99.40_100.00` are the least confident.
* <pre>
* gatk FilterVariantTranches \
* -V input.vcf.gz \
* --resource hapmap.vcf \
* --resource mills.vcf \
* --info-key CNN_2D \
* --snp-tranche 99.9 --snp-tranche 99.95 \
* --indel-tranche 99.0 --indel-tranche 99.4 \
* -O filtered.vcf
* </pre>
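*
* <h4>Apply the default tranche filters (99.95 for SNPs and 99.4 for INDELs) based on the scores in the info field with key CNN_2D.</h4>
* A minimal sketch: when no --snp-tranche or --indel-tranche arguments are given, the default thresholds above are applied.
* <pre>
* gatk FilterVariantTranches \
*   -V input.vcf.gz \
*   --resource hapmap.vcf \
*   --resource mills.vcf \
*   --info-key CNN_2D \
*   -O filtered.vcf
* </pre>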
*/
@DocumentedFeature
@CommandLineProgramProperties(
@@ -75,15 +124,15 @@ public class FilterVariantTranches extends TwoPassVariantWalker {
"Higher numbers mean more desired sensitivity and thus less stringent filtering." +
"Specified in percents, i.e. 99.9 for 99.9 percent and 1.0 for 1 percent.",
optional=true)
private List<Double> snpTranches = new ArrayList<>(Arrays.asList(99.9, 99.99));
private List<Double> snpTranches = new ArrayList<>(Arrays.asList(99.95));

@Argument(fullName="indel-tranche",
shortName="indel-tranche",
doc="The level(s) of sensitivity to indels in the resource VCFs at which to filter indels. " +
"Higher numbers mean more desired sensitivity and thus less stringent filtering." +
"Specified in percents, i.e. 99.9 for 99.9 percent and 1.0 for 1 percent.",
optional=true)
private List<Double> indelTranches = new ArrayList<>(Arrays.asList(99.0, 99.5));
private List<Double> indelTranches = new ArrayList<>(Arrays.asList(99.4));

@Argument(fullName="resource",
shortName = "resource",
@@ -190,6 +239,10 @@ protected void secondPassApply(VariantContext variant, ReadsContext readsContext
filteredIndels++;
}
}

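// If no tranche filter was applied to this variant, mark it explicitly as PASS.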
if (builder.getFilters() == null || builder.getFilters().size() == 0){
builder.passFilters();
}

vcfWriter.add(builder.make());
}
@@ -16,6 +16,25 @@ public class FilterVariantTranchesIntegrationTest extends CommandLineProgramTes
/**
* Run the tool on a small test VCF.
*/
@Test
public void testTrancheFilteringDefaults() throws IOException {
final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz";
final String indelTruthVCF = largeFileTestDir + "VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf";
final String snpTruthVCF = largeFileTestDir + "VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf";

final ArgumentsBuilder argsBuilder = new ArgumentsBuilder();
argsBuilder.addArgument(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF)
.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, "%s")
.addArgument("resource", snpTruthVCF)
.addArgument("resource", indelTruthVCF)
.addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT")
.addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false");

final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(),
Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranche_defaults.vcf"));
spec.executeTest("testTrancheFiltering", this);
}

@Test
public void testTrancheFiltering() throws IOException {
final String trancheVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz";
@@ -33,7 +52,6 @@ public void testTrancheFiltering() throws IOException {
.addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT")
.addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false");


final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(),
Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_99_99.5.vcf"));
spec.executeTest("testTrancheFiltering", this);
@@ -130,7 +148,6 @@ public void testTrancheFilteringWithOldFilters() throws IOException {
.addArgument("info-key", "MIX_SMALL_2D_W_DROPOUT")
.addArgument(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false");


final IntegrationTestSpec spec = new IntegrationTestSpec(argsBuilder.toString(),
Arrays.asList(largeFileTestDir + "VQSR/expected/g94982_20_1m_10m_tranched_99_with_old_filters.vcf"));
spec.executeTest("testTrancheFiltering", this);
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
