Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hard filters in M2 for mitochondria #5842

Merged
merged 2 commits into from
Mar 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance";
public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist";
public static final String IGNORE_ITR_ARTIFACTS_LONG_NAME = "ignore-itr-artifacts";
public static final String MEDIAN_AUTOSOMAL_COVERAGE_LONG_NAME = "median-autosomal-coverage";
public static final String MITOCHONDRIA_MODE_LONG_NAME = "mitochondria-mode";
public static final String CALLABLE_DEPTH_LONG_NAME = "callable-depth";
public static final String PCR_SNV_QUAL_LONG_NAME = "pcr-snv-qual";
Expand Down Expand Up @@ -230,13 +229,6 @@ public double getInitialLod() {
@Argument(fullName= IGNORE_ITR_ARTIFACTS_LONG_NAME, doc="Turn off read transformer that clips artifacts associated with end repair insertions near inverted tandem repeats.", optional = true)
public boolean dontClipITRArtifacts = false;

/**
* Used to model autosomal coverage when calling mitochondria. The median tends to be a more robust center statistic.
*/
@Advanced
@Argument(fullName = MEDIAN_AUTOSOMAL_COVERAGE_LONG_NAME, doc="For mitochondrial calling only; Annotate possible polymorphic NuMT based on Poisson distribution given median autosomal coverage", optional = true)
public double autosomalCoverage;

/**
* When Mutect2 is run in reference confidence mode with banding compression enabled (-ERC GVCF), homozygous-reference
* sites are compressed into bands of similar tumor LOD (TLOD) that are emitted as a single VCF record. See
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -306,9 +306,6 @@ public void onTraversalStart() {
public Collection<Annotation> makeVariantAnnotations(){
final Collection<Annotation> annotations = super.makeVariantAnnotations();

if (MTAC.autosomalCoverage > 0) {
annotations.add(new PolymorphicNuMT(MTAC.autosomalCoverage));
}
if (MTAC.mitochondria) {
annotations.add(new OriginalAlignment());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public class M2FiltersArgumentCollection {
public double initialPosteriorThreshold = DEFAULT_INITIAL_POSTERIOR_THRESHOLD;

/**
* Mitochondria mode includes the filter{@link ChimericOriginalAlignmentFilter}
* Mitochondria mode includes the filter{@link ChimericOriginalAlignmentFilter} and {@link PolymorphicNuMTFilter},
* and excludes the filters {@link ClusteredEventsFilter}, {@link MultiallelicFilter}, {@link PolymeraseSlippageFilter},
* {@link FilteredHaplotypeFilter}, and {@link GermlineFilter}
*/
Expand All @@ -55,9 +55,10 @@ public class M2FiltersArgumentCollection {
public static final String MAX_MEDIAN_FRAGMENT_LENGTH_DIFFERENCE_LONG_NAME = "max-median-fragment-length-difference";
public static final String MIN_MEDIAN_READ_POSITION_LONG_NAME = "min-median-read-position";
public static final String MAX_N_RATIO_LONG_NAME = "max-n-ratio";
public static final String MIN_LOG_10_ODDS_DIVIDED_BY_DEPTH = "lod-divided-by-depth";
public static final String MIN_READS_ON_EACH_STRAND_LONG_NAME = "min-reads-per-strand";
public static final String MAX_NUMT_FRACTION_LONG_NAME = "max-numt-fraction";
public static final String MEDIAN_AUTOSOMAL_COVERAGE_LONG_NAME = "autosomal-coverage";
public static final String MIN_AF_LONG_NAME = "min-allele-fraction";

private static final int DEFAULT_MAX_EVENTS_IN_REGION = 2;
private static final int DEFAULT_MAX_ALT_ALLELES = 1;
Expand All @@ -69,6 +70,8 @@ public class M2FiltersArgumentCollection {
private static final double DEFAULT_MAX_N_RATIO = Double.POSITIVE_INFINITY;
private static final int DEFAULT_MIN_READS_ON_EACH_STRAND = 0;
private static final double DEFAULT_MAX_NUMT_FRACTION = 0.85;
private static final double DEFAULT_MEDIAN_AUTOSOMAL_COVERAGE = 0;
private static final double DEFAULT_MIN_AF = 0;

@Argument(fullName = MAX_EVENTS_IN_REGION_LONG_NAME, optional = true, doc = "Maximum events in a single assembly region. Filter all variants if exceeded.")
public int maxEventsInRegion = DEFAULT_MAX_EVENTS_IN_REGION;
Expand Down Expand Up @@ -97,9 +100,15 @@ public class M2FiltersArgumentCollection {
@Argument(fullName = MIN_READS_ON_EACH_STRAND_LONG_NAME, optional = true, doc = "Minimum alt reads required on both forward and reverse strands")
public int minReadsOnEachStrand = DEFAULT_MIN_READS_ON_EACH_STRAND;

@Argument(fullName = MEDIAN_AUTOSOMAL_COVERAGE_LONG_NAME, optional = true, doc = "Median autosomal coverage for filtering potential polymporphic NuMTs when calling on mitochondria.")
public double medianAutosomalCoverage = DEFAULT_MEDIAN_AUTOSOMAL_COVERAGE;

@Argument(fullName = MAX_NUMT_FRACTION_LONG_NAME, doc="Maximum fraction of alt reads that originally aligned outside the mitochondria. These are due to NuMTs.", optional = true)
public double maxNuMTFraction = DEFAULT_MAX_NUMT_FRACTION;

@Argument(fullName = MIN_AF_LONG_NAME, doc="Minimum allele fraction required", optional = true)
public double minAf = DEFAULT_MIN_AF;


/**
* Input files and values to use if inputs are missing
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package org.broadinstitute.hellbender.tools.walkers.mutect.filtering;

import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang.mutable.MutableBoolean;
import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;

import java.util.*;
import java.util.stream.IntStream;

public class MinAlleleFractionFilter extends HardFilter {
private final double minAf;

public MinAlleleFractionFilter(final double minAf) {
this.minAf = minAf;
}

@Override
public ErrorType errorType() { return ErrorType.ARTIFACT; }

@Override
public boolean isArtifact(final VariantContext vc, final Mutect2FilteringEngine filteringEngine) {
return vc.getGenotypes().stream().filter(filteringEngine::isTumor)
.filter(g -> g.hasExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY))
.anyMatch(g -> {
final double[] alleleFractions = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(g, GATKVCFConstants.ALLELE_FRACTION_KEY, () -> null, 1.0);
final int numRealAlleles = vc.hasSymbolicAlleles() ? alleleFractions.length - 1 : alleleFractions.length;
final OptionalDouble max = IntStream.range(0, numRealAlleles).mapToDouble(a -> alleleFractions[a]).max();
return max.getAsDouble() < minAf;
});
}

@Override
public String filterName() {
return GATKVCFConstants.ALLELE_FRACTION_FILTER_NAME;
}

@Override
protected List<String> requiredAnnotations() { return Collections.emptyList(); }
}
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ private void buildFiltersList(final M2FiltersArgumentCollection MTFAC) {
filters.add(new NRatioFilter(MTFAC.nRatio));
filters.add(new StrictStrandBiasFilter(MTFAC.minReadsOnEachStrand));
filters.add(new ReadPositionFilter(MTFAC.minMedianReadPosition));
filters.add(new MinAlleleFractionFilter(MTFAC.minAf));

if (!MTFAC.readOrientationPriorTarGzs.isEmpty()) {
final List<File> artifactTables = MTFAC.readOrientationPriorTarGzs.stream().flatMap(tarGz -> {
Expand All @@ -219,6 +220,7 @@ private void buildFiltersList(final M2FiltersArgumentCollection MTFAC) {

if (MTFAC.mitochondria) {
filters.add(new ChimericOriginalAlignmentFilter(MTFAC.maxNuMTFraction));
filters.add(new PolymorphicNuMTFilter(MTFAC.medianAutosomalCoverage));
} else {
filters.add(new ClusteredEventsFilter(MTFAC.maxEventsInRegion));
filters.add(new MultiallelicFilter(MTFAC.numAltAllelesThreshold));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package org.broadinstitute.hellbender.tools.walkers.mutect.filtering;

import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.commons.math3.distribution.PoissonDistribution;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;

import java.util.Collections;
import java.util.List;
import java.util.OptionalInt;
import java.util.stream.IntStream;

public class PolymorphicNuMTFilter extends HardFilter {
private static final double LOWER_BOUND_PROB = .01;
private static final double MULTIPLE_COPIES_MULTIPLIER = 1.5;
private final int maxAltDepthCutoff;

public PolymorphicNuMTFilter(final double medianAutosomalCoverage){
if (medianAutosomalCoverage != 0) {
final PoissonDistribution autosomalCoverage = new PoissonDistribution(medianAutosomalCoverage * MULTIPLE_COPIES_MULTIPLIER);
maxAltDepthCutoff = autosomalCoverage.inverseCumulativeProbability(1 - LOWER_BOUND_PROB);
} else {
maxAltDepthCutoff = 0;
}
}

@Override
public ErrorType errorType() { return ErrorType.NON_SOMATIC; }

@Override
public boolean isArtifact(final VariantContext vc, final Mutect2FilteringEngine filteringEngine) {
return vc.getGenotypes().stream().filter(filteringEngine::isTumor)
.filter(Genotype::hasAD)
.anyMatch(g -> {
final int[] alleleDepths = g.getAD();
final int numRealAlleles = vc.hasSymbolicAlleles() ? alleleDepths.length - 1 : alleleDepths.length;
//Start at first alternate allele depth (the ref allele is first)
final OptionalInt max = IntStream.range(1, numRealAlleles).map(a -> alleleDepths[a]).max();
return max.getAsInt() < maxAltDepthCutoff;
});
}

@Override
public String filterName() {
return GATKVCFConstants.POTENTIAL_POLYMORPHIC_NUMT_FILTER_NAME;
}

@Override
protected List<String> requiredAnnotations() { return Collections.emptyList(); }
}
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,6 @@ public final class GATKVCFConstants {

// M2-specific FORMAT keys
public static final String ALLELE_FRACTION_KEY = "AF";
public static final String POTENTIAL_POLYMORPHIC_NUMT_KEY = "NUMT";

//FILTERS
/* Note that many filters used throughout GATK (most notably in VariantRecalibration) are dynamic,
Expand All @@ -171,6 +170,8 @@ their names (or descriptions) depend on some threshold. Those filters are not i
public final static String STRICT_STRAND_BIAS_FILTER_NAME = "strict_strand";
public final static String N_RATIO_FILTER_NAME = "n_ratio";
public final static String CHIMERIC_ORIGINAL_ALIGNMENT_FILTER_NAME = "numt_chimera"; //mitochondria
public final static String ALLELE_FRACTION_FILTER_NAME = "low_allele_frac";
public static final String POTENTIAL_POLYMORPHIC_NUMT_FILTER_NAME = "numt_novel";

public static final List<String> MUTECT_FILTER_NAMES = Arrays.asList(POLYMERASE_SLIPPAGE,
PON_FILTER_NAME, CLUSTERED_EVENTS_FILTER_NAME, TUMOR_EVIDENCE_FILTER_NAME, GERMLINE_RISK_FILTER_NAME,
Expand All @@ -179,7 +180,7 @@ their names (or descriptions) depend on some threshold. Those filters are not i
MEDIAN_FRAGMENT_LENGTH_DIFFERENCE_FILTER_NAME,
READ_POSITION_FILTER_NAME, CONTAMINATION_FILTER_NAME, DUPLICATED_EVIDENCE_FILTER_NAME,
READ_ORIENTATION_ARTIFACT_FILTER_NAME, BAD_HAPLOTYPE_FILTER_NAME, CHIMERIC_ORIGINAL_ALIGNMENT_FILTER_NAME,
STRICT_STRAND_BIAS_FILTER_NAME, N_RATIO_FILTER_NAME);
STRICT_STRAND_BIAS_FILTER_NAME, N_RATIO_FILTER_NAME, ALLELE_FRACTION_FILTER_NAME, POTENTIAL_POLYMORPHIC_NUMT_FILTER_NAME);

// Symbolic alleles
public final static String SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG = "ALT";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,11 @@ public static VCFFormatHeaderLine getEquivalentFormatHeaderLine(final String inf
addFilterLine(new VCFFilterHeaderLine(BAD_HAPLOTYPE_FILTER_NAME, "Variant near filtered variant on same haplotype."));
addFilterLine(new VCFFilterHeaderLine(STRICT_STRAND_BIAS_FILTER_NAME, "Evidence for alt allele is not represented in both directions"));
addFilterLine(new VCFFilterHeaderLine(N_RATIO_FILTER_NAME, "Ratio of N to alt exceeds specified ratio"));
addFilterLine(new VCFFilterHeaderLine(ALLELE_FRACTION_FILTER_NAME, "Allele fraction is below specified threshold"));

//Mitochondrial M2-related filters
addFilterLine(new VCFFilterHeaderLine(CHIMERIC_ORIGINAL_ALIGNMENT_FILTER_NAME, "NuMT variant with too many ALT reads originally from autosome"));
addFilterLine(new VCFFilterHeaderLine(POTENTIAL_POLYMORPHIC_NUMT_FILTER_NAME, "Alt depth is below expected coverage of NuMT in autosome"));

addFormatLine(new VCFFormatHeaderLine(ALLELE_BALANCE_KEY, 1, VCFHeaderLineType.Float, "Allele balance for each het genotype"));
addFormatLine(new VCFFormatHeaderLine(MAPPING_QUALITY_ZERO_BY_SAMPLE_KEY, 1, VCFHeaderLineType.Integer, "Number of Mapping Quality Zero Reads per sample"));
Expand Down Expand Up @@ -122,7 +124,6 @@ public static VCFFormatHeaderLine getEquivalentFormatHeaderLine(final String inf
addFormatLine(new VCFFormatHeaderLine(ALLELE_FRACTION_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele fractions of alternate alleles in the tumor"));
addFormatLine(new VCFFormatHeaderLine(F1R2_KEY, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, "Count of reads in F1R2 pair orientation supporting each allele"));
addFormatLine(new VCFFormatHeaderLine(F2R1_KEY, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, "Count of reads in F2R1 pair orientation supporting each allele"));
addFormatLine(new VCFFormatHeaderLine(POTENTIAL_POLYMORPHIC_NUMT_KEY, 1, VCFHeaderLineType.String, "Potentially a polymorphic NuMT false positive rather than a real mitochondrial variant."));

addInfoLine(new VCFInfoHeaderLine(MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed"));
addInfoLine(new VCFInfoHeaderLine(MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed"));
Expand Down
Loading