likelihoods, final VariantContext vc){
final int[][] table = getContingencyTable(likelihoods, vc, MIN_COUNT);
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotator.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotator.java
new file mode 100644
index 00000000000..ab00261bc57
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotator.java
@@ -0,0 +1,327 @@
+package org.broadinstitute.hellbender.tools.walkers.annotator;
+
+import com.google.common.collect.Lists;
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.variantcontext.writer.VariantContextWriter;
+import htsjdk.variant.vcf.*;
+import org.broadinstitute.barclay.argparser.Advanced;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.ArgumentCollection;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection;
+import org.broadinstitute.hellbender.cmdline.argumentcollections.VariantAnnotationArgumentCollection;
+import org.broadinstitute.hellbender.cmdline.programgroups.VariantProgramGroup;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.engine.filters.ReadFilter;
+import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
+import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyBasedCallerUtils;
+import org.broadinstitute.hellbender.utils.BaseUtils;
+import org.broadinstitute.hellbender.utils.genotyper.*;
+import org.broadinstitute.hellbender.utils.genotyper.SampleList;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+import java.io.File;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Annotate variant calls with context information
+ *
+ *
+ * This tool is designed to annotate variant calls based on their context (as opposed to functional annotation).
+ * Various annotation modules are available; see the "Annotation Modules" page linked in the Tool Documentation sidebar for a complete list.
+ *
+ *
Input
+ *
+ * A variant set to annotate and optionally one or more BAM files.
+ *
+ *
+ * Output
+ *
+ * An annotated VCF.
+ *
+ *
+ * Usage examples
+ *
+ *
+ * Annotate a VCF with dbSNP IDs and depth of coverage for each sample
+ *
+ * VariantAnnotator \
+ * -R reference.fasta \
+ * -I input.bam \
+ * -V input.vcf \
+ * -o output.vcf \
+ * -A Coverage \
+ * -L input.vcf \
+ * --dbsnp dbsnp.vcf
+ *
+ *
+ * Annotate a VCF with allele frequency by an external resource. Annotation will only occur if there is allele concordance between the resource and the input VCF
+ *
+ * VariantAnnotator \
+ * -R reference.fasta \
+ * -I input.bam \
+ * -V input.vcf \
+ * -o output.vcf \
+ * -L input.vcf \
+ * --resource foo:resource.vcf \
+ * -E foo.AF \
+ * --resourceAlleleConcordance
+ *
+ *
+ * Annotate with AF and FILTER fields from an external resource
+ *
+ * VariantAnnotator \
+ * -R reference.fasta \
+ * -V input.vcf \
+ * -o output.vcf \
+ * --resource foo:resource.vcf \
+ * --expression foo.AF \
+ * --expression foo.FILTER
+ *
+ *
+ */
+@CommandLineProgramProperties(summary="Tool for adding annotations to VCF files",
+ oneLineSummary = "Tool for adding annotations to VCF files",
+ programGroup = VariantProgramGroup.class)
+
+public class VariantAnnotator extends VariantWalker {
+ private VariantContextWriter vcfWriter;
+
+ /**
+ * The rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. Note that dbSNP is not used in any way for the calculations themselves.
+ */
+ @ArgumentCollection
+ protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
+
+ /**
+ * If a call overlaps with a record from the provided comp track, the INFO field will be annotated
+ * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). Records that are
+ * filtered in the comp track will be ignored. Note that 'dbSNP' has been special-cased (see the --dbsnp argument).
+ */
+ @Advanced
+ @Argument(fullName = "comp", shortName = "comp", doc = "Comparison VCF file(s)", optional = true)
+ public List> comps = new ArrayList<>();
+
+ /**
+ * An external resource VCF file or files from which to annotate.
+ *
+ * Use this option to add annotations from a resource file to the output.
+ * For example, if you want to annotate your callset with the AC field value from a VCF file named
+ * 'resource_file.vcf', you tag it with '-resource:my_resource resource_file.vcf' and you additionally specify
+ * '-E my_resource.AC' (-E is short for --expression, also documented on this page). In the resulting output
+ * VCF, any records for which there is a record at the same position in the resource file will be annotated with
+ * 'my_resource.AC=N'. Note that if there are multiple records in the resource file that overlap the given
+ * position, one is chosen randomly. Check for allele concordance if using --resourceAlleleConcordance, otherwise
+ * the match is based on position only.
+ */
+ @Argument(fullName="resource", shortName = "resource", doc="External resource VCF file", optional=true)
+ public List> resources;
+
+ @Argument(fullName= StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+ shortName=StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+ doc="The file to whcih variants should be written", optional=false)
+ protected File outputFile;
+
+ /**
+ * See the --list argument to view available annotations.
+ */
+ @ArgumentCollection
+ public VariantAnnotationArgumentCollection variantAnnotationArgumentCollection = new VariantAnnotationArgumentCollection(
+ Collections.emptyList(),
+ Collections.emptyList(),
+ Collections.emptyList());
+
+ /**
+ * This option enables you to add annotations from one VCF to another.
+ *
+ * For example, if you want to annotate your callset with the AC field value from a VCF file named
+ * 'resource_file.vcf', you tag it with '-resource:my_resource resource_file.vcf' (see the -resource argument, also
+ * documented on this page) and you specify '-E my_resource.AC'. In the resulting output VCF, any records for
+ * which there is a record at the same position in the resource file will be annotated with 'my_resource.AC=N'.
+ * INFO field data, ID, ALT, and FILTER fields may be used as expression values.
+ * Note that if there are multiple records in the resource file that overlap the given position, one is chosen
+ * randomly.
+ */
+ @Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls", optional=true)
+ protected Set expressionsToUse = new HashSet<>();
+
+ /**
+ * If this argument is specified, add annotations (specified by --expression) from an external resource
+ * (specified by --resource) to the input VCF (specified by --variant) only if the alleles are
+ * concordant between input and the resource VCFs. Otherwise, always add the annotations.
+ */
+ @Argument(fullName="resourceAlleleConcordance", shortName="rac", doc="Check for allele concordances when using an external resource VCF file", optional=true)
+ protected Boolean expressionAlleleConcordance = false;
+
+ /**
+ * You can use the -XL argument in combination with this one to exclude specific annotations.Note that some
+ * annotations may not be actually applied if they are not applicable to the data provided or if they are
+ * unavailable to the tool (e.g. there are several annotations that are currently not hooked up to
+ * HaplotypeCaller). At present no error or warning message will be provided, the annotation will simply be
+ * skipped silently. You can check the output VCF header to see which annotations were actually applied (although
+ * this does not guarantee that the annotation was applied to all records in the VCF, since some annotations have
+ * additional requirements, e.g. minimum number of samples or heterozygous sites only -- see the documentation
+ * for individual annotations' requirements).
+ */
+ @Argument(fullName="useAllAnnotations", shortName="all", doc="Use all possible annotations (not for the faint of heart)", optional=true)
+ protected Boolean USE_ALL_ANNOTATIONS = false;
+
+ /**
+ * Note that the --list argument requires a fully resolved and correct command-line to work. As an alternative,
+ * you can use ListAnnotations (see Help Utilities).
+ */
+ @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit", optional=true)
+ protected Boolean LIST = false;
+
+ /**
+ * By default, a dbSNP ID is added only when the ID field in the variant record is empty (not already annotated).
+ * This argument allows you to override that behavior, and appends the new ID to the existing one. This is used
+ * in conjunction with the -dbsnp argument.
+ */
+ @Argument(fullName="alwaysAppendDbsnpId", shortName="alwaysAppendDbsnpId", doc="Add dbSNP ID even if one is already present", optional=true)
+ protected Boolean ALWAYS_APPEND_DBSNP_ID = false;
+ public boolean alwaysAppendDbsnpId() { return ALWAYS_APPEND_DBSNP_ID; }
+
+ /**
+ * The genotype quality (GQ) threshold above which the mendelian violation ratio should be annotated.
+ */
+ @Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",optional=true,doc="GQ threshold for annotating MV ratio")
+ public double minGenotypeQualityP = 0.0;
+
+ private VariantAnnotatorEngine annotatorEngine;
+ private SampleList variantSamples;
+
+ /**
+ * Prepare the output file and the list of available features.
+ */
+ public void onTraversalStart() {
+ //TODO currently there is no way to list all of the GATK available annotations using the '-list' argument, in GATK3 this was accomplished
+ //TODO through a helper class, the barclay plugin refactor for annoations will probably accomplis this
+// if ( LIST ) {
+// AnnotationHelpUtils.listAnnotations();
+// System.exit(0);
+// }
+
+ // get the list of all sample names from the variant VCF input rod, if applicable
+ final List samples = getHeaderForVariants().getGenotypeSamples(); // TODO right samples?
+ variantSamples = new IndexedSampleList(samples);
+
+ if ( USE_ALL_ANNOTATIONS ) {
+ annotatorEngine = VariantAnnotatorEngine.ofAllMinusExcluded(variantAnnotationArgumentCollection.annotationsToExclude, dbsnp.dbsnp, comps, false);
+ } else {
+ annotatorEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(variantAnnotationArgumentCollection, dbsnp.dbsnp, comps, false);
+ }
+ //TODO add expressions?
+ annotatorEngine.addExpressions(expressionsToUse, resources, expressionAlleleConcordance );
+
+ // setup the header fields
+ // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
+ final Set hInfo = new HashSet<>();
+
+ hInfo.addAll(annotatorEngine.getVCFAnnotationDescriptions(false)); //TODO allele specific annotations
+ hInfo.addAll(getHeaderForVariants().getMetaDataInInputOrder());
+
+ // for the expressions, pull the info header line from the header of the resource rod
+ for ( final VariantAnnotatorEngine.VAExpression expression : annotatorEngine.getRequestedExpressions() ) {
+ // special case the ID field
+ if ( expression.fieldName.equals("ID") ) {
+ hInfo.add(new VCFInfoHeaderLine(expression.fullName, 1, VCFHeaderLineType.String, "ID field transferred from external VCF resource"));
+ } else {
+ VCFInfoHeaderLine targetHeaderLine = null;
+ for (final VCFHeaderLine line : ((VCFHeader) getHeaderForFeatures(expression.binding)).getMetaDataInInputOrder()) {
+ if (line instanceof VCFInfoHeaderLine) {
+ final VCFInfoHeaderLine infoline = (VCFInfoHeaderLine) line;
+ if (infoline.getID().equals(expression.fieldName)) {
+ targetHeaderLine = infoline;
+ expression.hInfo = targetHeaderLine;
+ break;
+ }
+ }
+ }
+
+ VCFInfoHeaderLine lineToAdd;
+ if (targetHeaderLine != null) {
+ if (targetHeaderLine.getCountType() == VCFHeaderLineCount.INTEGER) {
+ lineToAdd = new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCount(), targetHeaderLine.getType(), targetHeaderLine.getDescription());
+ } else {
+ lineToAdd = new VCFInfoHeaderLine(expression.fullName, targetHeaderLine.getCountType(), targetHeaderLine.getType(), targetHeaderLine.getDescription());
+ }
+ } else {
+ lineToAdd = new VCFInfoHeaderLine(expression.fullName, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource");
+ }
+ hInfo.add(lineToAdd);
+ expression.hInfo = lineToAdd;
+ }
+ }
+
+ // TODO ask reviewer, VCFUtils.withUpdatedContigs is what GATK3 calls into, it isn't used anywhere in 4 though so should it be used here?
+ VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
+ vcfWriter = createVCFWriter(outputFile);
+ vcfWriter.writeHeader(VCFUtils.withUpdatedContigs(vcfHeader, referenceArguments.getReferenceFile(), referenceArguments.getReferenceFile()==null ? getBestAvailableSequenceDictionary(): getReferenceDictionary()));
+ }
+
+ /**
+ * For each site of interest, annotate based on the requested annotation types
+ *
+ * @param vc
+ * @param readsContext Reads overlapping the current variant. Will be an empty, but non-null, context object
+ * if there is no backing source of reads data (in which case all queries on it will return
+ * an empty array/iterator)
+ * @param refContext
+ * @param fc
+ */
+ @Override
+ public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext refContext, final FeatureContext fc) {
+
+ // if the reference is present and base is not ambiguous, we can annotate
+ if (refContext.getBases().length ==0 || BaseUtils.simpleBaseToBaseIndex(refContext.getBase()) != -1 ) {
+ //TODO remove this filter and update the tests, we limit reads starting within a spanning deletion to match GATK3 at the expense of matching haplotype caller
+ List reads = new ArrayList<>();
+ readsContext.iterator().forEachRemaining(r -> {if (r.getStart()<=vc.getStart()) {reads.add(r);}});
+
+ ReadLikelihoods likelihoods = new UnfilledReadsLikelihoods<>( variantSamples, new IndexedAlleleList<>(vc.getAlleles()),
+ AssemblyBasedCallerUtils.splitReadsBySample(variantSamples, getHeaderForReads(), reads));
+
+ VariantContext annotatedVC = annotatorEngine.annotateContext(vc, fc, refContext, likelihoods, a -> true);
+ vcfWriter.add(annotatedVC);
+
+ } else {
+ vcfWriter.add(vc);
+ }
+ }
+
+ public List getDefaultReadFilters() {
+ return Lists.newArrayList( new WellformedReadFilter(),
+ ReadFilterLibrary.NOT_DUPLICATE,
+ ReadFilterLibrary.PRIMARY_ALIGNMENT,
+ ReadFilterLibrary.PASSES_VENDOR_QUALITY_CHECK,
+ ReadFilterLibrary.MAPPED);
+ }
+
+ /**
+ * Tell the user the number of loci processed and close out the new variants file.
+ */
+ @Override
+ public Object onTraversalSuccess() {
+ vcfWriter.close();
+ return null;
+ }
+
+ //TODO need?
+ public final List getHeaderForResourceVariants() {
+ final List headers = Lists.newArrayList(getHeaderForVariants());
+
+ headers.addAll(resources.stream().map(this::getHeaderForFeatures).map(he -> {
+ if ( ! (he instanceof VCFHeader) ) {
+ throw new GATKException("Header for a resources file is not in VCF header format");
+ } else return (VCFHeader) he;
+ }).collect(Collectors.toList()));
+ return headers;
+ }
+}
+
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java
index 993bbbaa71a..23d79f28bd8 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java
@@ -9,11 +9,14 @@
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotation;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotationData;
import org.broadinstitute.hellbender.utils.ClassUtils;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
+import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.reflections.ReflectionUtils;
import java.util.*;
@@ -30,16 +33,21 @@ public final class VariantAnnotatorEngine {
private final List infoAnnotations;
private final List genotypeAnnotations;
private Set reducibleKeys;
+ private List expressions = new ArrayList<>();
private final VariantOverlapAnnotator variantOverlapAnnotator;
+ private Boolean expressionAlleleConcordance;
+ private final Boolean useRawAnnotations;
private VariantAnnotatorEngine(final AnnotationManager annots,
final FeatureInput dbSNPInput,
- final List> featureInputs){
+ final List> featureInputs,
+ final boolean useRaw){
infoAnnotations = annots.createInfoFieldAnnotations();
genotypeAnnotations = annots.createGenotypeAnnotations();
variantOverlapAnnotator = initializeOverlapAnnotator(dbSNPInput, featureInputs);
reducibleKeys = new HashSet<>();
+ useRawAnnotations = useRaw;
for (InfoFieldAnnotation annot : infoAnnotations) {
if (annot instanceof ReducibleAnnotation) {
reducibleKeys.add(((ReducibleAnnotation) annot).getRawKeyName());
@@ -59,10 +67,11 @@ private VariantAnnotatorEngine(final AnnotationManager annots,
*/
public static VariantAnnotatorEngine ofAllMinusExcluded(final List annotationsToExclude,
final FeatureInput dbSNPInput,
- final List> comparisonFeatureInputs) {
+ final List> comparisonFeatureInputs,
+ final Boolean useRawAnnotations) {
Utils.nonNull(annotationsToExclude, "annotationsToExclude is null");
Utils.nonNull(comparisonFeatureInputs, "comparisonFeatureInputs is null");
- return new VariantAnnotatorEngine(AnnotationManager.ofAllMinusExcluded(annotationsToExclude), dbSNPInput, comparisonFeatureInputs);
+ return new VariantAnnotatorEngine(AnnotationManager.ofAllMinusExcluded(annotationsToExclude), dbSNPInput, comparisonFeatureInputs, useRawAnnotations);
}
/**
@@ -86,7 +95,33 @@ public static VariantAnnotatorEngine ofSelectedMinusExcluded(final List
Utils.nonNull(annotationsToUse, "annotationsToUse is null");
Utils.nonNull(annotationsToExclude, "annotationsToExclude is null");
Utils.nonNull(comparisonFeatureInputs, "comparisonFeatureInputs is null");
- return new VariantAnnotatorEngine(AnnotationManager.ofSelectedMinusExcluded(annotationGroupsToUse, annotationsToUse, annotationsToExclude), dbSNPInput, comparisonFeatureInputs);
+ return new VariantAnnotatorEngine(AnnotationManager.ofSelectedMinusExcluded(annotationGroupsToUse, annotationsToUse, annotationsToExclude), dbSNPInput, comparisonFeatureInputs, false);
+ }
+
+ /**
+ * Makes the engine for given annotation types and groups (minus the excluded ones).
+ * @param annotationGroupsToUse list of annotations groups to include
+ * @param annotationsToUse list of of annotations to include
+ * @param annotationsToExclude list of annotations to exclude
+ * @param dbSNPInput input for variants from a known set from DbSNP or null if not provided.
+ * The annotation engine will mark variants overlapping anything in this set using {@link htsjdk.variant.vcf.VCFConstants#DBSNP_KEY}.
+ * @param comparisonFeatureInputs list of inputs with known variants.
+ * The annotation engine will mark variants overlapping anything in those sets using the name given by {@link FeatureInput#getName()}.
+ * Note: the DBSNP FeatureInput should be passed in separately, and not as part of this List - an GATKException will be thrown otherwise.
+ * Note: there are no non-DBSNP comparison FeatureInputs an empty List should be passed in here, rather than null.
+ * @param useRawAnnotations Specify whether the annotation engine will attempt to output raw annotations for reducible annotations
+ */
+ public static VariantAnnotatorEngine ofSelectedMinusExcluded(final List annotationGroupsToUse,
+ final List annotationsToUse,
+ final List annotationsToExclude,
+ final FeatureInput dbSNPInput,
+ final List> comparisonFeatureInputs,
+ final Boolean useRawAnnotations) {
+ Utils.nonNull(annotationGroupsToUse, "annotationGroupsToUse is null");
+ Utils.nonNull(annotationsToUse, "annotationsToUse is null");
+ Utils.nonNull(annotationsToExclude, "annotationsToExclude is null");
+ Utils.nonNull(comparisonFeatureInputs, "comparisonFeatureInputs is null");
+ return new VariantAnnotatorEngine(AnnotationManager.ofSelectedMinusExcluded(annotationGroupsToUse, annotationsToUse, annotationsToExclude), dbSNPInput, comparisonFeatureInputs, useRawAnnotations);
}
/**
@@ -103,11 +138,12 @@ public static VariantAnnotatorEngine ofSelectedMinusExcluded(final List
*/
public static VariantAnnotatorEngine ofSelectedMinusExcluded(final VariantAnnotationArgumentCollection argumentCollection,
final FeatureInput dbSNPInput,
- final List> comparisonFeatureInputs) {
+ final List> comparisonFeatureInputs,
+ final Boolean useRawAnnotations) {
return ofSelectedMinusExcluded(argumentCollection.annotationGroupsToUse,
argumentCollection.annotationsToUse,
argumentCollection.annotationsToExclude,
- dbSNPInput, comparisonFeatureInputs);
+ dbSNPInput, comparisonFeatureInputs, useRawAnnotations);
}
private VariantOverlapAnnotator initializeOverlapAnnotator(final FeatureInput dbSNPInput, final List> featureInputs) {
final Map, String> overlaps = new LinkedHashMap<>();
@@ -246,6 +282,7 @@ public VariantContext finalizeAnnotations(VariantContext vc, VariantContext orig
* @param ref the reference context of the variant to annotate or null if there is none
* @param likelihoods likelihoods indexed by sample, allele, and read within sample. May be null
* @param addAnnot function that indicates if the given annotation type should be added to the variant
+ *
*/
public VariantContext annotateContext(final VariantContext vc,
final FeatureContext features,
@@ -261,9 +298,16 @@ public VariantContext annotateContext(final VariantContext vc,
final VariantContext newGenotypeAnnotatedVC = builder.make();
final Map infoAnnotMap = new LinkedHashMap<>(newGenotypeAnnotatedVC.getAttributes());
+ annotateExpressions(vc, features, ref, infoAnnotMap);
+
for ( final InfoFieldAnnotation annotationType : this.infoAnnotations) {
if (addAnnot.test(annotationType)){
- final Map annotationsFromCurrentType = annotationType.annotate(ref, newGenotypeAnnotatedVC, likelihoods);
+ final Map annotationsFromCurrentType;
+ if (useRawAnnotations && annotationType instanceof ReducibleAnnotation) {
+ annotationsFromCurrentType = ((ReducibleAnnotation) annotationType).annotateRawData(ref, newGenotypeAnnotatedVC, likelihoods);
+ } else {
+ annotationsFromCurrentType = annotationType.annotate(ref, newGenotypeAnnotatedVC, likelihoods);
+ }
if ( annotationsFromCurrentType != null ) {
infoAnnotMap.putAll(annotationsFromCurrentType);
}
@@ -421,5 +465,189 @@ private List filterAnnotations(final List al
}
}
+ protected static class VAExpression {
+
+ public String fullName, fieldName;
+ public FeatureInput binding;
+ public VCFInfoHeaderLine hInfo;
+
+ public VAExpression(String fullExpression, List> dataSourceList){
+ final int indexOfDot = fullExpression.lastIndexOf(".");
+ if ( indexOfDot == -1 ) {
+ throw new UserException.BadInput("The requested expression '"+fullExpression+"' is invalid, it should be in VCFFile.value format");
+ }
+
+ fullName = fullExpression;
+ fieldName = fullExpression.substring(indexOfDot+1);
+
+ final String bindingName = fullExpression.substring(0, indexOfDot);
+ for ( final FeatureInput ds : dataSourceList ) {
+ if ( ds.getName().equals(bindingName) ) {
+ binding = ds;
+ break;
+ }
+ }
+ }
+ }
+
+ protected List getRequestedExpressions() { return expressions; }
+
+ // select specific expressions to use
+ public void addExpressions(Set expressionsToUse, List> dataSources, boolean expressionAlleleConcordance) {//, Set) {
+ // set up the expressions
+ for ( final String expression : expressionsToUse ) {
+ expressions.add(new VAExpression(expression, dataSources));
+ }
+ this.expressionAlleleConcordance = expressionAlleleConcordance;
+ }
+
+ /**
+ * Handles logic for expressions for variant contexts. Used to add annotations from one vcf file into the fields
+ * of another if the variant contexts match sufficiently between the two files.
+ *
+ * @param vc VariantContext to add annotations to
+ * @param features FeatureContext object containing extra VCF features to add to vc
+ * @param ref Reference context object corresponding to the region overlapping vc
+ * @param attributes running list of attributes into which to place new annotations
+ */
+ private void annotateExpressions(final VariantContext vc,
+ final FeatureContext features,
+ final ReferenceContext ref,
+ final Map attributes){
+ Utils.nonNull(vc);
+
+ // each requested expression
+ for ( final VAExpression expression : expressions ) {
+ List variantContexts = features.getValues(expression.binding, vc.getStart());
+
+ if (!variantContexts.isEmpty()) {
+ // get the expression's variant context
+ VariantContext expressionVC = variantContexts.iterator().next();
+
+ // special-case the ID field
+ if (expression.fieldName.equals("ID")) {
+ if (expressionVC.hasID()) {
+ attributes.put(expression.fullName, expressionVC.getID());
+ }
+ } else if (expression.fieldName.equals("ALT")) {
+ attributes.put(expression.fullName, expressionVC.getAlternateAllele(0).getDisplayString());
+ } else if (expression.fieldName.equals("FILTER")) {
+ if (expressionVC.isFiltered()) {
+ attributes.put(expression.fullName, expressionVC.getFilters().toString().replace("[", "").replace("]", "").replace(" ", ""));
+ } else {
+ attributes.put(expression.fullName, "PASS");
+ }
+ } else if (expressionVC.hasAttribute(expression.fieldName)) {
+ // find the info field
+ final VCFInfoHeaderLine hInfo = expression.hInfo;
+ if (hInfo == null) {
+ throw new UserException("Cannot annotate expression " + expression.fullName + " at " + ref.getInterval() + " for variant allele(s) " + vc.getAlleles() + ", missing header info");
+ }
+
+ //
+ // Add the info field annotations
+ //
+ final boolean useRefAndAltAlleles = VCFHeaderLineCount.R == hInfo.getCountType();
+ final boolean useAltAlleles = VCFHeaderLineCount.A == hInfo.getCountType();
+
+ // Annotation uses ref and/or alt alleles or enforce allele concordance
+ if ((useAltAlleles || useRefAndAltAlleles) || expressionAlleleConcordance) {
+
+ // remove brackets and spaces from expression value
+ final String cleanedExpressionValue = expressionVC.getAttribute(expression.fieldName,"").toString().replaceAll("[\\[\\]\\s]", "");
+
+ // get comma separated expression values
+ final ArrayList expressionValuesList = new ArrayList<>(Arrays.asList(cleanedExpressionValue.split(",")));
+
+ boolean canAnnotate = false;
+ // get the minimum biallelics without genotypes
+
+ final List minBiallelicVCs = getMinRepresentationBiallelics(vc);
+ final List minBiallelicExprVCs = getMinRepresentationBiallelics(expressionVC);
+
+ // check concordance
+ final List annotationValues = new ArrayList<>();
+ for (final VariantContext biallelicVC : minBiallelicVCs) {
+ // check that ref and alt alleles are the same
+ List exprAlleles = biallelicVC.getAlleles();
+ boolean isAlleleConcordant = false;
+ int i = 0;
+ for (final VariantContext biallelicExprVC : minBiallelicExprVCs) {
+ List alleles = biallelicExprVC.getAlleles();
+ // concordant
+ if (alleles.equals(exprAlleles)) {
+ // get the value for the reference if needed.
+ if (i == 0 && useRefAndAltAlleles) {
+ annotationValues.add(expressionValuesList.get(i++));
+ }
+ // use annotation expression and add to vc
+ annotationValues.add(expressionValuesList.get(i));
+ isAlleleConcordant = true;
+ canAnnotate = true;
+ break;
+ }
+ i++;
+ }
+
+ // can not find allele match so set to annotation value to zero
+ if (!isAlleleConcordant) {
+ annotationValues.add("0");
+ }
+ }
+
+ // some allele matches so add the annotation values
+ if (canAnnotate) {
+ attributes.put(expression.fullName, annotationValues);
+ }
+ } else {
+ // use all of the expression values
+ attributes.put(expression.fullName, expressionVC.getAttribute(expression.fieldName));
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Break the variant context into bialleles (reference and alternate alleles) and trim to a minimum representation
+ *
+ * @param vc variant context to annotate
+ * @return list of biallelics trimmed to a minimum representation
+ */
+ private List getMinRepresentationBiallelics(final VariantContext vc) {
+ final List minRepresentationBiallelicVCs = new ArrayList<>();
+ if (vc.getNAlleles() > 2) {
+ // TODO, this doesn't actually need to be done, we can simulate it at less cost
+ for (int i = 1; i < vc.getNAlleles(); i++) {
+ // Determining if the biallelic would have been considered a SNP
+ if (! (vc.getReference().length() == 1 && vc.getAlternateAllele(i-1).length() == 1) ) {
+ minRepresentationBiallelicVCs.add(GATKVariantContextUtils.trimAlleles(
+ new VariantContextBuilder(vc)
+ .alleles(Arrays.asList(vc.getReference(),vc.getAlternateAllele(i-1)))
+ .attributes(removeIrrelevantAttributes(vc.getAttributes())).make(), true, true));
+ } else {
+ minRepresentationBiallelicVCs.add(new VariantContextBuilder(vc)
+ .alleles(Arrays.asList(vc.getReference(),vc.getAlternateAllele(i-1)))
+ .attributes(removeIrrelevantAttributes(vc.getAttributes())).make());
+ }
+ }
+ } else {
+ minRepresentationBiallelicVCs.add(vc);
+ }
+
+ return minRepresentationBiallelicVCs;
+ }
+
+ private Map removeIrrelevantAttributes(Map attributes) {
+ // since the VC has been subset, remove the invalid attributes
+ Map ret = new HashMap<>(attributes);
+ for ( final String key : attributes.keySet() ) {
+ if ( !(key.equals(VCFConstants.ALLELE_COUNT_KEY) || key.equals(VCFConstants.ALLELE_FREQUENCY_KEY) || key.equals(VCFConstants.ALLELE_NUMBER_KEY)) ) {
+ ret.remove(key);
+ }
+ }
+
+ return ret;
+ }
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RMSMappingQuality.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RMSMappingQuality.java
index 9184f057a75..7d80dd3ce8f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RMSMappingQuality.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RMSMappingQuality.java
@@ -13,6 +13,7 @@
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
+import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
@@ -49,7 +50,7 @@ public final class AS_RMSMappingQuality extends InfoFieldAnnotation implements A
private final String printFormat = "%.2f";
- private static final Logger logger = Logger.getLogger(AS_RMSMappingQuality.class);
+ private static final OneShotLogger logger = new OneShotLogger(AS_RMSMappingQuality.class);
public static final String SPLIT_DELIM = "\\|"; //String.split takes a regex, so we need to escape the pipe
public static final String PRINT_DELIM = "|";
@@ -58,26 +59,51 @@ public final class AS_RMSMappingQuality extends InfoFieldAnnotation implements A
public Map annotate(final ReferenceContext ref,
final VariantContext vc,
final ReadLikelihoods likelihoods) {
- return annotateRawData(ref, vc, likelihoods);
+ Utils.nonNull(vc);
+ if ( likelihoods == null) {
+ return Collections.emptyMap();
+ }
+ final Map annotations = new HashMap<>();
+ final ReducibleAnnotationData myData = new ReducibleAnnotationData<>(null);
+ getRMSDataFromLikelihoods(likelihoods, myData);;
+ final String annotationString = makeFinalizedAnnotationString(vc, myData.getAttributeMap());
+ annotations.put(getKeyNames().get(0), annotationString);
+ return annotations;
}
+
@Override
public Map annotateRawData(final ReferenceContext ref,
final VariantContext vc,
final ReadLikelihoods likelihoods ) {
Utils.nonNull(vc);
- if ( likelihoods == null) {
+ if ( likelihoods == null || !likelihoods.hasFilledLiklihoods()) {
return Collections.emptyMap();
}
final Map annotations = new LinkedHashMap<>();
final ReducibleAnnotationData myData = new ReducibleAnnotationData<>(null);
- calculateRawData(vc, likelihoods, myData);
+ getRMSDataFromLikelihoods(likelihoods, myData);;
final String annotationString = makeRawAnnotationString(vc.getAlleles(), myData.getAttributeMap());
annotations.put(getRawKeyName(), annotationString);
return annotations;
}
+ @SuppressWarnings({"unchecked", "rawtypes"})//FIXME
+ public void calculateRawData(final VariantContext vc,
+ final ReadLikelihoods likelihoods,
+ final ReducibleAnnotationData myData){
+ //For the raw data here, we're only keeping track of the sum of the squares of our values
+ //When we go to reduce, we'll use the AD info to get the number of reads
+
+ //must use likelihoods for allele-specific annotations
+ if (likelihoods == null) {
+ return;
+ }
+ getRMSDataFromLikelihoods(likelihoods, myData);
+ }
+
+
/**
* For AS_RMSMappingQuality annotations, the annotations will simply consist of a list of the total value for
* every allele computed by parsing all of the individual AS_RMSMappingQuality Raw Key values as doubles
@@ -152,19 +178,6 @@ public Map finalizeRawData(final VariantContext vc, final Varian
return annotations;
}
- @SuppressWarnings({"unchecked", "rawtypes"})//FIXME
- public void calculateRawData(final VariantContext vc,
- final ReadLikelihoods likelihoods,
- final ReducibleAnnotationData myData){
- //For the raw data here, we're only keeping track of the sum of the squares of our values
- //When we go to reduce, we'll use the AD info to get the number of reads
-
- //must use likelihoods for allele-specific annotations
- if (likelihoods == null) {
- return;
- }
- getRMSDataFromLikelihoods(likelihoods, myData);
- }
@Override
public List getKeyNames() { return Arrays.asList(GATKVCFConstants.AS_RMS_MAPPING_QUALITY_KEY); }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java
index f1c39989edb..1ba25693870 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java
@@ -23,13 +23,6 @@ public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnn
public static final String RAW_DELIM = ",";
public static final String REDUCED_DELIM = ",";
- @Override
- public Map annotate(final ReferenceContext ref,
- final VariantContext vc,
- final ReadLikelihoods likelihoods) {
- return annotateRawData(ref, vc, likelihoods);
- }
-
/**
* Generates an annotation by calling the client implementation of getElementForRead(GATKRead read) over each read
* given its best assigned allele and returns the value of the allele as a double. This data gets condensed into a
@@ -45,7 +38,7 @@ public Map annotate(final ReferenceContext ref,
public Map annotateRawData(final ReferenceContext ref,
final VariantContext vc,
final ReadLikelihoods likelihoods ) {
- if ( likelihoods == null) {
+ if ( likelihoods == null || !likelihoods.hasFilledLiklihoods()) {
return Collections.emptyMap();
}
@@ -112,13 +105,10 @@ protected String makeRawAnnotationString(final List vcAlleles, final Map
// Generates as CompressedDataList over integer values over each read
@SuppressWarnings({"unchecked", "rawtypes"})//FIXME generics here blow up
- public void calculateRawData(VariantContext vc, final ReadLikelihoods likelihoods, ReducibleAnnotationData myData) {
+ private void calculateRawData(VariantContext vc, final ReadLikelihoods likelihoods, ReducibleAnnotationData myData) {
if( vc.getGenotypes().getSampleNames().size() != 1) {
throw new IllegalStateException("Calculating raw data for allele-specific rank sums requires variant context input with exactly one sample, as in a gVCF.");
}
- if(likelihoods == null) {
- return;
- }
final int refLoc = vc.getStart();
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java
index 552a026b256..274b25d63b9 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java
@@ -8,6 +8,7 @@
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.StrandBiasTest;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
+import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
@@ -36,13 +37,6 @@ public AS_StrandBiasTest(){
@Override
public String getRawKeyName() { return GATKVCFConstants.AS_SB_TABLE_KEY; }
- @Override
- public Map annotate(final ReferenceContext ref,
- final VariantContext vc,
- final ReadLikelihoods likelihoods) {
- return annotateRawData(ref, vc, likelihoods);
- }
-
/**
* Method which determines how the Strand Bias read direction allele data must be combined into a final annotation
* Must be overridden by client methods.
@@ -68,14 +62,14 @@ public Map annotateRawData(final ReferenceContext ref,
final ReadLikelihoods likelihoods ) {
//for allele-specific annotations we only call from HC and we only use likelihoods
- if ( likelihoods == null) {
+ if ( likelihoods == null || !likelihoods.hasFilledLiklihoods()) {
return Collections.emptyMap();
}
// calculate the annotation from the likelihoods
// likelihoods can come from HaplotypeCaller call to VariantAnnotatorEngine
final Map annotations = new HashMap<>();
final ReducibleAnnotationData> myData = new AlleleSpecificAnnotationData<>(vc.getAlleles(),null);
- calculateRawData(vc, likelihoods, myData);
+ getStrandCountsFromLikelihoodMap(vc, likelihoods, myData, MIN_COUNT);
final Map> perAlleleValues = myData.getAttributeMap();
final String annotationString = makeRawAnnotationString(vc.getAlleles(), perAlleleValues);
annotations.put(getRawKeyName(), annotationString);
@@ -201,18 +195,6 @@ protected void parseRawDataString(ReducibleAnnotationData> myData)
myData.setAttributeMap(perAlleleValues);
}
-
- @SuppressWarnings({"unchecked", "rawtypes"})//FIXME
- public void calculateRawData(final VariantContext vc,
- final ReadLikelihoods likelihoods,
- final ReducibleAnnotationData rawAnnotations) {
- if(likelihoods == null) {
- return;
- }
-
- getStrandCountsFromLikelihoodMap(vc, likelihoods, rawAnnotations, MIN_COUNT);
- }
-
/**
Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this:
* fw rc
@@ -308,6 +290,13 @@ protected Map calculateAnnotationFromGTfield(final GenotypesCont
return Collections.emptyMap();
}
+ @Override
+ //Allele-specific annotations cannot be called from walkers other than HaplotypeCaller
+ protected Map calculateAnnotationFromStratifiedContexts(final Map> stratifiedContexts,
+ final VariantContext vc){
+ return new HashMap<>();
+ }
+
public static String rawValueAsString(int[][] table) {
return table[0][0]+","+table[0][1]+ PRINT_DELIM +table[1][0]+","+table[1][1];
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java
index 093c6eb5786..68e90fee90b 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java
@@ -172,7 +172,7 @@ private void initialize(boolean createBamOutIndex, final boolean createBamOutMD5
initializeActiveRegionEvaluationGenotyperEngine();
if (annotationEngine == null) {
- annotationEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(hcArgs.variantAnnotationArgumentCollection, hcArgs.dbsnp.dbsnp, hcArgs.comps);
+ annotationEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(hcArgs.variantAnnotationArgumentCollection, hcArgs.dbsnp.dbsnp, hcArgs.comps, emitReferenceConfidence());
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java
index d56672daac8..664748c8ea1 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java
@@ -114,7 +114,8 @@ private void initialize(final boolean createBamOutIndex, final boolean createBam
annotationEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(MTAC.variantAnnotationArgumentCollection,
MTAC.dbsnp.dbsnp,
- MTAC.comps);
+ MTAC.comps,
+ false);
assemblyEngine = AssemblyBasedCallerUtils.createReadThreadingAssembler(MTAC);
likelihoodCalculationEngine = AssemblyBasedCallerUtils.createLikelihoodCalculationEngine(MTAC.likelihoodArgs);
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/StripAnnotations.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/StripAnnotations.java
new file mode 100644
index 00000000000..eff31da2702
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/StripAnnotations.java
@@ -0,0 +1,4 @@
+package org.broadinstitute.hellbender.tools.walkers.variantutils;
+
+public class StripAnnotations {
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/genotyper/ReadLikelihoods.java b/src/main/java/org/broadinstitute/hellbender/utils/genotyper/ReadLikelihoods.java
index 19b0a099dc7..63fc77054cc 100644
--- a/src/main/java/org/broadinstitute/hellbender/utils/genotyper/ReadLikelihoods.java
+++ b/src/main/java/org/broadinstitute/hellbender/utils/genotyper/ReadLikelihoods.java
@@ -8,10 +8,13 @@
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import org.apache.commons.collections.ListUtils;
import org.apache.commons.math3.stat.descriptive.rank.Median;
+import org.broadinstitute.hellbender.tools.walkers.qc.Pileup;
import org.broadinstitute.hellbender.utils.IndexRange;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.downsampling.AlleleBiasedDownsamplingUtils;
+import org.broadinstitute.hellbender.utils.pileup.PileupElement;
+import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import java.util.*;
@@ -27,7 +30,7 @@
*
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
*/
-public final class ReadLikelihoods implements SampleList, AlleleList {
+public class ReadLikelihoods implements SampleList, AlleleList {
/**
* Index indicaintg that the reference allele is missing.
@@ -37,7 +40,7 @@ public final class ReadLikelihoods implements SampleList, Alle
/**
* Reads by sample index. Each sub array contains reference to the reads of the ith sample.
*/
- private final GATKRead[][] readsBySampleIndex;
+ protected final GATKRead[][] readsBySampleIndex;
/**
* Indexed per sample, allele and finally read (within sample).
@@ -45,17 +48,17 @@ public final class ReadLikelihoods implements SampleList, Alle
* valuesBySampleIndex[s][a][r] == lnLk(R_r | A_a) where R_r comes from Sample s.
*