Port of GATK3 version of VariantAnnotator with the necessary changes to the annotation engine and client annotations to support annotating variants independent of the haplotype caller.

jamesemery committed Nov 27, 2017
1 parent fe47cf6 commit 962c95e
Showing 104 changed files with 4,645 additions and 129 deletions.
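The change threading through these files is a new fourth argument to VariantAnnotatorEngine.ofSelectedMinusExcluded: a boolean that, judging from the call sites, flags GVCF (reference-confidence) mode so the engine knows whether to emit raw, reducible annotation data. A minimal sketch of the two call shapes that appear in the hunks below (the local variable names are illustrative, not from the diff):

    // In HaplotypeCallerSpark: emit raw annotation data only when writing a GVCF
    final VariantAnnotatorEngine sparkEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(
            hcArgs.variantAnnotationArgumentCollection,
            hcArgs.dbsnp.dbsnp,
            hcArgs.comps,
            hcArgs.emitReferenceConfidence == ReferenceConfidenceMode.GVCF);

    // In a walker that finalizes annotations: never raw
    final VariantAnnotatorEngine walkerEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(
            variantAnnotationArgumentCollection,
            dbsnp.dbsnp,
            Collections.emptyList(),
            false);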
@@ -46,7 +46,7 @@ public final class WellformedReadFilter extends ReadFilter {
private ReadFilter wellFormedFilter = null;

// Command line parser requires a no-arg constructor
- public WellformedReadFilter() {
+ public WellformedReadFilter() {
}

@Override
@@ -207,7 +207,7 @@ public static JavaRDD<VariantContext> callVariantsWithHaplotypeCaller(
final Broadcast<ReferenceMultiSource> referenceBroadcast = ctx.broadcast(reference);
final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast = ctx.broadcast(hcArgs);

- final VariantAnnotatorEngine variantAnnotatorEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(hcArgs.variantAnnotationArgumentCollection, hcArgs.dbsnp.dbsnp, hcArgs.comps);
+ final VariantAnnotatorEngine variantAnnotatorEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(hcArgs.variantAnnotationArgumentCollection, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence == ReferenceConfidenceMode.GVCF);
final Broadcast<VariantAnnotatorEngine> annotatorEngineBroadcast = ctx.broadcast(variantAnnotatorEngine);

final List<ShardBoundary> shardBoundaries = getShardBoundaries(header, intervals, shardingArgs.readShardSize, shardingArgs.readShardPadding);
@@ -148,7 +148,7 @@ public void onTraversalStart() {

final SampleList samples = new IndexedSampleList(inputVCFHeader.getGenotypeSamples()); //todo should this be getSampleNamesInOrder?

- annotationEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(variantAnnotationArgumentCollection, dbsnp.dbsnp, Collections.emptyList());
+ annotationEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(variantAnnotationArgumentCollection, dbsnp.dbsnp, Collections.emptyList(), false);

// We only want the engine to generate the AS_QUAL key if we are using AlleleSpecific annotations.
genotypingEngine = new MinimalGenotypingEngine(createUAC(), samples, new GeneralPloidyFailOverAFCalculatorProvider(genotypeArgs), annotationEngine.isRequestedReducibleRawKey(GATKVCFConstants.AS_QUAL_KEY));
@@ -1,6 +1,13 @@
package org.broadinstitute.hellbender.tools.walkers.annotator;

import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMRecord;
import org.apache.commons.lang.StringUtils;
import org.broadinstitute.hellbender.utils.pairhmm.PairHMM;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.ArrayList;
import java.util.List;
@@ -30,4 +37,99 @@ public static String encodeStringList( final List<String> stringList) {
return StringUtils.join(stringList, ",");
}

/**
* Get the position of a variant within a read with respect to the closer end, accounting for hard clipped bases and low quality ends
* Used by ReadPosRankSum annotations
*
* @param read a read containing the variant
* @param initialReadPosition the position based on the modified, post-hard-clipped CIGAR
* @return read position
*/
public static int getFinalVariantReadPosition(final GATKRead read, final int initialReadPosition) {
final int numAlignedBases = getNumAlignedBases(read);

int readPos = initialReadPosition;
//TODO: this doesn't work for the middle-right position if we index from zero
if (initialReadPosition > numAlignedBases / 2) {
readPos = numAlignedBases - (initialReadPosition + 1);
}
return readPos;

}
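// Illustrative example (not in the original commit): for a read with
// getNumAlignedBases(read) == 100, a variant at offset 80 falls in the right
// half, so this returns 100 - (80 + 1) = 19, its distance from the right end;
// an offset of 30 is already measured from the closer (left) end and is
// returned unchanged. The TODO above refers to the off-by-one at the exact
// midpoint under zero-based indexing.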

/**
*
* @param read a read containing the variant
* @return the number of hard clipped and low qual bases at the read start (where start is the leftmost end w.r.t. the reference)
*/
public static int getNumClippedBasesAtStart(final GATKRead read) {
// check for hard clips (never consider these bases):
final Cigar c = read.getCigar();
final CigarElement first = c.getCigarElement(0);

int numStartClippedBases = 0;
if (first.getOperator() == CigarOperator.H) {
numStartClippedBases = first.getLength();
}
final byte[] unclippedReadBases = read.getBases();
final byte[] unclippedReadQuals = read.getBaseQualities();

// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
// and may leave a string of Q2 bases still hanging off the reads.
//TODO: this code may not even get used because HaplotypeCaller already hard clips low quality tails
for (int i = numStartClippedBases; i < unclippedReadBases.length; i++) {
if (unclippedReadQuals[i] < 20) { //TODO the 20 hard value here is in lieu of PairHMM.BASE_QUALITY_SCORE_THRESHOLD in order to directly match GATK3 output
numStartClippedBases++;
} else {
break;
}
}

return numStartClippedBases;
}


/**
*
* @param read a read containing the variant
* @return number of non-hard clipped, aligned bases (excluding low quality bases at either end)
*/
//TODO: this is bizarre -- this code counts hard clips, but then subtracts them from the read length, which already doesn't count hard clips
public static int getNumAlignedBases(final GATKRead read) {
return read.getLength() - getNumClippedBasesAtStart(read) - getNumClippedBasesAtEnd(read);
}


/**
*
* @param read a read containing the variant
* @return number of hard clipped and low qual bases at the read end (where end is right end w.r.t. the reference)
*/
public static int getNumClippedBasesAtEnd(final GATKRead read) {
// check for hard clips (never consider these bases):
final Cigar c = read.getCigar();
CigarElement last = c.getCigarElement(c.numCigarElements() - 1);

int numEndClippedBases = 0;
if (last.getOperator() == CigarOperator.H) {
numEndClippedBases = last.getLength();
}
final byte[] unclippedReadBases = read.getBases();
final byte[] unclippedReadQuals = read.getBaseQualities();

// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
// and may leave a string of Q2 bases still hanging off the reads.
//TODO: this code may not even get used because HaplotypeCaller already hard clips low quality tails
for (int i = unclippedReadBases.length - numEndClippedBases - 1; i >= 0; i--) {
if (unclippedReadQuals[i] < 20) { //TODO the 20 hard value here is in lieu of PairHMM.BASE_QUALITY_SCORE_THRESHOLD in order to directly match GATK3 output

numEndClippedBases++;
} else {
break;
}
}

return numEndClippedBases;
}
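// Illustrative example (not in the original commit): for a 100bp read with no
// hard clips, three bases below Q20 at the left end and five at the right,
// getNumClippedBasesAtStart(read) == 3, getNumClippedBasesAtEnd(read) == 5,
// and getNumAlignedBases(read) == 100 - 3 - 5 == 92. As the TODO on
// getNumAlignedBases notes, hard clips are counted by these helpers even
// though read.getLength() already excludes them, so hard-clipped reads have
// those bases subtracted twice.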

}
@@ -12,6 +12,7 @@
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;

import java.util.*;
import java.util.stream.Collectors;
@@ -56,6 +57,43 @@ public void annotate(final ReferenceContext ref,
// make sure that there's a meaningful relationship between the alleles in the likelihoods and our VariantContext
Utils.validateArg(likelihoods.alleles().containsAll(alleles), () -> "VC alleles " + alleles + " not a subset of ReadLikelihoods alleles " + likelihoods.alleles());

int[] counts;
if (likelihoods.hasFilledLiklihoods()) {
counts = annotateWithLiklihoods(vc, g, alleles, likelihoods);
} else if (likelihoods.readCount()==0) {
return;
} else if (vc.isSNP()) {
counts = annotateWithPileup(vc, likelihoods.getStratifiedPileups(vc).get(g.getSampleName()));
} else {
counts = new int[vc.getNAlleles()];
}

gb.AD(counts);
}

private int[] annotateWithPileup(final VariantContext vc, List<PileupElement> pileupElements) {

final HashMap<Byte, Integer> alleleCounts = new HashMap<>();
for ( final Allele allele : vc.getAlleles() ) {
alleleCounts.put(allele.getBases()[0], 0);
}
for ( final PileupElement p : pileupElements) {
if ( alleleCounts.containsKey(p.getBase()) ) {
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase()) + 1);
}
}

// we need to add counts in the correct order
final int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference().getBases()[0]);
for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
counts[i + 1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
}
return counts;
}
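// Illustrative example (not in the original commit): for a biallelic SNP with
// ref A and alt G and a sample pileup of bases {A, A, G, G, G, T}, the map is
// seeded with A and G only, the stray T is ignored, and the resulting AD
// array is [2, 3], reference count first.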

private int[] annotateWithLiklihoods(VariantContext vc, Genotype g, Set<Allele> alleles, ReadLikelihoods<Allele> likelihoods) {

final Map<Allele, Integer> alleleCounts = new LinkedHashMap<>();
for ( final Allele allele : vc.getAlleles() ) {
alleleCounts.put(allele, 0);
@@ -72,7 +110,7 @@ public void annotate(final ReferenceContext ref,
counts[i + 1] = alleleCounts.get(vc.getAlternateAllele(i));
}

- gb.AD(counts);
+ return counts;
}

@Override
@@ -10,6 +10,7 @@
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

@@ -44,6 +45,7 @@ public final class FisherStrand extends StrandBiasTest implements StandardAnnota

static final double MIN_PVALUE = 1E-320;
private static final int MIN_COUNT = ARRAY_DIM;
private static final int MIN_QUAL_FOR_FILTERED_TEST = 17;

// how large do we want the normalized table to be? (ie, sum of all entries must be smaller that this)
private static final double TARGET_TABLE_SIZE = 200.0;
@@ -59,6 +61,14 @@ protected Map<String, Object> calculateAnnotationFromGTfield(final GenotypesCont
return ( tableFromPerSampleAnnotations != null )? annotationForOneTable(pValueForContingencyTable(tableFromPerSampleAnnotations)) : null;
}

@Override
protected Map<String, Object> calculateAnnotationFromStratifiedContexts(final Map<String, List<PileupElement>> stratifiedContexts,
final VariantContext vc){
final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAlleles(), -1, MIN_COUNT);
final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAlleles(), MIN_QUAL_FOR_FILTERED_TEST, MIN_COUNT);
return annotationForOneTable(Math.max(pValueForContingencyTable(tableFiltering), pValueForContingencyTable(tableNoFiltering)));
}
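// Illustrative note (not in the original commit): each table is the usual 2x2
// strand-bias layout,
//             forward   reverse
//     ref       n00       n01
//     alt       n10       n11
// built once with no quality floor (-1) and once counting only bases at Q17
// and above (assuming getSNPContingencyTable treats its threshold as a
// minimum base quality). Taking the larger of the two p-values is the
// conservative choice: the annotation only shows strong strand bias when
// both tables support it.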

@Override
protected Map<String, Object> calculateAnnotationFromLikelihoods(final ReadLikelihoods<Allele> likelihoods,
final VariantContext vc){
@@ -85,8 +85,11 @@ public Map<String, Object> annotate(final ReferenceContext ref,
ADrestrictedDepth += totalADdepth;
}
depth += totalADdepth;
+ continue;
}
- } else if (likelihoods != null) {
+ }

+ if (likelihoods != null) {
depth += likelihoods.sampleReadCount(likelihoods.indexOfSample(genotype.getSampleName()));
} else if ( genotype.hasDP() ) {
depth += genotype.getDP();
@@ -114,21 +114,21 @@ public Map<String, Object> finalizeRawData(final VariantContext vc, final Varian
ReducibleAnnotationData myData = new ReducibleAnnotationData(rawMQdata);
parseRawDataString(myData);

- String annotationString = makeFinalizedAnnotationString(vc, myData.getAttributeMap());
+ String annotationString = makeFinalizedAnnotationString(getNumOfReads(vc), myData.getAttributeMap());
return Collections.singletonMap(getKeyNames().get(0), (Object)annotationString);
}

- public String makeFinalizedAnnotationString(final VariantContext vc, final Map<Allele, Number> perAlleleData) {
-     int numOfReads = getNumOfReads(vc);
+ public String makeFinalizedAnnotationString(final int numOfReads, final Map<Allele, Number> perAlleleData) {
return String.format("%.2f", Math.sqrt((double)perAlleleData.get(Allele.NO_CALL)/numOfReads));
}


public void combineAttributeMap(ReducibleAnnotationData<Number> toAdd, ReducibleAnnotationData<Number> combined) {
- if (combined.getAttribute(Allele.NO_CALL) != null)
+ if (combined.getAttribute(Allele.NO_CALL) != null) {
combined.putAttribute(Allele.NO_CALL, (Double) combined.getAttribute(Allele.NO_CALL) + (Double) toAdd.getAttribute(Allele.NO_CALL));
- else
+ } else {
combined.putAttribute(Allele.NO_CALL, toAdd.getAttribute(Allele.NO_CALL));
+ }

}

@@ -150,7 +150,17 @@ public void calculateRawData(final VariantContext vc,
public Map<String, Object> annotate(final ReferenceContext ref,
final VariantContext vc,
final ReadLikelihoods<Allele> likelihoods) {
- return annotateRawData(ref, vc, likelihoods);
+ Utils.nonNull(vc);
+ if (likelihoods == null || likelihoods.readCount() < 1 ) {
+ return new HashMap<>();
+ }

+ final Map<String, Object> annotations = new HashMap<>();
+ final ReducibleAnnotationData<Number> myData = new ReducibleAnnotationData<>(null);
+ calculateRawData(vc, likelihoods, myData);
+ final String annotationString = makeFinalizedAnnotationString(getNumOfReads(vc, likelihoods), myData.getAttributeMap());
+ annotations.put(getKeyNames().get(0), annotationString);
+ return annotations;
}
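// Illustrative example (not in the original commit): RAW_MQ appears to hold
// the sum of squared per-read mapping qualities, keyed on Allele.NO_CALL. For
// 200 reads with a squared sum of 720,000 the finalized value is
// sqrt(720000 / 200) = sqrt(3600) = 60, formatted as "60.00".
// combineAttributeMap above simply adds these squared sums across the data
// being merged (e.g. 720,000 + 480,000 = 1,200,000) before the square root
// is taken.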

@VisibleForTesting
@@ -4,21 +4,24 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_RankSumTest;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_ReadPosRankSumTest;
import org.broadinstitute.hellbender.utils.MannWhitneyU;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.*;
import java.util.stream.Collectors;


/**
* Abstract root for all RankSum based annotations
*/
- public abstract class RankSumTest extends InfoFieldAnnotation {
+ public abstract class RankSumTest extends InfoFieldAnnotation implements Annotation {
protected static double INVALID_ELEMENT_FROM_READ = Double.NEGATIVE_INFINITY;
private boolean useDithering = true;

@@ -47,16 +50,32 @@ public Map<String, Object> annotate(final ReferenceContext ref,
final int refLoc = vc.getStart();

if( likelihoods != null) {
- for (final ReadLikelihoods<Allele>.BestAllele bestAllele : likelihoods.bestAlleles()) {
- final GATKRead read = bestAllele.read;
- final Allele allele = bestAllele.allele;
- if (bestAllele.isInformative() && isUsableRead(read, refLoc)) {
- final OptionalDouble value = getElementForRead(read, refLoc, bestAllele);
- // Bypass read if the clipping goal is not reached or the refloc is inside a spanning deletion
- if (value.isPresent() && value.getAsDouble() != INVALID_ELEMENT_FROM_READ) {
- if (allele.isReference()) {
+ // Default to using the likelihoods to calculate the rank sum
+ if (likelihoods.hasFilledLiklihoods()) {
+ for (final ReadLikelihoods<Allele>.BestAllele bestAllele : likelihoods.bestAlleles()) {
+ final GATKRead read = bestAllele.read;
+ final Allele allele = bestAllele.allele;
+ if (bestAllele.isInformative() && isUsableRead(read, refLoc)) {
+ final OptionalDouble value = getElementForRead(read, refLoc, bestAllele);
+ // Bypass read if the clipping goal is not reached or the refloc is inside a spanning deletion
+ if (value.isPresent() && value.getAsDouble() != INVALID_ELEMENT_FROM_READ) {
+ if (allele.isReference()) {
+ refQuals.add(value.getAsDouble());
+ } else if (vc.hasAllele(allele)) {
+ altQuals.add(value.getAsDouble());
+ }
+ }
+ }
+ }

+ // Use the pileup to stratify otherwise
+ } else if (!(this instanceof AS_RankSumTest)) {
+ for (final PileupElement p : likelihoods.getStratifiedPileups(vc).values().stream().flatMap(Collection::stream).collect(Collectors.toList())) {
+ final OptionalDouble value = getElementForPileupElement(p, refLoc);
+ if (value.isPresent() && value.getAsDouble() != INVALID_ELEMENT_FROM_READ && isUsableBase(p)) {
+ if (vc.getReference().equals(Allele.create(p.getBase(), true))) {
refQuals.add(value.getAsDouble());
- } else if (vc.hasAllele(allele)) {
+ } else if (vc.hasAllele(Allele.create(p.getBase()))) {
altQuals.add(value.getAsDouble());
}
}
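// Illustrative note (not in the original commit): this pileup branch is what
// lets the ported VariantAnnotator run without HaplotypeCaller-style read
// likelihoods. When hasFilledLiklihoods() is false, reads are stratified by
// raw pileup base instead, mirroring GATK3. Allele-specific (AS_) rank-sum
// tests are skipped here because they need per-allele likelihoods rather than
// single pileup bases.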
@@ -114,4 +133,33 @@ protected boolean isUsableRead(final GATKRead read, final int refLoc) {
Utils.nonNull(read);
return read.getMappingQuality() != 0 && read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE;
}

/**
* Can the base in this pileup element be used in comparative tests between ref / alt bases?
*
* Note that this function by default does not allow deletion pileup elements
*
* @param p the pileup element to consider
* @return true if this base is part of a meaningful read for comparison, false otherwise
*/
protected boolean isUsableBase(final PileupElement p) {
return !(p.isDeletion() ||
p.getMappingQual() == 0 ||
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here
}

/**
* Get the rank sum element for the given pileup element at the given reference position
*
* By default this delegates to the per-read calculation for the element's read, so tests that
* need no pileup-specific handling behave the same in both code paths
*
* @param p the pileup element
* @param refLoc the variant start position on the reference
* @return an OptionalDouble representing the element to be used in the rank sum test, or an empty value if it should not be used
*/
protected OptionalDouble getElementForPileupElement(final PileupElement p, final int refLoc) {
// default to the per-read value for this element's read
return getElementForRead(p.getRead(), refLoc);
}

}