Port of GATK3 version of VariantAnnotator with the necessary changes to the annotation engine and client annotations to support annotating variants independent of the haplotype caller.

jamesemery committed Nov 27, 2017
1 parent fe47cf6 commit 962c95e
Showing 104 changed files with 4,645 additions and 129 deletions.
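The change threading through these files is a new fourth argument to VariantAnnotatorEngine.ofSelectedMinusExcluded: a boolean that, judging from the call sites, flags GVCF (reference-confidence) mode so the engine knows whether to emit raw, reducible annotation data. A minimal sketch of the two call shapes that appear in the hunks below (the local variable names are illustrative, not from the diff):

    // In HaplotypeCallerSpark: emit raw annotation data only when writing a GVCF
    final VariantAnnotatorEngine sparkEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(
            hcArgs.variantAnnotationArgumentCollection,
            hcArgs.dbsnp.dbsnp,
            hcArgs.comps,
            hcArgs.emitReferenceConfidence == ReferenceConfidenceMode.GVCF);

    // In a walker that finalizes annotations: never raw
    final VariantAnnotatorEngine walkerEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(
            variantAnnotationArgumentCollection,
            dbsnp.dbsnp,
            Collections.emptyList(),
            false);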
@@ -46,7 +46,7 @@ public final class WellformedReadFilter extends ReadFilter {
private ReadFilter wellFormedFilter = null;

// Command line parser requires a no-arg constructor
- public WellformedReadFilter() {
+ public WellformedReadFilter() {
}

@Override
@@ -207,7 +207,7 @@ public static JavaRDD<VariantContext> callVariantsWithHaplotypeCaller(
final Broadcast<ReferenceMultiSource> referenceBroadcast = ctx.broadcast(reference);
final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast = ctx.broadcast(hcArgs);

- final VariantAnnotatorEngine variantAnnotatorEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(hcArgs.variantAnnotationArgumentCollection, hcArgs.dbsnp.dbsnp, hcArgs.comps);
+ final VariantAnnotatorEngine variantAnnotatorEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(hcArgs.variantAnnotationArgumentCollection, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence == ReferenceConfidenceMode.GVCF);
final Broadcast<VariantAnnotatorEngine> annotatorEngineBroadcast = ctx.broadcast(variantAnnotatorEngine);

final List<ShardBoundary> shardBoundaries = getShardBoundaries(header, intervals, shardingArgs.readShardSize, shardingArgs.readShardPadding);
@@ -148,7 +148,7 @@ public void onTraversalStart() {

final SampleList samples = new IndexedSampleList(inputVCFHeader.getGenotypeSamples()); //todo should this be getSampleNamesInOrder?

- annotationEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(variantAnnotationArgumentCollection, dbsnp.dbsnp, Collections.emptyList());
+ annotationEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(variantAnnotationArgumentCollection, dbsnp.dbsnp, Collections.emptyList(), false);

// We only want the engine to generate the AS_QUAL key if we are using AlleleSpecific annotations.
genotypingEngine = new MinimalGenotypingEngine(createUAC(), samples, new GeneralPloidyFailOverAFCalculatorProvider(genotypeArgs), annotationEngine.isRequestedReducibleRawKey(GATKVCFConstants.AS_QUAL_KEY));
@@ -1,6 +1,13 @@
package org.broadinstitute.hellbender.tools.walkers.annotator;

import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMRecord;
import org.apache.commons.lang.StringUtils;
import org.broadinstitute.hellbender.utils.pairhmm.PairHMM;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.ArrayList;
import java.util.List;
@@ -30,4 +37,99 @@ public static String encodeStringList( final List<String> stringList) {
return StringUtils.join(stringList, ",");
}

/**
* Get the position of a variant within a read with respect to the closer end, accounting for hard clipped bases and low quality ends
* Used by ReadPosRankSum annotations
*
* @param read a read containing the variant
* @param initialReadPosition the position based on the modified, post-hard-clipped CIGAR
* @return read position
*/
public static int getFinalVariantReadPosition(final GATKRead read, final int initialReadPosition) {
final int numAlignedBases = getNumAlignedBases(read);

int readPos = initialReadPosition;
//TODO: this doesn't work for the middle-right position if we index from zero
if (initialReadPosition > numAlignedBases / 2) {
readPos = numAlignedBases - (initialReadPosition + 1);
}
return readPos;

}
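// Illustrative example (not in the original commit): for a read with
// getNumAlignedBases(read) == 100, a variant at offset 80 falls in the right
// half, so this returns 100 - (80 + 1) = 19, its distance from the right end;
// an offset of 30 is already measured from the closer (left) end and is
// returned unchanged. The TODO above refers to the off-by-one at the exact
// midpoint under zero-based indexing.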

/**
*
* @param read a read containing the variant
* @return the number of hard clipped and low qual bases at the read start (where start is the leftmost end w.r.t. the reference)
*/
public static int getNumClippedBasesAtStart(final GATKRead read) {
// check for hard clips (never consider these bases):
final Cigar c = read.getCigar();
final CigarElement first = c.getCigarElement(0);

int numStartClippedBases = 0;
if (first.getOperator() == CigarOperator.H) {
numStartClippedBases = first.getLength();
}
final byte[] unclippedReadBases = read.getBases();
final byte[] unclippedReadQuals = read.getBaseQualities();

// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
// and may leave a string of Q2 bases still hanging off the reads.
//TODO: this code may not even get used because HaplotypeCaller already hard clips low quality tails
for (int i = numStartClippedBases; i < unclippedReadBases.length; i++) {
if (unclippedReadQuals[i] < 20) { //TODO the 20 hard value here is in lieu of PairHMM.BASE_QUALITY_SCORE_THRESHOLD in order to directly match GATK3 output
numStartClippedBases++;
} else {
break;
}
}

return numStartClippedBases;
}


/**
*
* @param read a read containing the variant
* @return number of non-hard clipped, aligned bases (excluding low quality bases at either end)
*/
//TODO: this is bizarre -- this code counts hard clips, but then subtracts them from the read length, which already doesn't count hard clips
public static int getNumAlignedBases(final GATKRead read) {
return read.getLength() - getNumClippedBasesAtStart(read) - getNumClippedBasesAtEnd(read);
}


/**
*
* @param read a read containing the variant
* @return number of hard clipped and low qual bases at the read end (where end is right end w.r.t. the reference)
*/
public static int getNumClippedBasesAtEnd(final GATKRead read) {
// check for hard clips (never consider these bases):
final Cigar c = read.getCigar();
CigarElement last = c.getCigarElement(c.numCigarElements() - 1);

int numEndClippedBases = 0;
if (last.getOperator() == CigarOperator.H) {
numEndClippedBases = last.getLength();
}
final byte[] unclippedReadBases = read.getBases();
final byte[] unclippedReadQuals = read.getBaseQualities();

// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
// and may leave a string of Q2 bases still hanging off the reads.
//TODO: this code may not even get used because HaplotypeCaller already hard clips low quality tails
for (int i = unclippedReadBases.length - numEndClippedBases - 1; i >= 0; i--) {
if (unclippedReadQuals[i] < 20) { //TODO the 20 hard value here is in lieu of PairHMM.BASE_QUALITY_SCORE_THRESHOLD in order to directly match GATK3 output

numEndClippedBases++;
} else {
break;
}
}

return numEndClippedBases;
}
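// Illustrative example (not in the original commit): for a 100bp read with no
// hard clips, three bases below Q20 at the left end and five at the right,
// getNumClippedBasesAtStart(read) == 3, getNumClippedBasesAtEnd(read) == 5,
// and getNumAlignedBases(read) == 100 - 3 - 5 == 92. As the TODO on
// getNumAlignedBases notes, hard clips are counted by these helpers even
// though read.getLength() already excludes them, so hard-clipped reads have
// those bases subtracted twice.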

}
@@ -12,6 +12,7 @@
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;

import java.util.*;
import java.util.stream.Collectors;
@@ -56,6 +57,43 @@ public void annotate(final ReferenceContext ref,
// make sure that there's a meaningful relationship between the alleles in the likelihoods and our VariantContext
Utils.validateArg(likelihoods.alleles().containsAll(alleles), () -> "VC alleles " + alleles + " not a subset of ReadLikelihoods alleles " + likelihoods.alleles());

int[] counts;
if (likelihoods.hasFilledLiklihoods()) {
counts = annotateWithLiklihoods(vc, g, alleles, likelihoods);
} else if (likelihoods.readCount()==0) {
return;
} else if (vc.isSNP()) {
counts = annotateWithPileup(vc, likelihoods.getStratifiedPileups(vc).get(g.getSampleName()));
} else {
counts = new int[vc.getNAlleles()];
}

gb.AD(counts);
}

private int[] annotateWithPileup(final VariantContext vc, List<PileupElement> pileupElements) {

final HashMap<Byte, Integer> alleleCounts = new HashMap<>();
for ( final Allele allele : vc.getAlleles() ) {
alleleCounts.put(allele.getBases()[0], 0);
}
for ( final PileupElement p : pileupElements) {
if ( alleleCounts.containsKey(p.getBase()) ) {
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase()) + 1);
}
}

// we need to add counts in the correct order
final int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference().getBases()[0]);
for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
counts[i + 1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
}
return counts;
}
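// Illustrative example (not in the original commit): for a biallelic SNP with
// ref A and alt G and a sample pileup of bases {A, A, G, G, G, T}, the map is
// seeded with A and G only, the stray T is ignored, and the resulting AD
// array is [2, 3], reference count first.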

private int[] annotateWithLiklihoods(VariantContext vc, Genotype g, Set<Allele> alleles, ReadLikelihoods<Allele> likelihoods) {

final Map<Allele, Integer> alleleCounts = new LinkedHashMap<>();
for ( final Allele allele : vc.getAlleles() ) {
alleleCounts.put(allele, 0);
@@ -72,7 +110,7 @@ public void annotate(final ReferenceContext ref,
counts[i + 1] = alleleCounts.get(vc.getAlternateAllele(i));
}

- gb.AD(counts);
+ return counts;
}

@Override
@@ -10,6 +10,7 @@
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

@@ -44,6 +45,7 @@ public final class FisherStrand extends StrandBiasTest implements StandardAnnota

static final double MIN_PVALUE = 1E-320;
private static final int MIN_COUNT = ARRAY_DIM;
private static final int MIN_QUAL_FOR_FILTERED_TEST = 17;

// how large do we want the normalized table to be? (ie, sum of all entries must be smaller that this)
private static final double TARGET_TABLE_SIZE = 200.0;
@@ -59,6 +61,14 @@ protected Map<String, Object> calculateAnnotationFromGTfield(final GenotypesCont
return ( tableFromPerSampleAnnotations != null )? annotationForOneTable(pValueForContingencyTable(tableFromPerSampleAnnotations)) : null;
}

@Override
protected Map<String, Object> calculateAnnotationFromStratifiedContexts(final Map<String, List<PileupElement>> stratifiedContexts,
final VariantContext vc){
final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAlleles(), -1, MIN_COUNT);
final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAlleles(), MIN_QUAL_FOR_FILTERED_TEST, MIN_COUNT);
return annotationForOneTable(Math.max(pValueForContingencyTable(tableFiltering), pValueForContingencyTable(tableNoFiltering)));
}
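// Illustrative note (not in the original commit): each table is the usual 2x2
// strand-bias layout,
//             forward   reverse
//     ref       n00       n01
//     alt       n10       n11
// built once with no quality floor (-1) and once counting only bases at Q17
// and above (assuming getSNPContingencyTable treats its threshold as a
// minimum base quality). Taking the larger of the two p-values is the
// conservative choice: the annotation only shows strong strand bias when
// both tables support it.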

@Override
protected Map<String, Object> calculateAnnotationFromLikelihoods(final ReadLikelihoods<Allele> likelihoods,
final VariantContext vc){
@@ -85,8 +85,11 @@ public Map<String, Object> annotate(final ReferenceContext ref,
ADrestrictedDepth += totalADdepth;
}
depth += totalADdepth;
+ continue;
}
- } else if (likelihoods != null) {
+ }

+ if (likelihoods != null) {
depth += likelihoods.sampleReadCount(likelihoods.indexOfSample(genotype.getSampleName()));
} else if ( genotype.hasDP() ) {
depth += genotype.getDP();
@@ -114,21 +114,21 @@ public Map<String, Object> finalizeRawData(final VariantContext vc, final Varian
ReducibleAnnotationData myData = new ReducibleAnnotationData(rawMQdata);
parseRawDataString(myData);

- String annotationString = makeFinalizedAnnotationString(vc, myData.getAttributeMap());
+ String annotationString = makeFinalizedAnnotationString(getNumOfReads(vc), myData.getAttributeMap());
return Collections.singletonMap(getKeyNames().get(0), (Object)annotationString);
}

- public String makeFinalizedAnnotationString(final VariantContext vc, final Map<Allele, Number> perAlleleData) {
-     int numOfReads = getNumOfReads(vc);
+ public String makeFinalizedAnnotationString(final int numOfReads, final Map<Allele, Number> perAlleleData) {
return String.format("%.2f", Math.sqrt((double)perAlleleData.get(Allele.NO_CALL)/numOfReads));
}


public void combineAttributeMap(ReducibleAnnotationData<Number> toAdd, ReducibleAnnotationData<Number> combined) {
- if (combined.getAttribute(Allele.NO_CALL) != null)
+ if (combined.getAttribute(Allele.NO_CALL) != null) {
combined.putAttribute(Allele.NO_CALL, (Double) combined.getAttribute(Allele.NO_CALL) + (Double) toAdd.getAttribute(Allele.NO_CALL));
- else
+ } else {
combined.putAttribute(Allele.NO_CALL, toAdd.getAttribute(Allele.NO_CALL));
+ }

}

@@ -150,7 +150,17 @@ public void calculateRawData(final VariantContext vc,
public Map<String, Object> annotate(final ReferenceContext ref,
final VariantContext vc,
final ReadLikelihoods<Allele> likelihoods) {
- return annotateRawData(ref, vc, likelihoods);
+ Utils.nonNull(vc);
+ if (likelihoods == null || likelihoods.readCount() < 1 ) {
+ return new HashMap<>();
+ }

+ final Map<String, Object> annotations = new HashMap<>();
+ final ReducibleAnnotationData<Number> myData = new ReducibleAnnotationData<>(null);
+ calculateRawData(vc, likelihoods, myData);
+ final String annotationString = makeFinalizedAnnotationString(getNumOfReads(vc, likelihoods), myData.getAttributeMap());
+ annotations.put(getKeyNames().get(0), annotationString);
+ return annotations;
}
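// Illustrative example (not in the original commit): RAW_MQ appears to hold
// the sum of squared per-read mapping qualities, keyed on Allele.NO_CALL. For
// 200 reads with a squared sum of 720,000 the finalized value is
// sqrt(720000 / 200) = sqrt(3600) = 60, formatted as "60.00".
// combineAttributeMap above simply adds these squared sums across the data
// being merged (e.g. 720,000 + 480,000 = 1,200,000) before the square root
// is taken.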

@VisibleForTesting
@@ -4,21 +4,24 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_RankSumTest;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_ReadPosRankSumTest;
import org.broadinstitute.hellbender.utils.MannWhitneyU;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.*;
import java.util.stream.Collectors;


/**
* Abstract root for all RankSum based annotations
*/
- public abstract class RankSumTest extends InfoFieldAnnotation {
+ public abstract class RankSumTest extends InfoFieldAnnotation implements Annotation {
protected static double INVALID_ELEMENT_FROM_READ = Double.NEGATIVE_INFINITY;
private boolean useDithering = true;

@@ -47,16 +50,32 @@ public Map<String, Object> annotate(final ReferenceContext ref,
final int refLoc = vc.getStart();

if( likelihoods != null) {
- for (final ReadLikelihoods<Allele>.BestAllele bestAllele : likelihoods.bestAlleles()) {
- final GATKRead read = bestAllele.read;
- final Allele allele = bestAllele.allele;
- if (bestAllele.isInformative() && isUsableRead(read, refLoc)) {
- final OptionalDouble value = getElementForRead(read, refLoc, bestAllele);
- // Bypass read if the clipping goal is not reached or the refloc is inside a spanning deletion
- if (value.isPresent() && value.getAsDouble() != INVALID_ELEMENT_FROM_READ) {
- if (allele.isReference()) {
+ // Default to using the likelihoods to calculate the rank sum
+ if (likelihoods.hasFilledLiklihoods()) {
+ for (final ReadLikelihoods<Allele>.BestAllele bestAllele : likelihoods.bestAlleles()) {
+ final GATKRead read = bestAllele.read;
+ final Allele allele = bestAllele.allele;
+ if (bestAllele.isInformative() && isUsableRead(read, refLoc)) {
+ final OptionalDouble value = getElementForRead(read, refLoc, bestAllele);
+ // Bypass read if the clipping goal is not reached or the refloc is inside a spanning deletion
+ if (value.isPresent() && value.getAsDouble() != INVALID_ELEMENT_FROM_READ) {
+ if (allele.isReference()) {
+ refQuals.add(value.getAsDouble());
+ } else if (vc.hasAllele(allele)) {
+ altQuals.add(value.getAsDouble());
+ }
+ }
+ }
+ }

+ // Use the pileup to stratify otherwise
+ } else if (!(this instanceof AS_RankSumTest)) {
+ for (final PileupElement p : likelihoods.getStratifiedPileups(vc).values().stream().flatMap(Collection::stream).collect(Collectors.toList())) {
+ final OptionalDouble value = getElementForPileupElement(p, refLoc);
+ if (value.isPresent() && value.getAsDouble() != INVALID_ELEMENT_FROM_READ && isUsableBase(p)) {
+ if (vc.getReference().equals(Allele.create(p.getBase(), true))) {
refQuals.add(value.getAsDouble());
- } else if (vc.hasAllele(allele)) {
+ } else if (vc.hasAllele(Allele.create(p.getBase()))) {
altQuals.add(value.getAsDouble());
}
}
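// Illustrative note (not in the original commit): this pileup branch is what
// lets the ported VariantAnnotator run without HaplotypeCaller-style read
// likelihoods. When hasFilledLiklihoods() is false, reads are stratified by
// raw pileup base instead, mirroring GATK3. Allele-specific (AS_) rank-sum
// tests are skipped here because they need per-allele likelihoods rather than
// single pileup bases.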
@@ -114,4 +133,33 @@ protected boolean isUsableRead(final GATKRead read, final int refLoc) {
Utils.nonNull(read);
return read.getMappingQuality() != 0 && read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE;
}

/**
* Can the base in this pileup element be used in comparative tests between ref / alt bases?
*
* Note that this function by default does not allow deletion pileup elements
*
* @param p the pileup element to consider
* @return true if this base is part of a meaningful read for comparison, false otherwise
*/
protected boolean isUsableBase(final PileupElement p) {
return !(p.isDeletion() ||
p.getMappingQual() == 0 ||
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here
}

/**
* Get the rank sum element for the given pileup element at the given reference position
*
* By default this delegates to the per-read calculation for the element's read, so tests that
* need no pileup-specific handling behave the same in both code paths
*
* @param p the pileup element
* @param refLoc the variant start position on the reference
* @return an OptionalDouble representing the element to be used in the rank sum test, or an empty value if it should not be used
*/
protected OptionalDouble getElementForPileupElement(final PileupElement p, final int refLoc) {
// default to the per-read value for this element's read
return getElementForRead(p.getRead(), refLoc);
}

}