Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port of GATK3 Variant Annotator Tool #3803

Merged
merged 5 commits into from
Feb 26, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -37,97 +37,4 @@ public static String encodeStringList( final List<String> stringList) {
return StringUtils.join(stringList, ",");
}

/**
 * Returns the distance of a variant position from the nearer end of the read,
 * accounting for hard-clipped bases and low-quality ends.
 * Used by the ReadPosRankSum annotations.
 *
 * @param read                a read containing the variant
 * @param initialReadPosition the position based on the modified, post-hard-clipped CIGAR
 * @return the read position measured from whichever end of the read is closer
 */
public static int getFinalVariantReadPosition(final GATKRead read, final int initialReadPosition) {
    final int alignedBaseCount = getNumAlignedBases(read);

    //TODO: this doesn't work for the middle-right position if we index from zero
    final boolean closerToRightEnd = initialReadPosition > alignedBaseCount / 2;
    return closerToRightEnd
            ? alignedBaseCount - (initialReadPosition + 1)
            : initialReadPosition;
}

/**
 * Counts the hard-clipped bases plus any leading run of low-quality bases at the
 * start of the read (start = leftmost end with respect to the reference).
 *
 * @param read a read containing the variant
 * @return the number of hard clipped and low qual bases at the read start
 */
public static int getNumClippedBasesAtStart(final GATKRead read) {
    // Hard clips are never considered part of the read:
    final Cigar cigar = read.getCigar();
    final CigarElement firstElement = cigar.getCigarElement(0);

    int clippedCount = (firstElement.getOperator() == CigarOperator.H) ? firstElement.getLength() : 0;

    final byte[] unclippedReadBases = read.getBases();
    final byte[] unclippedReadQuals = read.getBaseQualities();

    // Clip more strictly than the CIGAR string suggests, since the CIGAR may be too
    // conservative and leave a string of Q2 bases hanging off the read.
    //TODO: this code may not even get used because HaplotypeCaller already hard clips low quality tails
    //TODO: the hard-coded 20 is in lieu of PairHMM.BASE_QUALITY_SCORE_THRESHOLD in order to directly match GATK3 output
    while (clippedCount < unclippedReadBases.length && unclippedReadQuals[clippedCount] < 20) {
        clippedCount++;
    }

    return clippedCount;
}


/**
 * @param read a read containing the variant
 * @return number of non-hard clipped, aligned bases (excluding low quality bases at either end)
 */
//TODO: this is bizarre -- this code counts hard clips, but then subtracts them from the read length, which already doesn't count hard clips
public static int getNumAlignedBases(final GATKRead read) {
    final int clippedAtStart = getNumClippedBasesAtStart(read);
    final int clippedAtEnd = getNumClippedBasesAtEnd(read);
    return read.getLength() - clippedAtStart - clippedAtEnd;
}

/**
 * Counts the hard-clipped bases plus any trailing run of low-quality bases at the
 * end of the read (end = rightmost end with respect to the reference).
 *
 * @param read a read containing the variant
 * @return the number of hard clipped and low qual bases at the read end
 */
public static int getNumClippedBasesAtEnd(final GATKRead read) {
    // Hard clips are never considered part of the read:
    final Cigar cigar = read.getCigar();
    final CigarElement lastElement = cigar.getCigarElement(cigar.numCigarElements() - 1);

    int clippedCount = (lastElement.getOperator() == CigarOperator.H) ? lastElement.getLength() : 0;

    final byte[] unclippedReadBases = read.getBases();
    final byte[] unclippedReadQuals = read.getBaseQualities();

    // Clip more strictly than the CIGAR string suggests, since the CIGAR may be too
    // conservative and leave a string of Q2 bases hanging off the read.
    //TODO: this code may not even get used because HaplotypeCaller already hard clips low quality tails
    //TODO: the hard-coded 20 is in lieu of PairHMM.BASE_QUALITY_SCORE_THRESHOLD in order to directly match GATK3 output
    while (unclippedReadBases.length - clippedCount - 1 >= 0
            && unclippedReadQuals[unclippedReadBases.length - clippedCount - 1] < 20) {
        clippedCount++;
    }

    return clippedCount;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
Expand Down Expand Up @@ -41,4 +42,10 @@ public static OptionalDouble getReadBaseQuality(final GATKRead read, final int r
Utils.nonNull(read);
return OptionalDouble.of(read.getBaseQuality(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, refLoc, ReadUtils.ClippingTail.RIGHT_TAIL)));
}

@Override
// When we have a pileup element we only need its recorded base quality in order to compute this annotation
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

double typo! plileupE in order to com... pute?

protected OptionalDouble getElementForPileupElement(final PileupElement p, final int refLoc) {
    // The base quality is recorded directly on the pileup element.
    final double baseQuality = p.getQual();
    return OptionalDouble.of(baseQuality);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
Expand Down Expand Up @@ -36,4 +37,11 @@ protected OptionalDouble getElementForRead(final GATKRead read, final int refLoc
Utils.nonNull(read);
return OptionalDouble.of(AlignmentUtils.getNumHardClippedBases(read));
}

@Override
protected OptionalDouble getElementForPileupElement(final PileupElement p, final int refLoc) {
    Utils.nonNull(p);
    // Delegate to the read-based implementation, which yields the same value.
    final GATKRead underlyingRead = p.getRead();
    return getElementForRead(underlyingRead, refLoc);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,16 @@ public void annotate(final ReferenceContext ref,

int[] counts;
if (likelihoods.hasFilledLikelihoods()) {
// Compute based on the alignment map
counts = annotateWithLikelihoods(vc, g, alleles, likelihoods);
} else if (likelihoods.readCount()==0) {
// No reads, thus can't compute the AD, so do nothing
return;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this path is weird because all the other ones set AD to something, this just lets it continue being whatever it was before?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there are no read likelihoods we don't add an AD to the genotype. I'm cool with that.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

commented

} else if (vc.isSNP()) {
// Compute based on pileup bases at the snp site (won't match haplotype caller output)
counts = annotateWithPileup(vc, likelihoods.getStratifiedPileups(vc).get(g.getSampleName()));
} else {
// Otherwise return an empty AD field for an indel.
counts = new int[vc.getNAlleles()];
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the deal with the end case? If it's not a snp we just give up?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's what we've traditionally done and it's also why we weren't going to support VariantAnnotator in GATK4. We expect that local realignment would change the indel ADs substantially compared with the pileup so we didn't even try.

}

Expand All @@ -78,15 +82,14 @@ private int[] annotateWithPileup(final VariantContext vc, List<PileupElement> pi
alleleCounts.put(allele.getBases()[0], 0);
}
for ( final PileupElement p : pileupElements) {
if ( alleleCounts.containsKey(p.getBase()) ) {
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase()) + 1);
}
// only count bases that support alleles in the variant context
alleleCounts.computeIfPresent(p.getBase(), (base, count) -> count+ 1);
}

// we need to add counts in the correct order
final int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference().getBases()[0]);
for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
for (int i = 0; i < vc.getNAlleles() -1; i++) {
counts[i + 1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
}
return counts;
Expand All @@ -106,7 +109,7 @@ private int[] annotateWithLikelihoods(VariantContext vc, Genotype g, Set<Allele>

final int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference()); //first one in AD is always ref
for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
for (int i = 0; i < vc.getNAlleles() -1; i++) {
counts[i + 1] = alleleCounts.get(vc.getAlternateAllele(i));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
Expand Down Expand Up @@ -44,8 +45,13 @@ protected OptionalDouble getElementForRead(final GATKRead read, final int refLoc

@Override
protected OptionalDouble getElementForRead(final GATKRead read, final int refLoc) {
    Utils.nonNull(read);
    // TODO: consider throwing IllegalStateException instead -- this overload should never be
    // TODO: called because getElementForRead(read, refLoc, mostLikelyAllele) is overridden.
    return OptionalDouble.empty();
}

@Override
protected OptionalDouble getElementForPileupElement(final PileupElement p, final int refLoc) {
    // TODO: consider throwing IllegalStateException instead -- this overload should never be
    // TODO: called because getElementForRead(read, refLoc, mostLikelyAllele) is overridden.
    return OptionalDouble.empty();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
Expand Down Expand Up @@ -42,4 +43,9 @@ protected OptionalDouble getElementForRead(final GATKRead read, final int refLoc
Utils.nonNull(read);
return OptionalDouble.of(read.getMappingQuality());
}

/**
 * @param p      a pileup element at the variant site
 * @param refLoc the reference position of the variant (unused; mapping quality is per-read)
 * @return the mapping quality recorded on the pileup element's read
 */
@Override // every sibling implementation of this hook carries @Override; this one was missing it
protected OptionalDouble getElementForPileupElement(final PileupElement p, final int refLoc) {
    // The pileup element exposes the read's mapping quality directly.
    return OptionalDouble.of(p.getMappingQual());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ public Map<String, Object> annotate(final ReferenceContext ref,
continue;
}
}

// if there is no AD value or it is a dummy value, we want to look to other means to get the depth
if (likelihoods != null) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was meant to become an if instead of an else if?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you have a dummy AD value (like NaN|NaN|NaN), then you probably still want to determine the depth of the sample from the likelihoods.

depth += likelihoods.sampleReadCount(likelihoods.indexOfSample(genotype.getSampleName()));
} else if ( genotype.hasDP() ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,22 +114,21 @@ public Map<String, Object> finalizeRawData(final VariantContext vc, final Varian
ReducibleAnnotationData myData = new ReducibleAnnotationData(rawMQdata);
parseRawDataString(myData);

String annotationString = makeFinalizedAnnotationString(getNumOfReads(vc), myData.getAttributeMap());
return Collections.singletonMap(getKeyNames().get(0), (Object)annotationString);
String annotationString = makeFinalizedAnnotationString(getNumOfReads(vc, null), myData.getAttributeMap());
return Collections.singletonMap(getKeyNames().get(0), annotationString);
}

public String makeFinalizedAnnotationString(final int numOfReads, final Map<Allele, Number> perAlleleData) {
return String.format("%.2f", Math.sqrt((double)perAlleleData.get(Allele.NO_CALL)/numOfReads));
public String makeFinalizedAnnotationString(final int numOfReads, final Map<Allele, Double> perAlleleData) {
return String.format("%.2f", Math.sqrt(perAlleleData.get(Allele.NO_CALL)/numOfReads));
}


public void combineAttributeMap(ReducibleAnnotationData<Number> toAdd, ReducibleAnnotationData<Number> combined) {
public void combineAttributeMap(ReducibleAnnotationData<Double> toAdd, ReducibleAnnotationData<Double> combined) {
if (combined.getAttribute(Allele.NO_CALL) != null) {
combined.putAttribute(Allele.NO_CALL, (Double) combined.getAttribute(Allele.NO_CALL) + (Double) toAdd.getAttribute(Allele.NO_CALL));
combined.putAttribute(Allele.NO_CALL, combined.getAttribute(Allele.NO_CALL) + toAdd.getAttribute(Allele.NO_CALL));
} else {
combined.putAttribute(Allele.NO_CALL, toAdd.getAttribute(Allele.NO_CALL));
}

}

@SuppressWarnings({"unchecked", "rawtypes"})//FIXME
Expand All @@ -156,7 +155,7 @@ public Map<String, Object> annotate(final ReferenceContext ref,
}

final Map<String, Object> annotations = new HashMap<>();
final ReducibleAnnotationData<Number> myData = new ReducibleAnnotationData<>(null);
final ReducibleAnnotationData<Double> myData = new ReducibleAnnotationData<>(null);
calculateRawData(vc, likelihoods, myData);
final String annotationString = makeFinalizedAnnotationString(getNumOfReads(vc, likelihoods), myData.getAttributeMap());
annotations.put(getKeyNames().get(0), annotationString);
Expand All @@ -180,7 +179,7 @@ public VariantContext finalizeRawMQ(final VariantContext vc) {
return vc;
} else {
final double squareSum = parseRawDataString(rawMQdata);
final int numOfReads = getNumOfReads(vc);
final int numOfReads = getNumOfReads(vc, null);
final double rms = Math.sqrt(squareSum / (double)numOfReads);
final String finalizedRMSMAppingQuality = formattedValue(rms);
return new VariantContextBuilder(vc)
Expand Down Expand Up @@ -208,38 +207,11 @@ private static double parseRawDataString(String rawDataString) {
*/
final double squareSum = Double.parseDouble(rawDataString.split(",")[0]);
return squareSum;
} catch (final NumberFormatException e){
throw new UserException.BadInput("malformed " + GATKVCFConstants.RAW_RMS_MAPPING_QUALITY_KEY +" annotation: " + rawDataString);
} catch (final NumberFormatException e) {
throw new UserException.BadInput("malformed " + GATKVCFConstants.RAW_RMS_MAPPING_QUALITY_KEY + " annotation: " + rawDataString);
}
}

/**
 * Computes the number of reads at this site for the mapping-quality calculation:
 * the site-level {@link VCFConstants#DEPTH_KEY} minus the per-genotype
 * {@link GATKVCFConstants#MIN_DP_FORMAT_KEY} (or DP) contribution of each hom-ref
 * genotype, since MQ is not calculated for reference blocks.
 *
 * @param vc the variant context; expected to carry a site-level DP attribute
 * @return the adjusted read count, or -1 when DP is absent or the adjusted count
 *         is {@code <= 0}, so that the downstream division yields NaN
 */
@VisibleForTesting
static int getNumOfReads(final VariantContext vc) {
    //don't use the full depth because we don't calculate MQ for reference blocks
    int numOfReads = vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, -1);
    if(vc.hasGenotypes()) {
        for(final Genotype gt : vc.getGenotypes()) {
            if(gt.isHomRef()) {
                //site-level DP contribution will come from MIN_DP for gVCF-called reference variants or DP for BP resolution
                if (gt.hasExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY)) {
                    // NOTE(review): Integer.parseInt can throw NumberFormatException on a malformed MIN_DP -- confirm upstream validation
                    numOfReads -= Integer.parseInt(gt.getExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY).toString());
                } else if (gt.hasDP()) {
                    numOfReads -= gt.getDP();
                }
            }
        }
    }
    if (numOfReads <= 0){
        numOfReads = -1; //return -1 to result in a NaN
    }
    return numOfReads;
}

/**
*
Expand All @@ -253,7 +225,21 @@ static int getNumOfReads(final VariantContext vc,
final ReadLikelihoods<Allele> likelihoods) {
int numOfReads = 0;
if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) {
return getNumOfReads(vc);
numOfReads = vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, -1);
if(vc.hasGenotypes()) {
for(final Genotype gt : vc.getGenotypes()) {
if(gt.isHomRef()) {
//site-level DP contribution will come from MIN_DP for gVCF-called reference variants or DP for BP resolution
if (gt.hasExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY)) {
numOfReads -= Integer.parseInt(gt.getExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY).toString());
} else if (gt.hasDP()) {
numOfReads -= gt.getDP();
}
}
}
}

// If there is no depth key, try to compute from the likelihoods
} else if (likelihoods != null && likelihoods.numberOfAlleles() != 0) {
for (int i = 0; i < likelihoods.numberOfSamples(); i++) {
for (GATKRead read : likelihoods.sampleReads(i)) {
Expand Down
Loading