Skip to content

Commit

Permalink
Fix integer overflow bug in RMSMappingQuality (#5435)
Browse files Browse the repository at this point in the history
Updated RMS Mapping quality annotations to be of type long/Long instead
of int/Integer. With int types, a large cohort could overflow
Integer.MAX_VALUE when handling sum of squared mapping qualities. With
long types this should not be a problem until we have off-world
colonies. This resolves issue 5433.

Attribute getters placed in GATKProtectedVariantContextUtils.
  • Loading branch information
TedBrookings authored Nov 28, 2018
1 parent 3941772 commit 349d423
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,12 @@
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import static org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils.getAttributeAsLong;
import static org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils.getAttributeAsLongList;


/**
* Root Mean Square of the mapping quality of reads across all samples.
Expand Down Expand Up @@ -85,7 +89,7 @@ public Map<String, Object> annotateRawData(final ReferenceContext ref,
}

final Map<String, Object> annotations = new HashMap<>();
final ReducibleAnnotationData<List<Integer>> myData = new ReducibleAnnotationData<>(null);
final ReducibleAnnotationData<List<Long>> myData = new ReducibleAnnotationData<>(null);
calculateRawData(vc, likelihoods, myData);
final String annotationString = makeRawAnnotationString(vc.getAlleles(), myData.getAttributeMap());
annotations.put(getRawKeyName(), annotationString);
Expand All @@ -108,8 +112,8 @@ public Map<String, Object> combineRawData(final List<Allele> vcAlleles, final Li
return annotations;
}

public String makeRawAnnotationString(final List<Allele> vcAlleles, final Map<Allele, List<Integer>> perAlleleData) {
return String.format("%d,%d", perAlleleData.get(Allele.NO_CALL).get(0), perAlleleData.get(Allele.NO_CALL).get(1));
private String makeRawAnnotationString(final List<Allele> vcAlleles, final Map<Allele, List<Long>> perAlleleData) {
return String.format("%d,%d", perAlleleData.get(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX), perAlleleData.get(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX));
}

@Override
Expand All @@ -127,17 +131,17 @@ else if (vc.hasAttribute(getDeprecatedRawKeyName())) {
"that may not have compatible annotations with this GATK version. Attempting to reformat MQ data.");
final String rawMQdepth = vc.getAttributeAsString("MQ_DP",null);
if (rawMQdepth == null) {
throw new UserException.BadInput("MQ annotation data is not properly formatted. This version expects an " +
"int tuple of sum of squared MQ values and total reads over variant genotypes.");
throw new UserException.BadInput("MQ annotation data is not properly formatted. This version expects a " +
"long tuple of sum of squared MQ values and total reads over variant genotypes.");
}
rawMQdata = Math.round(Double.parseDouble(rawMQdata)) + "," + rawMQdepth; //deprecated format was double so it needs to be converted to int
rawMQdata = Math.round(Double.parseDouble(rawMQdata)) + "," + rawMQdepth; //deprecated format was double so it needs to be converted to long
}
else {
logger.warn("MQ annotation data is not properly formatted. This GATK version expects key "
+ getRawKeyName() + " with an int tuple of sum of squared MQ values and total reads over variant "
+ getRawKeyName() + " with an long tuple of sum of squared MQ values and total reads over variant "
+ "genotypes as the value. Attempting to use deprecated MQ calculation.");
final int numOfReads = getNumOfReads(vc, null);
rawMQdata = Math.round(Double.parseDouble(rawMQdata)) + "," + numOfReads; //deprecated format was double so it needs to be converted to int
final long numOfReads = getNumOfReads(vc, null);
rawMQdata = Math.round(Double.parseDouble(rawMQdata)) + "," + numOfReads; //deprecated format was double so it needs to be converted to long
}
}
else {
Expand All @@ -147,43 +151,43 @@ else if (vc.hasAttribute(getDeprecatedRawKeyName())) {
return new HashMap<>();
}

ReducibleAnnotationData<List<Integer>> myData = new ReducibleAnnotationData(rawMQdata);
ReducibleAnnotationData<List<Long>> myData = new ReducibleAnnotationData(rawMQdata);
parseRawDataString(myData);

final String annotationString = makeFinalizedAnnotationString(myData.getAttribute(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX), myData.getAttribute(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX));
return Collections.singletonMap(getKeyNames().get(0), annotationString);
}

public String makeFinalizedAnnotationString(final int numOfReads, final int sumOfSquaredMQs) {
private String makeFinalizedAnnotationString(final long numOfReads, final long sumOfSquaredMQs) {
return String.format("%.2f", Math.sqrt(sumOfSquaredMQs/(double)numOfReads));
}


/**
* Combine the int tuples: sum of squared mapping quality and read counts over variant genotypes
* Combine the long tuples: sum of squared mapping quality and read counts over variant genotypes
* Since this is not an allele-specific annotation, store the result with the NO_CALL allele key.
* @param toAdd new data
* @param combined passed in as previous data, modified to store the combined result
*/
public void combineAttributeMap(ReducibleAnnotationData<List<Integer>> toAdd, ReducibleAnnotationData<List<Integer>> combined) {
private void combineAttributeMap(ReducibleAnnotationData<List<Long>> toAdd, ReducibleAnnotationData<List<Long>> combined) {
if (combined.getAttribute(Allele.NO_CALL) != null) {
combined.putAttribute(Allele.NO_CALL, Arrays.asList(combined.getAttribute(Allele.NO_CALL).get(0) + toAdd.getAttribute(Allele.NO_CALL).get(0),
combined.getAttribute(Allele.NO_CALL).get(1) + toAdd.getAttribute(Allele.NO_CALL).get(1)));
combined.putAttribute(Allele.NO_CALL, Arrays.asList(combined.getAttribute(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX) + toAdd.getAttribute(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX),
combined.getAttribute(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX) + toAdd.getAttribute(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX)));
} else {
combined.putAttribute(Allele.NO_CALL, toAdd.getAttribute(Allele.NO_CALL));
}
}

@SuppressWarnings({"unchecked", "rawtypes"})//FIXME
public void calculateRawData(final VariantContext vc,
private void calculateRawData(final VariantContext vc,
final ReadLikelihoods<Allele> likelihoods,
final ReducibleAnnotationData rawAnnotations){
//GATK3.5 had a double, but change this to an int for the tuple representation (square sum, read count)
int squareSum = 0;
int numReadsUsed = 0;
//GATK3.5 had a double, but change this to an long for the tuple representation (square sum, read count)
long squareSum = 0;
long numReadsUsed = 0;
for (int i = 0; i < likelihoods.numberOfSamples(); i++) {
for (final GATKRead read : likelihoods.sampleReads(i)) {
int mq = read.getMappingQuality();
long mq = read.getMappingQuality();
if (mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE) {
squareSum += mq * mq;
numReadsUsed++;
Expand All @@ -203,7 +207,7 @@ public Map<String, Object> annotate(final ReferenceContext ref,
}

final Map<String, Object> annotations = new HashMap<>();
final ReducibleAnnotationData<List<Integer>> myData = new ReducibleAnnotationData<>(null);
final ReducibleAnnotationData<List<Long>> myData = new ReducibleAnnotationData<>(null);
calculateRawData(vc, likelihoods, myData);
final String annotationString = makeFinalizedAnnotationString(myData.getAttribute(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX), myData.getAttribute(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX));
annotations.put(getKeyNames().get(0), annotationString);
Expand All @@ -227,7 +231,7 @@ public VariantContext finalizeRawMQ(final VariantContext vc) {
if (rawMQdata == null) {
return vc;
} else {
final List<Integer> SSQMQandDP = parseRawDataString(rawMQdata);
final List<Long> SSQMQandDP = parseRawDataString(rawMQdata);
final double rms = Math.sqrt(SSQMQandDP.get(SUM_OF_SQUARES_INDEX) / (double)SSQMQandDP.get(TOTAL_DEPTH_INDEX));
final String finalizedRMSMAppingQuality = formattedValue(rms);
return new VariantContextBuilder(vc)
Expand All @@ -237,26 +241,25 @@ public VariantContext finalizeRawMQ(final VariantContext vc) {
}
}

protected void parseRawDataString(ReducibleAnnotationData<List<Integer>> myData) {
private void parseRawDataString(ReducibleAnnotationData<List<Long>> myData) {
myData.putAttribute(Allele.NO_CALL, parseRawDataString(myData.getRawData()));
}

//TODO once the AS annotations have been added genotype gvcfs this can be removed for a more generic approach
private static List<Integer> parseRawDataString(String rawDataString) {
private static List<Long> parseRawDataString(String rawDataString) {
try {
final String[] parsed = rawDataString.split(",");
if (parsed.length != NUM_LIST_ENTRIES) {
throw new UserException.BadInput("Raw value for annotation has " + parsed.length + " values, expected " + NUM_LIST_ENTRIES);
}
final int squareSum = Integer.parseInt(parsed[SUM_OF_SQUARES_INDEX]);
final int totalDP = Integer.parseInt(parsed[TOTAL_DEPTH_INDEX]);
final long squareSum = Long.parseLong(parsed[SUM_OF_SQUARES_INDEX]);
final long totalDP = Long.parseLong(parsed[TOTAL_DEPTH_INDEX]);
return Arrays.asList(squareSum,totalDP);
} catch (final NumberFormatException e) {
throw new UserException.BadInput("malformed " + GATKVCFConstants.RAW_RMS_MAPPING_QUALITY_KEY + " annotation: " + rawDataString);
}
}


/**
*
* @return the number of reads at the given site, trying first {@Link GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY},
Expand All @@ -266,24 +269,24 @@ private static List<Integer> parseRawDataString(String rawDataString) {
* @throws UserException.BadInput if the {@link VCFConstants#DEPTH_KEY} is missing or if the calculated depth is <= 0
*/
@VisibleForTesting
static int getNumOfReads(final VariantContext vc,
static long getNumOfReads(final VariantContext vc,
final ReadLikelihoods<Allele> likelihoods) {
if(vc.hasAttribute(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY)) {
List<Integer> MQtuple = vc.getAttributeAsIntList(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY,0);
List<Long> MQtuple = getAttributeAsLongList(vc, GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY,0L);
if (MQtuple.get(TOTAL_DEPTH_INDEX) > 0) {
return MQtuple.get(TOTAL_DEPTH_INDEX);
}
}

int numOfReads = 0;
long numOfReads = 0;
if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) {
numOfReads = vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, -1);
numOfReads = getAttributeAsLong(vc, VCFConstants.DEPTH_KEY, -1L);
if(vc.hasGenotypes()) {
for(final Genotype gt : vc.getGenotypes()) {
if(gt.isHomRef()) {
//site-level DP contribution will come from MIN_DP for gVCF-called reference variants or DP for BP resolution
if (gt.hasExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY)) {
numOfReads -= Integer.parseInt(gt.getExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY).toString());
numOfReads -= Long.parseLong(gt.getExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY).toString());
} else if (gt.hasDP()) {
numOfReads -= gt.getDP();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,48 @@ public static int[] getAttributeAsIntArray(final Genotype genotype, final String
return attributeValueToIntArray(genotype.getExtendedAttribute(key), key, defaultValue, missingValue);
}


/**
* Get Long attribute from a variant context.
*
* @param variantContext the target variant-context.
* @param attribute the name of the attribute containing the Long value.
* @return never {@code null}.
* @throws IllegalArgumentException if {@code variantContext} is {@code null} or {@code key} is {@code null}.
*/
public static Long getAttributeAsLong(final VariantContext variantContext, final String attribute, final Long defaultValue) {
Utils.nonNull(variantContext);
Utils.nonNull(attribute);
Object x = variantContext.getAttribute(attribute);
if ( x == null || x.equals(VCFConstants.MISSING_VALUE_v4) ) return defaultValue;
if ( x instanceof Number ) return ((Number) x).longValue();
return Long.valueOf((String)x); // throws an exception if this isn't a string
}

/**
* Composes the Long List from a variant context.
*
* @param variantContext the target variant-context.
* @param attribute the name of the attribute containing the Long list.
* @return never {@code null}.
* @throws IllegalArgumentException if {@code variantContext} is {@code null} or {@code key} is {@code null}.
*/
public static List<Long> getAttributeAsLongList(final VariantContext variantContext, final String attribute, final Long defaultValue) {
Utils.nonNull(variantContext);
Utils.nonNull(attribute);
return variantContext.getAttributeAsList(attribute).stream().map(
x -> {
if (x == null || x.equals(VCFConstants.MISSING_VALUE_v4)) {
return defaultValue;
} else if (x instanceof Number) {
return ((Number) x).longValue();
} else {
return Long.valueOf((String)x); // throws an exception if this isn't a string
}
}
).collect(Collectors.toList());
}

private static double[] attributeValueToDoubleArray(final Object value, final String key, final Supplier<double[]> defaultResult, final double missingValue) {
Utils.nonNull(key);
final ToDoubleFunction<Object> doubleConverter = o -> {
Expand Down Expand Up @@ -167,6 +209,7 @@ private static int[] attributeValueToIntArray(final Object value, final String k
}
}


/**
* Returns an attribute as a string.
* @param genotype the source genotype.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,8 +215,32 @@ public void testCombineAndFinalize() {
Assert.assertEquals(Double.parseDouble((String)output.get("MQ")), 59.01);
}

@Test
public void testCombineAndFinalizeHighMQSquared() {
final List<Allele> vcAlleles = Arrays.asList(Allele.create("A", true), Allele.create("T", false));
final List<ReducibleAnnotationData<?>> combinedVCdata = new ArrayList<>();
// Test that updating the annotation works when Integer.MAX_VALUE is exceeded, both for small and large updates:
combinedVCdata.add(new ReducibleAnnotationData<>("10125000000,5000000")); //5,000,000 MQ45 reads
combinedVCdata.add(new ReducibleAnnotationData<>("2601,1")); //1 MQ51 read
combinedVCdata.add(new ReducibleAnnotationData<>("1800000000,500000")); //500,000 MQ60 reads

final RMSMappingQuality annotator = RMSMappingQuality.getInstance();

final Map<String, Object> combined = annotator.combineRawData(vcAlleles, combinedVCdata);
final String combinedListString = (String)combined.get(annotator.getRawKeyName());
Assert.assertEquals(combinedListString, "11925002601,5500001");

final VariantContext vc = new VariantContextBuilder(makeVC())
.attribute(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, combinedListString)
.make();
final VariantContext originalVC = null;
final Map<String, Object> output = new RMSMappingQuality().finalizeRawData(vc, originalVC);
Assert.assertEquals(Double.parseDouble((String)output.get("MQ")), 46.56);
}

@Test
public void testFinalizeRawData(){
// NOTE: RMSMappingQuality should ignore homRef depth of 20 and use only the 13 variants to calculate score.
final VariantContext vc = new VariantContextBuilder(makeVC())
.attribute(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "43732,13")
.attribute(VCFConstants.DEPTH_KEY, 20)
Expand All @@ -226,6 +250,17 @@ public void testFinalizeRawData(){
Assert.assertEquals(output.get("MQ"), "58.00");
}

@Test
public void testFinalizeHighMQSquaredRawData(){
// Test that RMS Mapping Quality is correctly computed when Integer.MAX_VALUE is exceeded.
final VariantContext vc = new VariantContextBuilder(makeVC())
.attribute(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "3415207168,1749038")
.make();
final VariantContext originalVC = null;
final Map<String, Object> output = new RMSMappingQuality().finalizeRawData(vc, originalVC);
Assert.assertEquals(output.get("MQ"), "44.19");
}

@Test
public void testFinalizeRawMQ(){
final VariantContext vc = new VariantContextBuilder(makeVC())
Expand Down

0 comments on commit 349d423

Please sign in to comment.