From 349d4230ae6b2d4372dff6aa6b13eb895e8e01ec Mon Sep 17 00:00:00 2001 From: TedBrookings Date: Wed, 28 Nov 2018 11:07:39 -0500 Subject: [PATCH] Fix integer overflow bug in RMSMappingQuality (#5435) Updated RMS Mapping quality annotations to be of type long/Long instead of int/Integer. With int types, a large cohort could overflow Integer.MAX_VALUE when handling sum of squared mapping qualities. With long types this should not be a problem until we have off-world colonies. This resolves issue 5433. Attribute getters placed in GATKProtectedVariantContextUtils. --- .../walkers/annotator/RMSMappingQuality.java | 67 ++++++++++--------- .../GATKProtectedVariantContextUtils.java | 43 ++++++++++++ .../annotator/RMSMappingQualityUnitTest.java | 35 ++++++++++ 3 files changed, 113 insertions(+), 32 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQuality.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQuality.java index a998c1c3823..f3800d6d4d8 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQuality.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQuality.java @@ -25,8 +25,12 @@ import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; import java.util.*; +import java.util.stream.Collectors; import java.util.stream.IntStream; +import static org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils.getAttributeAsLong; +import static org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils.getAttributeAsLongList; + /** * Root Mean Square of the mapping quality of reads across all samples. @@ -85,7 +89,7 @@ public Map annotateRawData(final ReferenceContext ref, } final Map annotations = new HashMap<>(); - final ReducibleAnnotationData> myData = new ReducibleAnnotationData<>(null); + final ReducibleAnnotationData> myData = new ReducibleAnnotationData<>(null); calculateRawData(vc, likelihoods, myData); final String annotationString = makeRawAnnotationString(vc.getAlleles(), myData.getAttributeMap()); annotations.put(getRawKeyName(), annotationString); @@ -108,8 +112,8 @@ public Map combineRawData(final List vcAlleles, final Li return annotations; } - public String makeRawAnnotationString(final List vcAlleles, final Map> perAlleleData) { - return String.format("%d,%d", perAlleleData.get(Allele.NO_CALL).get(0), perAlleleData.get(Allele.NO_CALL).get(1)); + private String makeRawAnnotationString(final List vcAlleles, final Map> perAlleleData) { + return String.format("%d,%d", perAlleleData.get(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX), perAlleleData.get(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX)); } @Override @@ -127,17 +131,17 @@ else if (vc.hasAttribute(getDeprecatedRawKeyName())) { "that may not have compatible annotations with this GATK version. Attempting to reformat MQ data."); final String rawMQdepth = vc.getAttributeAsString("MQ_DP",null); if (rawMQdepth == null) { - throw new UserException.BadInput("MQ annotation data is not properly formatted. This version expects an " + - "int tuple of sum of squared MQ values and total reads over variant genotypes."); + throw new UserException.BadInput("MQ annotation data is not properly formatted. This version expects a " + + "long tuple of sum of squared MQ values and total reads over variant genotypes."); } - rawMQdata = Math.round(Double.parseDouble(rawMQdata)) + "," + rawMQdepth; //deprecated format was double so it needs to be converted to int + rawMQdata = Math.round(Double.parseDouble(rawMQdata)) + "," + rawMQdepth; //deprecated format was double so it needs to be converted to long } else { logger.warn("MQ annotation data is not properly formatted. This GATK version expects key " - + getRawKeyName() + " with an int tuple of sum of squared MQ values and total reads over variant " + + getRawKeyName() + " with an long tuple of sum of squared MQ values and total reads over variant " + "genotypes as the value. Attempting to use deprecated MQ calculation."); - final int numOfReads = getNumOfReads(vc, null); - rawMQdata = Math.round(Double.parseDouble(rawMQdata)) + "," + numOfReads; //deprecated format was double so it needs to be converted to int + final long numOfReads = getNumOfReads(vc, null); + rawMQdata = Math.round(Double.parseDouble(rawMQdata)) + "," + numOfReads; //deprecated format was double so it needs to be converted to long } } else { @@ -147,43 +151,43 @@ else if (vc.hasAttribute(getDeprecatedRawKeyName())) { return new HashMap<>(); } - ReducibleAnnotationData> myData = new ReducibleAnnotationData(rawMQdata); + ReducibleAnnotationData> myData = new ReducibleAnnotationData(rawMQdata); parseRawDataString(myData); final String annotationString = makeFinalizedAnnotationString(myData.getAttribute(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX), myData.getAttribute(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX)); return Collections.singletonMap(getKeyNames().get(0), annotationString); } - public String makeFinalizedAnnotationString(final int numOfReads, final int sumOfSquaredMQs) { + private String makeFinalizedAnnotationString(final long numOfReads, final long sumOfSquaredMQs) { return String.format("%.2f", Math.sqrt(sumOfSquaredMQs/(double)numOfReads)); } /** - * Combine the int tuples: sum of squared mapping quality and read counts over variant genotypes + * Combine the long tuples: sum of squared mapping quality and read counts over variant genotypes * Since this is not an allele-specific annotation, store the result with the NO_CALL allele key. * @param toAdd new data * @param combined passed in as previous data, modified to store the combined result */ - public void combineAttributeMap(ReducibleAnnotationData> toAdd, ReducibleAnnotationData> combined) { + private void combineAttributeMap(ReducibleAnnotationData> toAdd, ReducibleAnnotationData> combined) { if (combined.getAttribute(Allele.NO_CALL) != null) { - combined.putAttribute(Allele.NO_CALL, Arrays.asList(combined.getAttribute(Allele.NO_CALL).get(0) + toAdd.getAttribute(Allele.NO_CALL).get(0), - combined.getAttribute(Allele.NO_CALL).get(1) + toAdd.getAttribute(Allele.NO_CALL).get(1))); + combined.putAttribute(Allele.NO_CALL, Arrays.asList(combined.getAttribute(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX) + toAdd.getAttribute(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX), + combined.getAttribute(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX) + toAdd.getAttribute(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX))); } else { combined.putAttribute(Allele.NO_CALL, toAdd.getAttribute(Allele.NO_CALL)); } } @SuppressWarnings({"unchecked", "rawtypes"})//FIXME - public void calculateRawData(final VariantContext vc, + private void calculateRawData(final VariantContext vc, final ReadLikelihoods likelihoods, final ReducibleAnnotationData rawAnnotations){ - //GATK3.5 had a double, but change this to an int for the tuple representation (square sum, read count) - int squareSum = 0; - int numReadsUsed = 0; + //GATK3.5 had a double, but change this to an long for the tuple representation (square sum, read count) + long squareSum = 0; + long numReadsUsed = 0; for (int i = 0; i < likelihoods.numberOfSamples(); i++) { for (final GATKRead read : likelihoods.sampleReads(i)) { - int mq = read.getMappingQuality(); + long mq = read.getMappingQuality(); if (mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE) { squareSum += mq * mq; numReadsUsed++; @@ -203,7 +207,7 @@ public Map annotate(final ReferenceContext ref, } final Map annotations = new HashMap<>(); - final ReducibleAnnotationData> myData = new ReducibleAnnotationData<>(null); + final ReducibleAnnotationData> myData = new ReducibleAnnotationData<>(null); calculateRawData(vc, likelihoods, myData); final String annotationString = makeFinalizedAnnotationString(myData.getAttribute(Allele.NO_CALL).get(TOTAL_DEPTH_INDEX), myData.getAttribute(Allele.NO_CALL).get(SUM_OF_SQUARES_INDEX)); annotations.put(getKeyNames().get(0), annotationString); @@ -227,7 +231,7 @@ public VariantContext finalizeRawMQ(final VariantContext vc) { if (rawMQdata == null) { return vc; } else { - final List SSQMQandDP = parseRawDataString(rawMQdata); + final List SSQMQandDP = parseRawDataString(rawMQdata); final double rms = Math.sqrt(SSQMQandDP.get(SUM_OF_SQUARES_INDEX) / (double)SSQMQandDP.get(TOTAL_DEPTH_INDEX)); final String finalizedRMSMAppingQuality = formattedValue(rms); return new VariantContextBuilder(vc) @@ -237,26 +241,25 @@ public VariantContext finalizeRawMQ(final VariantContext vc) { } } - protected void parseRawDataString(ReducibleAnnotationData> myData) { + private void parseRawDataString(ReducibleAnnotationData> myData) { myData.putAttribute(Allele.NO_CALL, parseRawDataString(myData.getRawData())); } //TODO once the AS annotations have been added genotype gvcfs this can be removed for a more generic approach - private static List parseRawDataString(String rawDataString) { + private static List parseRawDataString(String rawDataString) { try { final String[] parsed = rawDataString.split(","); if (parsed.length != NUM_LIST_ENTRIES) { throw new UserException.BadInput("Raw value for annotation has " + parsed.length + " values, expected " + NUM_LIST_ENTRIES); } - final int squareSum = Integer.parseInt(parsed[SUM_OF_SQUARES_INDEX]); - final int totalDP = Integer.parseInt(parsed[TOTAL_DEPTH_INDEX]); + final long squareSum = Long.parseLong(parsed[SUM_OF_SQUARES_INDEX]); + final long totalDP = Long.parseLong(parsed[TOTAL_DEPTH_INDEX]); return Arrays.asList(squareSum,totalDP); } catch (final NumberFormatException e) { throw new UserException.BadInput("malformed " + GATKVCFConstants.RAW_RMS_MAPPING_QUALITY_KEY + " annotation: " + rawDataString); } } - /** * * @return the number of reads at the given site, trying first {@Link GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY}, @@ -266,24 +269,24 @@ private static List parseRawDataString(String rawDataString) { * @throws UserException.BadInput if the {@link VCFConstants#DEPTH_KEY} is missing or if the calculated depth is <= 0 */ @VisibleForTesting - static int getNumOfReads(final VariantContext vc, + static long getNumOfReads(final VariantContext vc, final ReadLikelihoods likelihoods) { if(vc.hasAttribute(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY)) { - List MQtuple = vc.getAttributeAsIntList(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY,0); + List MQtuple = getAttributeAsLongList(vc, GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY,0L); if (MQtuple.get(TOTAL_DEPTH_INDEX) > 0) { return MQtuple.get(TOTAL_DEPTH_INDEX); } } - int numOfReads = 0; + long numOfReads = 0; if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) { - numOfReads = vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, -1); + numOfReads = getAttributeAsLong(vc, VCFConstants.DEPTH_KEY, -1L); if(vc.hasGenotypes()) { for(final Genotype gt : vc.getGenotypes()) { if(gt.isHomRef()) { //site-level DP contribution will come from MIN_DP for gVCF-called reference variants or DP for BP resolution if (gt.hasExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY)) { - numOfReads -= Integer.parseInt(gt.getExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY).toString()); + numOfReads -= Long.parseLong(gt.getExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY).toString()); } else if (gt.hasDP()) { numOfReads -= gt.getDP(); } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtils.java index 35a5c2b86dd..805cdc307df 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtils.java @@ -93,6 +93,48 @@ public static int[] getAttributeAsIntArray(final Genotype genotype, final String return attributeValueToIntArray(genotype.getExtendedAttribute(key), key, defaultValue, missingValue); } + + /** + * Get Long attribute from a variant context. + * + * @param variantContext the target variant-context. + * @param attribute the name of the attribute containing the Long value. + * @return never {@code null}. + * @throws IllegalArgumentException if {@code variantContext} is {@code null} or {@code key} is {@code null}. + */ + public static Long getAttributeAsLong(final VariantContext variantContext, final String attribute, final Long defaultValue) { + Utils.nonNull(variantContext); + Utils.nonNull(attribute); + Object x = variantContext.getAttribute(attribute); + if ( x == null || x.equals(VCFConstants.MISSING_VALUE_v4) ) return defaultValue; + if ( x instanceof Number ) return ((Number) x).longValue(); + return Long.valueOf((String)x); // throws an exception if this isn't a string + } + + /** + * Composes the Long List from a variant context. + * + * @param variantContext the target variant-context. + * @param attribute the name of the attribute containing the Long list. + * @return never {@code null}. + * @throws IllegalArgumentException if {@code variantContext} is {@code null} or {@code key} is {@code null}. + */ + public static List getAttributeAsLongList(final VariantContext variantContext, final String attribute, final Long defaultValue) { + Utils.nonNull(variantContext); + Utils.nonNull(attribute); + return variantContext.getAttributeAsList(attribute).stream().map( + x -> { + if (x == null || x.equals(VCFConstants.MISSING_VALUE_v4)) { + return defaultValue; + } else if (x instanceof Number) { + return ((Number) x).longValue(); + } else { + return Long.valueOf((String)x); // throws an exception if this isn't a string + } + } + ).collect(Collectors.toList()); + } + private static double[] attributeValueToDoubleArray(final Object value, final String key, final Supplier defaultResult, final double missingValue) { Utils.nonNull(key); final ToDoubleFunction doubleConverter = o -> { @@ -167,6 +209,7 @@ private static int[] attributeValueToIntArray(final Object value, final String k } } + /** * Returns an attribute as a string. * @param genotype the source genotype. diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQualityUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQualityUnitTest.java index ff0f1b159cd..dd693126fd0 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQualityUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/annotator/RMSMappingQualityUnitTest.java @@ -215,8 +215,32 @@ public void testCombineAndFinalize() { Assert.assertEquals(Double.parseDouble((String)output.get("MQ")), 59.01); } + @Test + public void testCombineAndFinalizeHighMQSquared() { + final List vcAlleles = Arrays.asList(Allele.create("A", true), Allele.create("T", false)); + final List> combinedVCdata = new ArrayList<>(); + // Test that updating the annotation works when Integer.MAX_VALUE is exceeded, both for small and large updates: + combinedVCdata.add(new ReducibleAnnotationData<>("10125000000,5000000")); //5,000,000 MQ45 reads + combinedVCdata.add(new ReducibleAnnotationData<>("2601,1")); //1 MQ51 read + combinedVCdata.add(new ReducibleAnnotationData<>("1800000000,500000")); //500,000 MQ60 reads + + final RMSMappingQuality annotator = RMSMappingQuality.getInstance(); + + final Map combined = annotator.combineRawData(vcAlleles, combinedVCdata); + final String combinedListString = (String)combined.get(annotator.getRawKeyName()); + Assert.assertEquals(combinedListString, "11925002601,5500001"); + + final VariantContext vc = new VariantContextBuilder(makeVC()) + .attribute(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, combinedListString) + .make(); + final VariantContext originalVC = null; + final Map output = new RMSMappingQuality().finalizeRawData(vc, originalVC); + Assert.assertEquals(Double.parseDouble((String)output.get("MQ")), 46.56); + } + @Test public void testFinalizeRawData(){ + // NOTE: RMSMappingQuality should ignore homRef depth of 20 and use only the 13 variants to calculate score. final VariantContext vc = new VariantContextBuilder(makeVC()) .attribute(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "43732,13") .attribute(VCFConstants.DEPTH_KEY, 20) @@ -226,6 +250,17 @@ public void testFinalizeRawData(){ Assert.assertEquals(output.get("MQ"), "58.00"); } + @Test + public void testFinalizeHighMQSquaredRawData(){ + // Test that RMS Mapping Quality is correctly computed when Integer.MAX_VALUE is exceeded. + final VariantContext vc = new VariantContextBuilder(makeVC()) + .attribute(GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "3415207168,1749038") + .make(); + final VariantContext originalVC = null; + final Map output = new RMSMappingQuality().finalizeRawData(vc, originalVC); + Assert.assertEquals(output.get("MQ"), "44.19"); + } + @Test public void testFinalizeRawMQ(){ final VariantContext vc = new VariantContextBuilder(makeVC())