Skip to content

Commit

Permalink
Improve MQ calculation accuracy (#4969)
Browse files Browse the repository at this point in the history
Change raw MQ to a tuple of (sumSquaredMQs, totalDepth) for better accuracy where there are lots of uninformative reads or called single-sample variants with homRef genotypes.  Note that incorporating this change into a pipeline will require a corresponding version update for both GenomicsDBImport and GenotypeGVCFs.
  • Loading branch information
ldgauthier authored Oct 9, 2018
1 parent 158f7f7 commit 4dd7ba8
Show file tree
Hide file tree
Showing 39 changed files with 50,129 additions and 816 deletions.
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
package org.broadinstitute.hellbender.engine;

import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.*;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.logging.log4j.LogManager;
Expand All @@ -19,25 +20,18 @@
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher;

import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import com.googlecode.protobuf.format.JsonFormat;
import com.intel.genomicsdb.model.GenomicsDBVidMapProto;
import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.*;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;



/**
* Enables traversals and queries over sources of Features, which are metadata associated with a location
Expand Down Expand Up @@ -68,11 +62,6 @@
public final class FeatureDataSource<T extends Feature> implements GATKDataSource<T>, AutoCloseable {
private static final Logger logger = LogManager.getLogger(FeatureDataSource.class);

/**
* identifies a path as a GenomicsDB URI
*/
public static final String GENOMIC_DB_URI_SCHEME = "gendb://";

/**
* Feature reader used to retrieve records from our file
*/
Expand Down Expand Up @@ -288,14 +277,6 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
this.queryLookaheadBases = queryLookaheadBases;
}

/**
 * Tests whether a path string refers to a GenomicsDB workspace.
 *
 * @param path String containing the path to test (may be null)
 * @return true if path represents a GenomicsDB URI (i.e. starts with the
 *         {@code gendb://} scheme), otherwise false
 */
public static boolean isGenomicsDBPath(final String path) {
    if (path == null) {
        return false;
    }
    return path.startsWith(GENOMIC_DB_URI_SCHEME);
}

@SuppressWarnings("unchecked")
private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
Expand Down Expand Up @@ -368,7 +349,7 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
}
}

private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
if (!isGenomicsDBPath(path)) {
throw new IllegalArgumentException("Trying to create a GenomicsDBReader from a non-GenomicsDB input");
}
Expand Down Expand Up @@ -404,149 +385,6 @@ private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final St
}
}

/**
 * Builds the GenomicsDB export (query) configuration for a read over the given workspace.
 *
 * @param reference reference genome FASTA used for the query
 * @param callsetJson callset mapping JSON produced at import time
 * @param vidmapJson vid mapping JSON produced at import time
 * @param vcfHeader combined VCF header file for the workspace
 * @return a fully-populated ExportConfiguration for the GenomicsDB feature reader
 */
private static GenomicsDBExportConfiguration.ExportConfiguration createExportConfiguration(final File reference, final File workspace,
                                                                                           final File callsetJson, final File vidmapJson,
                                                                                           final File vcfHeader) {
    final GenomicsDBExportConfiguration.ExportConfiguration.Builder configBuilder =
            GenomicsDBExportConfiguration.ExportConfiguration.newBuilder()
                    .setWorkspace(workspace.getAbsolutePath())
                    .setReferenceGenome(reference.getAbsolutePath())
                    .setVidMappingFile(vidmapJson.getAbsolutePath())
                    .setCallsetMappingFile(callsetJson.getAbsolutePath())
                    .setVcfHeaderFilename(vcfHeader.getAbsolutePath())
                    .setProduceGTField(false)
                    .setProduceGTWithMinPLValueForSpanningDeletions(false)
                    .setSitesOnlyQuery(false)
                    .setMaxDiploidAltAllelesThatCanBeGenotyped(GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);

    // Multi-interval support creates one array (directory) per imported interval inside a single
    // workspace, named from the partition bounds (e.g. importing ("chr1", [1, 100M]) and
    // ("chr2", [1, 100M]) yields directories chr1$1$100M and chr2$1$100M). At read time only the
    // workspace is supplied; array names are discovered by scanning its entries so that queries
    // touch only the relevant arrays.
    //
    // Backward compatibility: older workspaces used the single constant array name
    // "genomicsdb_array". If that legacy directory exists, pass it through unchanged; otherwise
    // let GenomicsDB derive the array names from the partition bounds.
    final Path legacyArrayDir = Paths.get(workspace.getAbsolutePath(), GenomicsDBConstants.DEFAULT_ARRAY_NAME).toAbsolutePath();
    if (Files.exists(legacyArrayDir)) {
        configBuilder.setArrayName(GenomicsDBConstants.DEFAULT_ARRAY_NAME);
    } else {
        configBuilder.setGenerateArrayNameFromPartitionBounds(true);
    }

    // Load the vid JSON into an in-memory Protobuf structure so that INFO-field combine
    // operations can be customized per query without rewriting the JSON file on disk.
    //
    // References:
    //   GenomicsDB Protobuf structs: https://github.com/Intel-HLS/GenomicsDB/blob/master/src/resources/genomicsdb_vid_mapping.proto
    //   Protobuf generated Java code guide:
    //   https://developers.google.com/protocol-buffers/docs/javatutorial#the-protocol-buffer-api
    //   https://developers.google.com/protocol-buffers/docs/reference/java-generated
    GenomicsDBVidMapProto.VidMappingPB vidMapping;
    try {
        vidMapping = getProtobufVidMappingFromJsonFile(vidmapJson);
    } catch (final IOException e) {
        throw new UserException("Could not open vid json file " + vidmapJson, e);
    }

    // Index the vid mapping's field list by INFO-field name for O(1) lookup below.
    final HashMap<String, Integer> fieldIndexByName =
            getFieldNameToListIndexInProtobufVidMappingObject(vidMapping);

    // Example: set the MQ combine operation to median (median is already the default; this simply
    // demonstrates how a combine operation is overridden through the Protobuf API).
    vidMapping = updateINFOFieldCombineOperation(vidMapping, fieldIndexByName, "MQ", "median");
    if (vidMapping != null) {
        // The rebuilt vid mapping is a temporary, query-local structure handed to the C++
        // modules of GenomicsDB; the JSON file on disk is untouched and other queries still
        // read the original file.
        configBuilder.setVidMapping(vidMapping);
    }

    return configBuilder.build();
}

/**
 * Parse the vid json and create an in-memory Protobuf structure representing the
 * information in the JSON file
 *
 * @param vidmapJson vid JSON file
 * @return Protobuf object
 * @throws IOException if the file cannot be read or is not valid vid-mapping JSON
 */
public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFile(final File vidmapJson)
        throws IOException {
    final GenomicsDBVidMapProto.VidMappingPB.Builder vidMapBuilder = GenomicsDBVidMapProto.VidMappingPB.newBuilder();
    // Read with an explicit UTF-8 decoder: FileReader uses the platform default charset
    // (pre-Java 18), which can silently mis-decode JSON on non-UTF-8 systems.
    try (final BufferedReader reader = Files.newBufferedReader(vidmapJson.toPath(), StandardCharsets.UTF_8)) {
        JsonFormat.merge(reader, vidMapBuilder);
    }
    return vidMapBuilder.build();
}

/**
 * In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects.
 * Each GenomicsDBFieldInfo object contains information about a specific field in the
 * TileDB/GenomicsDB store; this builds a map from each field's name to its position
 * in that list.
 *
 * @param vidMapPB Protobuf vid mapping object
 * @return map from field name to index in vidMapPB.fields list
 */
public static HashMap<String, Integer> getFieldNameToListIndexInProtobufVidMappingObject(
        final GenomicsDBVidMapProto.VidMappingPB vidMapPB) {
    final HashMap<String, Integer> nameToListIndex = new HashMap<>();
    int listIndex = 0;
    for (final GenomicsDBVidMapProto.GenomicsDBFieldInfo fieldInfo : vidMapPB.getFieldsList()) {
        nameToListIndex.put(fieldInfo.getName(), listIndex++);
    }
    return nameToListIndex;
}

/**
 * Update vid Protobuf object with new combine operation for field
 *
 * @param vidMapPB input vid object
 * @param fieldNameToIndexInVidFieldsList name to index in list
 * @param fieldName INFO field name
 * @param newCombineOperation combine op ("sum", "median")
 * @return updated vid Protobuf object if field exists, else null
 */
public static GenomicsDBVidMapProto.VidMappingPB updateINFOFieldCombineOperation(
        final GenomicsDBVidMapProto.VidMappingPB vidMapPB,
        final Map<String, Integer> fieldNameToIndexInVidFieldsList,
        final String fieldName,
        final String newCombineOperation) {
    // Single map lookup instead of a containsKey/get pair; -1 signals "field not present".
    final int fieldIdx = fieldNameToIndexInVidFieldsList.getOrDefault(fieldName, -1);
    if (fieldIdx < 0) {
        // Field does not exist in the vid mapping: no update to apply.
        return null;
    }
    // Protobuf messages are immutable, so rebuild the top-level object via its builder and
    // update the combine operation through the targeted list element's sub-builder.
    final GenomicsDBVidMapProto.VidMappingPB.Builder updatedVidMapBuilder = vidMapPB.toBuilder();
    updatedVidMapBuilder.getFieldsBuilder(fieldIdx)
            .setVCFFieldCombineOperation(newCombineOperation);
    // Rebuild the full vid map with the modified field.
    return updatedVidMapBuilder.build();
}

/**
* Returns the sequence dictionary for this source of Features.
* Uses the dictionary from the VCF header (if present) for variant inputs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;

Expand Down Expand Up @@ -244,8 +245,8 @@ public void setFeatureCodecClass(final Class<FeatureCodec<T, ?>> featureCodecCla
* creates a name from the given filePath by finding the absolute path of the given input
*/
private static String makeIntoAbsolutePath(final String filePath){
if(FeatureDataSource.isGenomicsDBPath(filePath)){
return FeatureDataSource.GENOMIC_DB_URI_SCHEME + new File(filePath.replace(FeatureDataSource.GENOMIC_DB_URI_SCHEME,"")).getAbsolutePath();
if(GenomicsDBUtils.isGenomicsDBPath(filePath)){
return GenomicsDBUtils.GENOMIC_DB_URI_SCHEME + new File(filePath.replace(GenomicsDBUtils.GENOMIC_DB_URI_SCHEME,"")).getAbsolutePath();
} else if (URI.create(filePath).getScheme() != null) {
return IOUtils.getPath(filePath).toAbsolutePath().toUri().toString();
} else {
Expand Down
Loading

0 comments on commit 4dd7ba8

Please sign in to comment.