Change raw MQ to a tuple of (sumSquaredMQs, totalDepth) for better ac… #5237

Closed
wants to merge 2 commits into from
2 changes: 1 addition & 1 deletion build.gradle
@@ -64,7 +64,7 @@ final sparkVersion = System.getProperty('spark.version', '2.2.0')
final hadoopVersion = System.getProperty('hadoop.version', '2.8.2')
final hadoopBamVersion = System.getProperty('hadoopBam.version','7.10.0')
final tensorflowVersion = System.getProperty('tensorflow.version','1.4.0')
final genomicsdbVersion = System.getProperty('genomicsdb.version','0.9.2-proto-3.0.0-beta-1+ab5fbe92900259')
final genomicsdbVersion = System.getProperty('genomicsdb.version','0.10.0-proto-3.0.0-beta-1+bdce8be25b873')
final testNGVersion = '6.11'
// Using the shaded version to avoid conflicts between its protobuf dependency
// and that of Hadoop/Spark (either the one we reference explicitly, or the one
@@ -1,7 +1,5 @@
package org.broadinstitute.hellbender.engine;

import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.*;
@@ -22,6 +20,12 @@
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher;

import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import com.googlecode.protobuf.format.JsonFormat;
import com.intel.genomicsdb.model.GenomicsDBVidMapProto;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;

import java.io.File;
import java.io.IOException;
import java.nio.channels.SeekableByteChannel;
@@ -32,6 +36,10 @@
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.io.FileReader;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Map;

/**
* Enables traversals and queries over sources of Features, which are metadata associated with a location
@@ -435,9 +443,108 @@ private static GenomicsDBExportConfiguration.ExportConfiguration createExportCon
exportConfigurationBuilder.setGenerateArrayNameFromPartitionBounds(true);
}

//Modify combine operations for INFO fields using the Protobuf API
//Parse the vid json and create an in-memory Protobuf structure representing the
//information in the JSON file
GenomicsDBVidMapProto.VidMappingPB vidMapPB = null;
try {
vidMapPB = getProtobufVidMappingFromJsonFile(vidmapJson);
}
catch (final IOException e) {
throw new UserException("Could not open vid json file "+vidmapJson, e);
}

//In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects
//Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store
//We iterate over the list and create a field name to list index map
HashMap<String, Integer> fieldNameToIndexInVidFieldsList =
getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB);

//Update combine operations for GnarlyGenotyper
vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, "MQ_DP", "sum");
vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, "QUALapprox", "sum");
vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, "VarDP", "sum");


vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "element_wise_sum");

if(vidMapPB != null) {
//Use rebuilt vidMap in exportConfiguration
//NOTE: this does NOT update the JSON file, the vidMapPB is a temporary structure that's passed to
//C++ modules of GenomicsDB for this specific query. Other queries will continue to use the information
//in the JSON file
exportConfigurationBuilder.setVidMapping(vidMapPB);
}


return exportConfigurationBuilder.build();
}

/**
* Parse the vid json and create an in-memory Protobuf structure representing the
* information in the JSON file
* @param vidmapJson vid JSON file
* @return Protobuf object
*/
public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFile(final File vidmapJson)
throws IOException {
GenomicsDBVidMapProto.VidMappingPB.Builder vidMapBuilder = GenomicsDBVidMapProto.VidMappingPB.newBuilder();
JsonFormat.merge(new FileReader(vidmapJson), vidMapBuilder);
return vidMapBuilder.build();
}

/**
* In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects
* Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store
* We iterate over the list and create a field name to list index map
* @param vidMapPB Protobuf vid mapping object
* @return map from field name to index in vidMapPB.fields list
*/
public static HashMap<String, Integer> getFieldNameToListIndexInProtobufVidMappingObject(
final GenomicsDBVidMapProto.VidMappingPB vidMapPB) {
HashMap<String, Integer> fieldNameToIndexInVidFieldsList = new HashMap<String, Integer>();
for(int fieldIdx=0;fieldIdx<vidMapPB.getFieldsCount();++fieldIdx)
fieldNameToIndexInVidFieldsList.put(vidMapPB.getFields(fieldIdx).getName(), fieldIdx);
return fieldNameToIndexInVidFieldsList;
}

/**
* Update vid Protobuf object with new combine operation for field
* @param vidMapPB input vid object
* @param fieldNameToIndexInVidFieldsList name to index in list
* @param fieldName INFO field name
* @param newCombineOperation combine op ("sum", "median")
* @return updated vid Protobuf object if the field exists, else the input object unchanged
*/
public static GenomicsDBVidMapProto.VidMappingPB updateINFOFieldCombineOperation(
final GenomicsDBVidMapProto.VidMappingPB vidMapPB,
final Map<String, Integer> fieldNameToIndexInVidFieldsList,
final String fieldName,
final String newCombineOperation)
{
int fieldIdx = fieldNameToIndexInVidFieldsList.containsKey(fieldName)
? fieldNameToIndexInVidFieldsList.get(fieldName) : -1;
if(fieldIdx >= 0) {
//Would need to rebuild vidMapPB - so get top level builder first
GenomicsDBVidMapProto.VidMappingPB.Builder updatedVidMapBuilder = vidMapPB.toBuilder();
//To update the list element corresponding to fieldName, we get the builder for that specific list element
GenomicsDBVidMapProto.GenomicsDBFieldInfo.Builder fieldBuilder =
updatedVidMapBuilder.getFieldsBuilder(fieldIdx);
//And update its combine operation
fieldBuilder.setVCFFieldCombineOperation(newCombineOperation);

//Shorter way of writing the same operation
/*
updatedVidMapBuilder.getFieldsBuilder(fieldIdx)
.setVCFFieldCombineOperation(newCombineOperation);
*/

//Rebuild full vidMap
return updatedVidMapBuilder.build();
}
return vidMapPB;
}

/**
* Returns the sequence dictionary for this source of Features.
* Uses the dictionary from the VCF header (if present) for variant inputs,
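Taken together, the three new public helpers above let a caller rebuild the GenomicsDB vid mapping in memory and override a field's combine operation for a single query, without touching the vid JSON on disk. A minimal sketch of that flow, written as if inside the same class (the file path below is illustrative, not from this PR):

// Sketch only: assumes a vidmap JSON produced by GenomicsDBImport exists at this path.
final File vidmapJson = new File("/path/to/workspace/vidmap.json");
try {
    // Parse the vid JSON into an in-memory Protobuf object
    GenomicsDBVidMapProto.VidMappingPB vidMapPB = getProtobufVidMappingFromJsonFile(vidmapJson);
    // Build the field-name -> list-index lookup
    final HashMap<String, Integer> nameToIndex =
            getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB);
    // Combine the raw (sumSquaredMQs, totalDepth) tuple element-wise across samples
    vidMapPB = updateINFOFieldCombineOperation(vidMapPB, nameToIndex,
            GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "element_wise_sum");
    // The rebuilt mapping is then attached to the export configuration for this query only, e.g.
    // exportConfigurationBuilder.setVidMapping(vidMapPB);
} catch (final IOException e) {
    throw new UserException("Could not open vid json file " + vidmapJson, e);
}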
@@ -28,6 +28,7 @@
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
import java.net.URI;

@ExperimentalFeature
@DocumentedFeature
@@ -80,7 +81,7 @@ public final class FixCallSetSampleOrdering extends VariantWalker {

private VariantContextWriter writer;

private LinkedHashMap<String, Path> sampleNameMapFromGenomicsDBImport;
private LinkedHashMap<String, URI> sampleNameMapFromGenomicsDBImport;

private Map<Path, String> gvcfToHeaderSampleMap;

@@ -134,7 +135,10 @@ private Map<Path, String> loadGvcfToHeaderSampleMap() {
}

Map<Path, String> mapping = new HashMap<>();
final Set<Path> gvcfPathsFromSampleNameMap = new HashSet<>(sampleNameMapFromGenomicsDBImport.values());
final Set<URI> gvcfURIsFromSampleNameMap = new HashSet<>(sampleNameMapFromGenomicsDBImport.values());
final Set<Path> gvcfPathsFromSampleNameMap = new HashSet<>();
for(final URI currEntry : gvcfURIsFromSampleNameMap)
gvcfPathsFromSampleNameMap.add(IOUtils.getPath(currEntry.toString()));

try {
final List<String> lines = Files.readAllLines(IOUtils.getPath(gvcfToHeaderSampleMapFile));
@@ -216,7 +220,7 @@ private List<String> getBatchSortedList() {
Map<String, String> sampleNameInHeaderToSampleNameInMap = new HashMap<>();

for ( final String batchSample : batch ) {
final Path vcfPath = sampleNameMapFromGenomicsDBImport.get(batchSample);
final Path vcfPath = IOUtils.getPath(sampleNameMapFromGenomicsDBImport.get(batchSample).toString());
if ( vcfPath == null ) {
throw new GATKException("Hash lookup failed for sample " + batchSample);
}
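With this change the sample-name map produced by GenomicsDBImport holds java.net.URI values rather than Paths, so this tool converts back to an NIO Path only at the point of use via IOUtils.getPath(uri.toString()), as the hunks above show. A one-line illustration of that round-trip (the gs:// value is hypothetical):

// Illustrative only: a URI from the sample-name map turned back into a Path on demand.
final URI gvcfUri = URI.create("gs://my-bucket/sample1.g.vcf.gz");  // hypothetical map entry
final Path gvcfPath = IOUtils.getPath(gvcfUri.toString());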
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.genomicsdb;

import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.intel.genomicsdb.GenomicsDBUtils;
import com.intel.genomicsdb.importer.GenomicsDBImporter;
import com.intel.genomicsdb.importer.model.ChromosomeInterval;
import com.intel.genomicsdb.model.Coordinates;
@@ -44,7 +45,8 @@
import java.util.concurrent.*;
import java.util.function.Function;
import java.util.stream.Collectors;

import java.net.URI;
import java.net.URISyntaxException;

/**
* Import single-sample GVCFs into GenomicsDB before joint genotyping.
@@ -285,7 +287,7 @@ public int getDefaultCloudIndexPrefetchBufferSize() {
// each imported batch is then sorted, so if we have an unsorted list we'll end up with different global vs batch
// sorting.
// We preemptively sort here so we will have consistent sorting.
private SortedMap<String, Path> sampleNameToVcfPath = new TreeMap<>();
private SortedMap<String, URI> sampleNameToVcfPath = new TreeMap<>();

// Needed as smartMergeHeaders() returns a set of VCF header lines
private Set<VCFHeaderLine> mergedHeaderLines = null;
@@ -345,10 +347,15 @@ private void initializeHeaderAndSampleMappings() {
headers.add(header);

final String sampleName = header.getGenotypeSamples().get(0);
final Path previousPath = sampleNameToVcfPath.put(sampleName, variantPath);
if (previousPath != null) {
throw new UserException("Duplicate sample: " + sampleName + ". Sample was found in both "
+ variantPath.toUri() + " and " + previousPath.toUri() + ".");
try {
final URI previousPath = sampleNameToVcfPath.put(sampleName, new URI(variantPathString));
if (previousPath != null) {
throw new UserException("Duplicate sample: " + sampleName + ". Sample was found in both "
+ variantPath.toUri() + " and " + previousPath + ".");
}
}
catch(URISyntaxException e) {
throw new UserException("Malformed URI "+e.toString());
}
}
mergedHeaderLines = VCFUtils.smartMergeHeaders(headers, true);
@@ -361,7 +368,7 @@ private void initializeHeaderAndSampleMappings() {
//the resulting database will have incorrect sample names
//see https://github.com/broadinstitute/gatk/issues/3682 for more information
sampleNameToVcfPath = loadSampleNameMapFileInSortedOrder(IOUtils.getPath(sampleNameMapFile));
final Path firstHeaderPath = sampleNameToVcfPath.entrySet().iterator().next().getValue();
final Path firstHeaderPath = IOUtils.getPath(sampleNameToVcfPath.entrySet().iterator().next().getValue().toString());
final VCFHeader header = getHeaderFromPath(firstHeaderPath);
//getMetaDataInInputOrder() returns an ImmutableSet - LinkedHashSet is mutable and preserves ordering
mergedHeaderLines = new LinkedHashSet<VCFHeaderLine>(header.getMetaDataInInputOrder());
@@ -408,14 +415,14 @@ private static void assertGVCFHasOnlyOneSample(final String variantPath, final V
* @param sampleToFileMapPath path to the mapping file
* @return map of sample name to corresponding file, the map will be ordered according to the order in the input file
*/
public static LinkedHashMap<String, Path> loadSampleNameMapFile(final Path sampleToFileMapPath) {
public static LinkedHashMap<String, URI> loadSampleNameMapFile(final Path sampleToFileMapPath) {
try {
final List<String> lines = Files.readAllLines(sampleToFileMapPath);
if (lines.isEmpty()) {
throw new UserException.BadInput( "At least 1 sample is required but none were found in the sample mapping file");
}

final LinkedHashMap<String, Path> sampleToFilename = new LinkedHashMap<>();
final LinkedHashMap<String, URI> sampleToFilename = new LinkedHashMap<>();
for ( final String line : lines) {
final String[] split = line.split("\\t",-1);
if (split.length != 2) {
@@ -428,9 +435,14 @@ public static LinkedHashMap<String, Path> loadSampleNameMapFile(final Path sampl
}
final String sample = split[0];
final String path = split[1].trim();
final Path oldPath = sampleToFilename.put(sample, IOUtils.getPath(path));
if (oldPath != null){
throw new UserException.BadInput("Found two mappings for the same sample: " + sample + "\n" + path + "\n" + oldPath.toUri() );
try {
final URI oldPath = sampleToFilename.put(sample, new URI(path));
if (oldPath != null){
throw new UserException.BadInput("Found two mappings for the same sample: " + sample + "\n" + path + "\n" + oldPath );
}
}
catch(final URISyntaxException e) {
throw new UserException("Malformed URI "+e.toString());
}
}
return sampleToFilename;
@@ -451,7 +463,7 @@ public static LinkedHashMap<String, Path> loadSampleNameMapFile(final Path sampl
* @param sampleToFileMapPath path to the mapping file
* @return map of sample name to corresponding file, sorted by sample name
*/
public static SortedMap<String, Path> loadSampleNameMapFileInSortedOrder(final Path sampleToFileMapPath){
public static SortedMap<String, URI> loadSampleNameMapFileInSortedOrder(final Path sampleToFileMapPath){
return new TreeMap<>(loadSampleNameMapFile(sampleToFileMapPath));
}
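For reference, loadSampleNameMapFile still expects one tab-separated "sample<TAB>path" pair per line; the behavioral change here is that the second column is now stored as a java.net.URI and only converted to an NIO Path where a reader is actually opened. A hedged sketch of calling it from inside the tool (the file name and its contents are hypothetical):

// Hypothetical sample_map.tsv contents (tab-separated):
//   sample1    gs://my-bucket/sample1.g.vcf.gz
//   sample2    /local/gvcfs/sample2.g.vcf.gz
final LinkedHashMap<String, URI> sampleToUri =
        loadSampleNameMapFile(IOUtils.getPath("sample_map.tsv"));
for (final Map.Entry<String, URI> entry : sampleToUri.entrySet()) {
    // Convert lazily when a reader is needed, mirroring the import code above
    final Path gvcfPath = IOUtils.getPath(entry.getValue().toString());
}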

@@ -489,10 +501,10 @@ private void initializeInputPreloadExecutorService() {
}

private Map<String, FeatureReader<VariantContext>> createSampleToReaderMap(
final Map<String, Path> sampleNameToVcfPath, final int batchSize, final int index) {
final Map<String, URI> sampleNameToVcfPath, final int batchSize, final int index) {
// TODO: fix casting since it's really ugly
return inputPreloadExecutorService != null ?
getFeatureReadersInParallel((SortedMap<String, Path>) sampleNameToVcfPath, batchSize, index)
getFeatureReadersInParallel((SortedMap<String, URI>) sampleNameToVcfPath, batchSize, index)
: getFeatureReadersSerially(sampleNameToVcfPath, batchSize, index);
}

@@ -585,15 +597,15 @@ public Object onTraversalSuccess() {
* @return Feature readers to be imported in the current batch, sorted by sample name
*/
private SortedMap<String, FeatureReader<VariantContext>> getFeatureReadersInParallel(
final SortedMap<String, Path> sampleNametoPath, final int batchSize, final int lowerSampleIndex) {
final SortedMap<String, URI> sampleNametoPath, final int batchSize, final int lowerSampleIndex) {
final SortedMap<String, FeatureReader<VariantContext>> sampleToReaderMap = new TreeMap<>();
logger.info("Starting batch input file preload");
final Map<String, Future<FeatureReader<VariantContext>>> futures = new LinkedHashMap<>();
final List<String> sampleNames = new ArrayList<>(sampleNametoPath.keySet());
for(int i = lowerSampleIndex; i < sampleNametoPath.size() && i < lowerSampleIndex+batchSize; ++i) {
final String sampleName = sampleNames.get(i);
futures.put(sampleName, inputPreloadExecutorService.submit(() -> {
final Path variantPath = sampleNametoPath.get(sampleName);
final Path variantPath = IOUtils.getPath(sampleNametoPath.get(sampleName).toString());
try {
return new InitializedQueryWrapper(getReaderFromPath(variantPath), intervals.get(0));
} catch (final IOException e) {
@@ -616,13 +628,14 @@ private SortedMap<String, FeatureReader<VariantContext>> getFeatureReadersInPara
return sampleToReaderMap;
}

private SortedMap<String, FeatureReader<VariantContext>> getFeatureReadersSerially(final Map<String, Path> sampleNameToPath,
private SortedMap<String, FeatureReader<VariantContext>> getFeatureReadersSerially(final Map<String, URI> sampleNameToPath,
final int batchSize, final int lowerSampleIndex){
final SortedMap<String, FeatureReader<VariantContext>> sampleToReaderMap = new TreeMap<>();
final List<String> sampleNames = new ArrayList<>(sampleNameToPath.keySet());
for(int i = lowerSampleIndex; i < sampleNameToPath.size() && i < lowerSampleIndex+batchSize; ++i) {
final String sampleName = sampleNames.get(i);
final AbstractFeatureReader<VariantContext, LineIterator> reader = getReaderFromPath(sampleNameToPath.get(sampleName));
final AbstractFeatureReader<VariantContext, LineIterator> reader = getReaderFromPath(IOUtils.getPath(
sampleNameToPath.get(sampleName).toString()));
sampleToReaderMap.put(sampleName, reader);
}
logger.info("Importing batch " + this.batchCount + " with " + sampleToReaderMap.size() + " samples");
@@ -661,7 +674,7 @@ private File overwriteOrCreateWorkspace() {
}

if (!workspaceDir.exists()) {
final int ret = GenomicsDBImporter.createTileDBWorkspace(workspaceDir.getAbsolutePath());
final int ret = GenomicsDBUtils.createTileDBWorkspace(workspaceDir.getAbsolutePath(), false);
if (ret > 0) {
checkIfValidWorkspace(workspaceDir);
logger.info("Importing data to GenomicsDB workspace: " + workspaceDir);