Change raw MQ to a tuple of (sumSquaredMQs, totalDepth) for better ac… #5237

Closed
wants to merge 2 commits into from
2 changes: 1 addition & 1 deletion build.gradle
@@ -64,7 +64,7 @@ final sparkVersion = System.getProperty('spark.version', '2.2.0')
final hadoopVersion = System.getProperty('hadoop.version', '2.8.2')
final hadoopBamVersion = System.getProperty('hadoopBam.version','7.10.0')
final tensorflowVersion = System.getProperty('tensorflow.version','1.4.0')
final genomicsdbVersion = System.getProperty('genomicsdb.version','0.9.2-proto-3.0.0-beta-1+ab5fbe92900259')
final genomicsdbVersion = System.getProperty('genomicsdb.version','0.10.0-proto-3.0.0-beta-1+bdce8be25b873')
final testNGVersion = '6.11'
// Using the shaded version to avoid conflicts between its protobuf dependency
// and that of Hadoop/Spark (either the one we reference explicitly, or the one
@@ -1,7 +1,5 @@
package org.broadinstitute.hellbender.engine;

import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.*;
@@ -22,6 +20,12 @@
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher;

import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import com.googlecode.protobuf.format.JsonFormat;
import com.intel.genomicsdb.model.GenomicsDBVidMapProto;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;

import java.io.File;
import java.io.IOException;
import java.nio.channels.SeekableByteChannel;
@@ -32,6 +36,10 @@
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.io.FileReader;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Map;

/**
* Enables traversals and queries over sources of Features, which are metadata associated with a location
@@ -435,9 +443,108 @@ private static GenomicsDBExportConfiguration.ExportConfiguration createExportCon
exportConfigurationBuilder.setGenerateArrayNameFromPartitionBounds(true);
}

//Modify combine operations for INFO fields using the Protobuf API
//Parse the vid json and create an in-memory Protobuf structure representing the
//information in the JSON file
GenomicsDBVidMapProto.VidMappingPB vidMapPB = null;
try {
vidMapPB = getProtobufVidMappingFromJsonFile(vidmapJson);
}
catch (final IOException e) {
throw new UserException("Could not open vid json file "+vidmapJson, e);
}

//In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects
//Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store
//We iterate over the list and create a field name to list index map
HashMap<String, Integer> fieldNameToIndexInVidFieldsList =
getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB);

//Update combine operations for GnarlyGenotyper
vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, "MQ_DP", "sum");
vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, "QUALapprox", "sum");
vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, "VarDP", "sum");


vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "element_wise_sum");

if(vidMapPB != null) {
//Use rebuilt vidMap in exportConfiguration
//NOTE: this does NOT update the JSON file, the vidMapPB is a temporary structure that's passed to
//C++ modules of GenomicsDB for this specific query. Other queries will continue to use the information
//in the JSON file
exportConfigurationBuilder.setVidMapping(vidMapPB);
}


return exportConfigurationBuilder.build();
}

/**
* Parse the vid json and create an in-memory Protobuf structure representing the
* information in the JSON file
* @param vidmapJson vid JSON file
* @return Protobuf object
*/
public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFile(final File vidmapJson)
throws IOException {
GenomicsDBVidMapProto.VidMappingPB.Builder vidMapBuilder = GenomicsDBVidMapProto.VidMappingPB.newBuilder();
JsonFormat.merge(new FileReader(vidmapJson), vidMapBuilder);
return vidMapBuilder.build();
}

/**
* In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects
* Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store
* We iterate over the list and create a field name to list index map
* @param vidMapPB Protobuf vid mapping object
* @return map from field name to index in vidMapPB.fields list
*/
public static HashMap<String, Integer> getFieldNameToListIndexInProtobufVidMappingObject(
final GenomicsDBVidMapProto.VidMappingPB vidMapPB) {
HashMap<String, Integer> fieldNameToIndexInVidFieldsList = new HashMap<String, Integer>();
for(int fieldIdx=0;fieldIdx<vidMapPB.getFieldsCount();++fieldIdx)
fieldNameToIndexInVidFieldsList.put(vidMapPB.getFields(fieldIdx).getName(), fieldIdx);
return fieldNameToIndexInVidFieldsList;
}

/**
* Update vid Protobuf object with new combine operation for field
* @param vidMapPB input vid object
* @param fieldNameToIndexInVidFieldsList name to index in list
* @param fieldName INFO field name
* @param newCombineOperation combine op ("sum", "median")
* @return updated vid Protobuf object if the field exists, else the input object unchanged
*/
public static GenomicsDBVidMapProto.VidMappingPB updateINFOFieldCombineOperation(
final GenomicsDBVidMapProto.VidMappingPB vidMapPB,
final Map<String, Integer> fieldNameToIndexInVidFieldsList,
final String fieldName,
final String newCombineOperation)
{
int fieldIdx = fieldNameToIndexInVidFieldsList.containsKey(fieldName)
? fieldNameToIndexInVidFieldsList.get(fieldName) : -1;
if(fieldIdx >= 0) {
//Would need to rebuild vidMapPB - so get top level builder first
GenomicsDBVidMapProto.VidMappingPB.Builder updatedVidMapBuilder = vidMapPB.toBuilder();
//To update the list element corresponding to fieldName, we get the builder for that specific list element
GenomicsDBVidMapProto.GenomicsDBFieldInfo.Builder fieldBuilder =
updatedVidMapBuilder.getFieldsBuilder(fieldIdx);
//And update its combine operation
fieldBuilder.setVCFFieldCombineOperation(newCombineOperation);

//Shorter way of writing the same operation
/*
updatedVidMapBuilder.getFieldsBuilder(fieldIdx)
.setVCFFieldCombineOperation(newCombineOperation);
*/

//Rebuild full vidMap
return updatedVidMapBuilder.build();
}
return vidMapPB;
}

/**
* Returns the sequence dictionary for this source of Features.
* Uses the dictionary from the VCF header (if present) for variant inputs,
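Taken together, the three new public helpers above let a caller rebuild the GenomicsDB vid mapping in memory and override a field's combine operation for a single query, without touching the vid JSON on disk. A minimal sketch of that flow, written as if inside the same class (the file path below is illustrative, not from this PR):

// Sketch only: assumes a vidmap JSON produced by GenomicsDBImport exists at this path.
final File vidmapJson = new File("/path/to/workspace/vidmap.json");
try {
    // Parse the vid JSON into an in-memory Protobuf object
    GenomicsDBVidMapProto.VidMappingPB vidMapPB = getProtobufVidMappingFromJsonFile(vidmapJson);
    // Build the field-name -> list-index lookup
    final HashMap<String, Integer> nameToIndex =
            getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB);
    // Combine the raw (sumSquaredMQs, totalDepth) tuple element-wise across samples
    vidMapPB = updateINFOFieldCombineOperation(vidMapPB, nameToIndex,
            GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "element_wise_sum");
    // The rebuilt mapping is then attached to the export configuration for this query only, e.g.
    // exportConfigurationBuilder.setVidMapping(vidMapPB);
} catch (final IOException e) {
    throw new UserException("Could not open vid json file " + vidmapJson, e);
}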
@@ -28,6 +28,7 @@
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
import java.net.URI;

@ExperimentalFeature
@DocumentedFeature
@@ -80,7 +81,7 @@ public final class FixCallSetSampleOrdering extends VariantWalker {

private VariantContextWriter writer;

private LinkedHashMap<String, Path> sampleNameMapFromGenomicsDBImport;
private LinkedHashMap<String, URI> sampleNameMapFromGenomicsDBImport;

private Map<Path, String> gvcfToHeaderSampleMap;

@@ -134,7 +135,10 @@ private Map<Path, String> loadGvcfToHeaderSampleMap() {
}

Map<Path, String> mapping = new HashMap<>();
final Set<Path> gvcfPathsFromSampleNameMap = new HashSet<>(sampleNameMapFromGenomicsDBImport.values());
final Set<URI> gvcfURIsFromSampleNameMap = new HashSet<>(sampleNameMapFromGenomicsDBImport.values());
final Set<Path> gvcfPathsFromSampleNameMap = new HashSet<>();
for(final URI currEntry : gvcfURIsFromSampleNameMap)
gvcfPathsFromSampleNameMap.add(IOUtils.getPath(currEntry.toString()));

try {
final List<String> lines = Files.readAllLines(IOUtils.getPath(gvcfToHeaderSampleMapFile));
@@ -216,7 +220,7 @@ private List<String> getBatchSortedList() {
Map<String, String> sampleNameInHeaderToSampleNameInMap = new HashMap<>();

for ( final String batchSample : batch ) {
final Path vcfPath = sampleNameMapFromGenomicsDBImport.get(batchSample);
final Path vcfPath = IOUtils.getPath(sampleNameMapFromGenomicsDBImport.get(batchSample).toString());
if ( vcfPath == null ) {
throw new GATKException("Hash lookup failed for sample " + batchSample);
}
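With this change the sample-name map produced by GenomicsDBImport holds java.net.URI values rather than Paths, so this tool converts back to an NIO Path only at the point of use via IOUtils.getPath(uri.toString()), as the hunks above show. A one-line illustration of that round-trip (the gs:// value is hypothetical):

// Illustrative only: a URI from the sample-name map turned back into a Path on demand.
final URI gvcfUri = URI.create("gs://my-bucket/sample1.g.vcf.gz");  // hypothetical map entry
final Path gvcfPath = IOUtils.getPath(gvcfUri.toString());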
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.genomicsdb;

import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.intel.genomicsdb.GenomicsDBUtils;
import com.intel.genomicsdb.importer.GenomicsDBImporter;
import com.intel.genomicsdb.importer.model.ChromosomeInterval;
import com.intel.genomicsdb.model.Coordinates;
@@ -44,7 +45,8 @@
import java.util.concurrent.*;
import java.util.function.Function;
import java.util.stream.Collectors;

import java.net.URI;
import java.net.URISyntaxException;

/**
* Import single-sample GVCFs into GenomicsDB before joint genotyping.
@@ -285,7 +287,7 @@ public int getDefaultCloudIndexPrefetchBufferSize() {
// each imported batch is then sorted, so if we have an unsorted list we'll end up with different global vs batch
// sorting.
// We preemptively sort here so we will have consistent sorting.
private SortedMap<String, Path> sampleNameToVcfPath = new TreeMap<>();
private SortedMap<String, URI> sampleNameToVcfPath = new TreeMap<>();

// Needed as smartMergeHeaders() returns a set of VCF header lines
private Set<VCFHeaderLine> mergedHeaderLines = null;
@@ -345,10 +347,15 @@ private void initializeHeaderAndSampleMappings() {
headers.add(header);

final String sampleName = header.getGenotypeSamples().get(0);
final Path previousPath = sampleNameToVcfPath.put(sampleName, variantPath);
if (previousPath != null) {
throw new UserException("Duplicate sample: " + sampleName + ". Sample was found in both "
+ variantPath.toUri() + " and " + previousPath.toUri() + ".");
try {
final URI previousPath = sampleNameToVcfPath.put(sampleName, new URI(variantPathString));
if (previousPath != null) {
throw new UserException("Duplicate sample: " + sampleName + ". Sample was found in both "
+ variantPath.toUri() + " and " + previousPath + ".");
}
}
catch(URISyntaxException e) {
throw new UserException("Malformed URI "+e.toString());
}
}
mergedHeaderLines = VCFUtils.smartMergeHeaders(headers, true);
@@ -361,7 +368,7 @@ private void initializeHeaderAndSampleMappings() {
//the resulting database will have incorrect sample names
//see https://github.com/broadinstitute/gatk/issues/3682 for more information
sampleNameToVcfPath = loadSampleNameMapFileInSortedOrder(IOUtils.getPath(sampleNameMapFile));
final Path firstHeaderPath = sampleNameToVcfPath.entrySet().iterator().next().getValue();
final Path firstHeaderPath = IOUtils.getPath(sampleNameToVcfPath.entrySet().iterator().next().getValue().toString());
final VCFHeader header = getHeaderFromPath(firstHeaderPath);
//getMetaDataInInputOrder() returns an ImmutableSet - LinkedHashSet is mutable and preserves ordering
mergedHeaderLines = new LinkedHashSet<VCFHeaderLine>(header.getMetaDataInInputOrder());
@@ -408,14 +415,14 @@ private static void assertGVCFHasOnlyOneSample(final String variantPath, final V
* @param sampleToFileMapPath path to the mapping file
* @return map of sample name to corresponding file, the map will be ordered according to the order in the input file
*/
public static LinkedHashMap<String, Path> loadSampleNameMapFile(final Path sampleToFileMapPath) {
public static LinkedHashMap<String, URI> loadSampleNameMapFile(final Path sampleToFileMapPath) {
try {
final List<String> lines = Files.readAllLines(sampleToFileMapPath);
if (lines.isEmpty()) {
throw new UserException.BadInput( "At least 1 sample is required but none were found in the sample mapping file");
}

final LinkedHashMap<String, Path> sampleToFilename = new LinkedHashMap<>();
final LinkedHashMap<String, URI> sampleToFilename = new LinkedHashMap<>();
for ( final String line : lines) {
final String[] split = line.split("\\t",-1);
if (split.length != 2) {
@@ -428,9 +435,14 @@ public static LinkedHashMap<String, Path> loadSampleNameMapFile(final Path sampl
}
final String sample = split[0];
final String path = split[1].trim();
final Path oldPath = sampleToFilename.put(sample, IOUtils.getPath(path));
if (oldPath != null){
throw new UserException.BadInput("Found two mappings for the same sample: " + sample + "\n" + path + "\n" + oldPath.toUri() );
try {
final URI oldPath = sampleToFilename.put(sample, new URI(path));
if (oldPath != null){
throw new UserException.BadInput("Found two mappings for the same sample: " + sample + "\n" + path + "\n" + oldPath );
}
}
catch(final URISyntaxException e) {
throw new UserException("Malformed URI "+e.toString());
}
}
return sampleToFilename;
@@ -451,7 +463,7 @@ public static LinkedHashMap<String, Path> loadSampleNameMapFile(final Path sampl
* @param sampleToFileMapPath path to the mapping file
* @return map of sample name to corresponding file, sorted by sample name
*/
public static SortedMap<String, Path> loadSampleNameMapFileInSortedOrder(final Path sampleToFileMapPath){
public static SortedMap<String, URI> loadSampleNameMapFileInSortedOrder(final Path sampleToFileMapPath){
return new TreeMap<>(loadSampleNameMapFile(sampleToFileMapPath));
}
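For reference, loadSampleNameMapFile still expects one tab-separated "sample<TAB>path" pair per line; the behavioral change here is that the second column is now stored as a java.net.URI and only converted to an NIO Path where a reader is actually opened. A hedged sketch of calling it from inside the tool (the file name and its contents are hypothetical):

// Hypothetical sample_map.tsv contents (tab-separated):
//   sample1    gs://my-bucket/sample1.g.vcf.gz
//   sample2    /local/gvcfs/sample2.g.vcf.gz
final LinkedHashMap<String, URI> sampleToUri =
        loadSampleNameMapFile(IOUtils.getPath("sample_map.tsv"));
for (final Map.Entry<String, URI> entry : sampleToUri.entrySet()) {
    // Convert lazily when a reader is needed, mirroring the import code above
    final Path gvcfPath = IOUtils.getPath(entry.getValue().toString());
}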

@@ -489,10 +501,10 @@ private void initializeInputPreloadExecutorService() {
}

private Map<String, FeatureReader<VariantContext>> createSampleToReaderMap(
final Map<String, Path> sampleNameToVcfPath, final int batchSize, final int index) {
final Map<String, URI> sampleNameToVcfPath, final int batchSize, final int index) {
// TODO: fix casting since it's really ugly
return inputPreloadExecutorService != null ?
getFeatureReadersInParallel((SortedMap<String, Path>) sampleNameToVcfPath, batchSize, index)
getFeatureReadersInParallel((SortedMap<String, URI>) sampleNameToVcfPath, batchSize, index)
: getFeatureReadersSerially(sampleNameToVcfPath, batchSize, index);
}

@@ -585,15 +597,15 @@ public Object onTraversalSuccess() {
* @return Feature readers to be imported in the current batch, sorted by sample name
*/
private SortedMap<String, FeatureReader<VariantContext>> getFeatureReadersInParallel(
final SortedMap<String, Path> sampleNametoPath, final int batchSize, final int lowerSampleIndex) {
final SortedMap<String, URI> sampleNametoPath, final int batchSize, final int lowerSampleIndex) {
final SortedMap<String, FeatureReader<VariantContext>> sampleToReaderMap = new TreeMap<>();
logger.info("Starting batch input file preload");
final Map<String, Future<FeatureReader<VariantContext>>> futures = new LinkedHashMap<>();
final List<String> sampleNames = new ArrayList<>(sampleNametoPath.keySet());
for(int i = lowerSampleIndex; i < sampleNametoPath.size() && i < lowerSampleIndex+batchSize; ++i) {
final String sampleName = sampleNames.get(i);
futures.put(sampleName, inputPreloadExecutorService.submit(() -> {
final Path variantPath = sampleNametoPath.get(sampleName);
final Path variantPath = IOUtils.getPath(sampleNametoPath.get(sampleName).toString());
try {
return new InitializedQueryWrapper(getReaderFromPath(variantPath), intervals.get(0));
} catch (final IOException e) {
@@ -616,13 +628,14 @@ private SortedMap<String, FeatureReader<VariantContext>> getFeatureReadersInPara
return sampleToReaderMap;
}

private SortedMap<String, FeatureReader<VariantContext>> getFeatureReadersSerially(final Map<String, Path> sampleNameToPath,
private SortedMap<String, FeatureReader<VariantContext>> getFeatureReadersSerially(final Map<String, URI> sampleNameToPath,
final int batchSize, final int lowerSampleIndex){
final SortedMap<String, FeatureReader<VariantContext>> sampleToReaderMap = new TreeMap<>();
final List<String> sampleNames = new ArrayList<>(sampleNameToPath.keySet());
for(int i = lowerSampleIndex; i < sampleNameToPath.size() && i < lowerSampleIndex+batchSize; ++i) {
final String sampleName = sampleNames.get(i);
final AbstractFeatureReader<VariantContext, LineIterator> reader = getReaderFromPath(sampleNameToPath.get(sampleName));
final AbstractFeatureReader<VariantContext, LineIterator> reader = getReaderFromPath(IOUtils.getPath(
sampleNameToPath.get(sampleName).toString()));
sampleToReaderMap.put(sampleName, reader);
}
logger.info("Importing batch " + this.batchCount + " with " + sampleToReaderMap.size() + " samples");
@@ -661,7 +674,7 @@ private File overwriteOrCreateWorkspace() {
}

if (!workspaceDir.exists()) {
final int ret = GenomicsDBImporter.createTileDBWorkspace(workspaceDir.getAbsolutePath());
final int ret = GenomicsDBUtils.createTileDBWorkspace(workspaceDir.getAbsolutePath(), false);
if (ret > 0) {
checkIfValidWorkspace(workspaceDir);
logger.info("Importing data to GenomicsDB workspace: " + workspaceDir);