Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve MQ calculation accuracy #4969

Merged
merged 7 commits into from
Oct 9, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
package org.broadinstitute.hellbender.engine;

import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.*;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.logging.log4j.LogManager;
Expand All @@ -19,25 +20,18 @@
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher;

import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import com.googlecode.protobuf.format.JsonFormat;
import com.intel.genomicsdb.model.GenomicsDBVidMapProto;
import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.*;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;



/**
* Enables traversals and queries over sources of Features, which are metadata associated with a location
Expand Down Expand Up @@ -68,11 +62,6 @@
public final class FeatureDataSource<T extends Feature> implements GATKDataSource<T>, AutoCloseable {
private static final Logger logger = LogManager.getLogger(FeatureDataSource.class);

/**
* identifies a path as a GenomicsDB URI
*/
public static final String GENOMIC_DB_URI_SCHEME = "gendb://";

/**
* Feature reader used to retrieve records from our file
*/
Expand Down Expand Up @@ -288,14 +277,6 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
this.queryLookaheadBases = queryLookaheadBases;
}

/**
 * Tests whether the given path string uses the GenomicsDB URI scheme.
 *
 * @param path String containing the path to test (may be null)
 * @return true if path represents a GenomicsDB URI, otherwise false
 */
public static boolean isGenomicsDBPath(final String path) {
    if (path == null) {
        return false;
    }
    return path.startsWith(GENOMIC_DB_URI_SCHEME);
}

@SuppressWarnings("unchecked")
private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
Expand Down Expand Up @@ -368,7 +349,7 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
}
}

private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
if (!isGenomicsDBPath(path)) {
throw new IllegalArgumentException("Trying to create a GenomicsDBReader from a non-GenomicsDB input");
}
Expand Down Expand Up @@ -404,149 +385,6 @@ private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final St
}
}

/**
 * Builds the GenomicsDB export (query) configuration for a read over the given workspace.
 *
 * @param reference   reference genome FASTA file
 * @param workspace   GenomicsDB workspace directory
 * @param callsetJson callset mapping JSON file
 * @param vidmapJson  vid mapping JSON file
 * @param vcfHeader   template VCF header file
 * @return a fully-populated ExportConfiguration for this query
 */
private static GenomicsDBExportConfiguration.ExportConfiguration createExportConfiguration(final File reference, final File workspace,
                                                                                           final File callsetJson, final File vidmapJson,
                                                                                           final File vcfHeader) {
    // Base query configuration: workspace location, reference, mapping files, and genotyping limits.
    final GenomicsDBExportConfiguration.ExportConfiguration.Builder builder =
            GenomicsDBExportConfiguration.ExportConfiguration.newBuilder()
                    .setWorkspace(workspace.getAbsolutePath())
                    .setReferenceGenome(reference.getAbsolutePath())
                    .setVidMappingFile(vidmapJson.getAbsolutePath())
                    .setCallsetMappingFile(callsetJson.getAbsolutePath())
                    .setVcfHeaderFilename(vcfHeader.getAbsolutePath())
                    .setProduceGTField(false)
                    .setProduceGTWithMinPLValueForSpanningDeletions(false)
                    .setSitesOnlyQuery(false)
                    .setMaxDiploidAltAllelesThatCanBeGenotyped(GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);

    final Path legacyArrayPath = Paths.get(workspace.getAbsolutePath(), GenomicsDBConstants.DEFAULT_ARRAY_NAME).toAbsolutePath();

    // Multi-interval imports create one array (directory) per interval inside the workspace, named
    // from the partition bounds: importing ("chr1", [1, 100M]) and ("chr2", [1, 100M]) yields the
    // directories chr1$1$100M and chr2$1$100M. At read time only the workspace is supplied, and the
    // relevant arrays are discovered by scanning its entries (e.g. a query over ("chr2", 50, 50M)
    // touches only the second array).
    //
    // Older workspaces used the single constant array name genomicsdb_array. For backward
    // compatibility with such workspaces, pass that constant name to the GenomicsDBFeatureReader
    // when the directory exists; otherwise let GenomicsDB derive array names from partition bounds.
    if (Files.exists(legacyArrayPath)) {
        builder.setArrayName(GenomicsDBConstants.DEFAULT_ARRAY_NAME);
    } else {
        builder.setGenerateArrayNameFromPartitionBounds(true);
    }

    // Sample code snippet showing how combine operations for INFO fields can be specified via the
    // Protobuf API.
    //
    // References:
    //   GenomicsDB Protobuf structs: https://github.com/Intel-HLS/GenomicsDB/blob/master/src/resources/genomicsdb_vid_mapping.proto
    //   Protobuf generated Java code guide:
    //   https://developers.google.com/protocol-buffers/docs/javatutorial#the-protocol-buffer-api
    //   https://developers.google.com/protocol-buffers/docs/reference/java-generated

    // Parse the vid JSON into an in-memory Protobuf structure so the combine operations can be
    // adjusted for this query without rewriting the JSON file on disk.
    GenomicsDBVidMapProto.VidMappingPB vidMapPB;
    try {
        vidMapPB = getProtobufVidMappingFromJsonFile(vidmapJson);
    } catch (final IOException e) {
        throw new UserException("Could not open vid json file " + vidmapJson, e);
    }

    // vidMapPB.fields is a list of GenomicsDBFieldInfo objects, one per field in the
    // TileDB/GenomicsDB store; index them by field name for lookup.
    final HashMap<String, Integer> fieldIndexByName =
            getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB);

    // Example: set the MQ combine operation to median (median is already the default; this simply
    // illustrates the API).
    vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldIndexByName, "MQ", "median");
    if (vidMapPB != null) {
        // Use the rebuilt vid mapping for this query only. NOTE: this does NOT update the JSON
        // file — vidMapPB is a temporary structure passed to the C++ modules of GenomicsDB for
        // this specific query; other queries continue to use the information in the JSON file.
        builder.setVidMapping(vidMapPB);
    }

    return builder.build();
}

/**
 * Parse the vid JSON file and create an in-memory Protobuf structure representing the
 * information in the JSON file.
 *
 * @param vidmapJson vid JSON file
 * @return Protobuf vid mapping object built from the JSON contents
 * @throws IOException if the file cannot be opened, read, or parsed
 */
public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFile(final File vidmapJson)
        throws IOException {
    final GenomicsDBVidMapProto.VidMappingPB.Builder vidMapBuilder = GenomicsDBVidMapProto.VidMappingPB.newBuilder();
    // Files.newBufferedReader always decodes as UTF-8, whereas FileReader uses the
    // platform-default charset and can mis-decode non-ASCII content in the JSON.
    try (final Reader reader = Files.newBufferedReader(vidmapJson.toPath())) {
        JsonFormat.merge(reader, vidMapBuilder);
    }
    return vidMapBuilder.build();
}

/**
 * Builds a lookup from field name to that field's index in the vid mapping's fields list.
 *
 * In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects, each
 * describing a specific field in the TileDB/GenomicsDB store.
 *
 * @param vidMapPB Protobuf vid mapping object
 * @return map from field name to index in vidMapPB.fields list
 */
public static HashMap<String, Integer> getFieldNameToListIndexInProtobufVidMappingObject(
        final GenomicsDBVidMapProto.VidMappingPB vidMapPB) {
    final HashMap<String, Integer> nameToIndex = new HashMap<>();
    final int numFields = vidMapPB.getFieldsCount();
    for (int idx = 0; idx < numFields; idx++) {
        nameToIndex.put(vidMapPB.getFields(idx).getName(), idx);
    }
    return nameToIndex;
}

/**
 * Update the vid Protobuf object with a new combine operation for the given INFO field.
 *
 * @param vidMapPB input vid object
 * @param fieldNameToIndexInVidFieldsList map from field name to index in vidMapPB's fields list
 * @param fieldName INFO field name
 * @param newCombineOperation combine op ("sum", "median")
 * @return updated vid Protobuf object if the field exists, else null
 */
public static GenomicsDBVidMapProto.VidMappingPB updateINFOFieldCombineOperation(
        final GenomicsDBVidMapProto.VidMappingPB vidMapPB,
        final Map<String, Integer> fieldNameToIndexInVidFieldsList,
        final String fieldName,
        final String newCombineOperation) {
    // Single lookup instead of containsKey + get; indices are never null so -1 safely
    // signals "field not present".
    final int fieldIdx = fieldNameToIndexInVidFieldsList.getOrDefault(fieldName, -1);
    if (fieldIdx < 0) {
        return null;
    }
    // Protobuf messages are immutable, so rebuild the vid mapping via its builder, updating only
    // the combine operation on the targeted field's sub-builder.
    final GenomicsDBVidMapProto.VidMappingPB.Builder updatedVidMapBuilder = vidMapPB.toBuilder();
    updatedVidMapBuilder.getFieldsBuilder(fieldIdx).setVCFFieldCombineOperation(newCombineOperation);
    return updatedVidMapBuilder.build();
}

/**
* Returns the sequence dictionary for this source of Features.
* Uses the dictionary from the VCF header (if present) for variant inputs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;

Expand Down Expand Up @@ -244,8 +245,8 @@ public void setFeatureCodecClass(final Class<FeatureCodec<T, ?>> featureCodecCla
* creates a name from the given filePath by finding the absolute path of the given input
*/
private static String makeIntoAbsolutePath(final String filePath){
if(FeatureDataSource.isGenomicsDBPath(filePath)){
return FeatureDataSource.GENOMIC_DB_URI_SCHEME + new File(filePath.replace(FeatureDataSource.GENOMIC_DB_URI_SCHEME,"")).getAbsolutePath();
if(GenomicsDBUtils.isGenomicsDBPath(filePath)){
return GenomicsDBUtils.GENOMIC_DB_URI_SCHEME + new File(filePath.replace(GenomicsDBUtils.GENOMIC_DB_URI_SCHEME,"")).getAbsolutePath();
} else if (URI.create(filePath).getScheme() != null) {
return IOUtils.getPath(filePath).toAbsolutePath().toUri().toString();
} else {
Expand Down
Loading