Skip to content

Commit

Permalink
Cleanup new GDB code
Browse files Browse the repository at this point in the history
  • Loading branch information
ldgauthier committed Oct 4, 2018
1 parent 13b47a5 commit 840bb25
Show file tree
Hide file tree
Showing 2 changed files with 186 additions and 172 deletions.
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
package org.broadinstitute.hellbender.engine;

import com.netflix.servo.util.VisibleForTesting;
import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.IOUtil;
import htsjdk.tribble.*;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.logging.log4j.LogManager;
Expand All @@ -20,26 +20,18 @@
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher;

import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
import com.googlecode.protobuf.format.JsonFormat;
import com.intel.genomicsdb.model.GenomicsDBVidMapProto;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.*;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;



/**
* Enables traversals and queries over sources of Features, which are metadata associated with a location
Expand Down Expand Up @@ -70,11 +62,6 @@
public final class FeatureDataSource<T extends Feature> implements GATKDataSource<T>, AutoCloseable {
private static final Logger logger = LogManager.getLogger(FeatureDataSource.class);

/**
* identifies a path as a GenomicsDB URI
*/
public static final String GENOMIC_DB_URI_SCHEME = "gendb://";

/**
* Feature reader used to retrieve records from our file
*/
Expand Down Expand Up @@ -290,14 +277,6 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
this.queryLookaheadBases = queryLookaheadBases;
}

/**
 * Tests whether a path string refers to a GenomicsDB data source.
 *
 * @param path the path to examine (may be null)
 * @return true if {@code path} is non-null and starts with the GenomicsDB URI scheme, otherwise false
 */
public static boolean isGenomicsDBPath(final String path) {
    if (path == null) {
        return false;
    }
    return path.startsWith(GENOMIC_DB_URI_SCHEME);
}

@SuppressWarnings("unchecked")
private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
Expand Down Expand Up @@ -370,8 +349,7 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
}
}

@VisibleForTesting
public static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
if (!isGenomicsDBPath(path)) {
throw new IllegalArgumentException("Trying to create a GenomicsDBReader from a non-GenomicsDB input");
}
Expand Down Expand Up @@ -407,150 +385,6 @@ public static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final Str
}
}

/**
 * Builds the GenomicsDB export configuration used to query the given workspace.
 *
 * @param reference   reference genome FASTA
 * @param workspace   GenomicsDB workspace directory
 * @param callsetJson callset mapping JSON file in the workspace
 * @param vidmapJson  vid mapping JSON file in the workspace
 * @param vcfHeader   template VCF header file in the workspace
 * @return a fully-populated export configuration for this query
 */
private static GenomicsDBExportConfiguration.ExportConfiguration createExportConfiguration(final File reference, final File workspace,
                                                                                           final File callsetJson, final File vidmapJson,
                                                                                           final File vcfHeader) {
    final GenomicsDBExportConfiguration.ExportConfiguration.Builder configBuilder =
            GenomicsDBExportConfiguration.ExportConfiguration.newBuilder()
                    .setWorkspace(workspace.getAbsolutePath())
                    .setReferenceGenome(reference.getAbsolutePath())
                    .setVidMappingFile(vidmapJson.getAbsolutePath())
                    .setCallsetMappingFile(callsetJson.getAbsolutePath())
                    .setVcfHeaderFilename(vcfHeader.getAbsolutePath())
                    .setProduceGTField(false)
                    .setProduceGTWithMinPLValueForSpanningDeletions(false)
                    .setSitesOnlyQuery(false)
                    .setMaxDiploidAltAllelesThatCanBeGenotyped(GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);

    // Multi-interval imports create one array (directory) per interval partition, with names
    // derived from the partition bounds (e.g. importing ("chr1", [1, 100M]) and ("chr2", [1, 100M])
    // yields directories chr1$1$100M and chr2$1$100M). At read time only the workspace is supplied,
    // and the reader scans its entries to find the right arrays. Older workspaces instead contain a
    // single array with the constant name genomicsdb_array; for backward compatibility we pass that
    // constant name through when such a directory exists, and otherwise let GenomicsDB derive array
    // names from the partition bounds.
    final Path legacyArrayFolder = Paths.get(workspace.getAbsolutePath(), GenomicsDBConstants.DEFAULT_ARRAY_NAME).toAbsolutePath();
    if (Files.exists(legacyArrayFolder)) {
        configBuilder.setArrayName(GenomicsDBConstants.DEFAULT_ARRAY_NAME);
    } else {
        configBuilder.setGenerateArrayNameFromPartitionBounds(true);
    }

    // Load the vid mapping JSON into an in-memory protobuf so we can override per-field INFO
    // combine operations for this specific query. The JSON file on disk is left untouched, and
    // other queries continue to read it directly.
    //
    // References:
    //   GenomicsDB protobuf structs: https://github.com/Intel-HLS/GenomicsDB/blob/master/src/resources/genomicsdb_vid_mapping.proto
    //   Protobuf generated Java code guide:
    //   https://developers.google.com/protocol-buffers/docs/javatutorial#the-protocol-buffer-api
    //   https://developers.google.com/protocol-buffers/docs/reference/java-generated
    GenomicsDBVidMapProto.VidMappingPB vidMapPB;
    try {
        vidMapPB = getProtobufVidMappingFromJsonFile(vidmapJson);
    } catch (final IOException e) {
        throw new UserException("Could not open vid json file " + vidmapJson, e);
    }

    // Map each field name to its index in vidMapPB's fields list so we can address fields by name.
    final HashMap<String, Integer> fieldNameToIndexInVidFieldsList =
            getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB);

    vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
            GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "element_wise_sum");

    // updateINFOFieldCombineOperation returns null when the field is absent; in that case the
    // builder falls back to the vid mapping file already set above.
    if (vidMapPB != null) {
        configBuilder.setVidMapping(vidMapPB);
    }

    return configBuilder.build();
}

/**
 * Parse the vid JSON file and create an in-memory Protobuf structure representing the
 * information in the JSON file.
 *
 * @param vidmapJson vid JSON file
 * @return Protobuf vid mapping object
 * @throws IOException if the file cannot be read or its contents cannot be parsed
 */
public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFile(final File vidmapJson)
        throws IOException {
    final GenomicsDBVidMapProto.VidMappingPB.Builder vidMapBuilder = GenomicsDBVidMapProto.VidMappingPB.newBuilder();
    // Read as UTF-8 (Files.newBufferedReader's default charset) rather than the platform default
    // charset that new FileReader(File) would use -- JSON is UTF-8 by specification.
    try (final BufferedReader reader = Files.newBufferedReader(vidmapJson.toPath())) {
        JsonFormat.merge(reader, vidMapBuilder);
    }
    return vidMapBuilder.build();
}

/**
 * Builds a lookup from field name to that field's position in {@code vidMapPB}'s fields list.
 *
 * In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects, each
 * describing one field in the TileDB/GenomicsDB store.
 *
 * @param vidMapPB Protobuf vid mapping object
 * @return map from field name to index in vidMapPB.fields list
 */
public static HashMap<String, Integer> getFieldNameToListIndexInProtobufVidMappingObject(
        final GenomicsDBVidMapProto.VidMappingPB vidMapPB) {
    final HashMap<String, Integer> nameToIndex = new HashMap<>();
    final int numFields = vidMapPB.getFieldsCount();
    for (int i = 0; i < numFields; i++) {
        nameToIndex.put(vidMapPB.getFields(i).getName(), i);
    }
    return nameToIndex;
}

/**
 * Update vid Protobuf object with a new combine operation for the given INFO field.
 *
 * @param vidMapPB                        input vid object
 * @param fieldNameToIndexInVidFieldsList map from field name to index in vidMapPB's fields list
 * @param fieldName                       INFO field name
 * @param newCombineOperation             combine op ("sum", "median")
 * @return updated vid Protobuf object if the field exists, else null
 */
public static GenomicsDBVidMapProto.VidMappingPB updateINFOFieldCombineOperation(
        final GenomicsDBVidMapProto.VidMappingPB vidMapPB,
        final Map<String, Integer> fieldNameToIndexInVidFieldsList,
        final String fieldName,
        final String newCombineOperation) {
    // getOrDefault replaces the containsKey/get pair: one lookup, same semantics.
    final int fieldIdx = fieldNameToIndexInVidFieldsList.getOrDefault(fieldName, -1);
    if (fieldIdx < 0) {
        // Field not present in the vid mapping; callers treat null as "leave the mapping unchanged".
        return null;
    }
    // Protobuf messages are immutable, so rebuild via the top-level builder, updating only the
    // combine operation of the list element corresponding to fieldName.
    final GenomicsDBVidMapProto.VidMappingPB.Builder updatedVidMapBuilder = vidMapPB.toBuilder();
    updatedVidMapBuilder.getFieldsBuilder(fieldIdx).setVCFFieldCombineOperation(newCombineOperation);
    return updatedVidMapBuilder.build();
}

/**
* Returns the sequence dictionary for this source of Features.
* Uses the dictionary from the VCF header (if present) for variant inputs,
Expand Down
Loading

0 comments on commit 840bb25

Please sign in to comment.