Skip to content

Commit

Permalink
GenomicsDBImport: add ability to specify explicit index locations via…
Browse files Browse the repository at this point in the history
… the sample name map file (#7967)

The sample name map file accepted by GenomicsDBImport can now optionally contain a third
column giving an explicit path to an index for the corresponding VCF. It is allowed to
specify an explicit index in some lines of the sample name map and not others.

Added comprehensive unit and integration tests.
  • Loading branch information
droazen authored Oct 11, 2022
1 parent fd78250 commit 19778c1
Show file tree
Hide file tree
Showing 7 changed files with 889 additions and 239 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
import java.util.Optional;
import java.util.function.Function;

import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.createExportConfiguration;
import static org.broadinstitute.hellbender.tools.genomicsdb.GATKGenomicsDBUtils.createExportConfiguration;
import static org.broadinstitute.hellbender.utils.io.BlockCompressedIntervalStream.BCI_FILE_EXTENSION;

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.genomicsdb;

import com.googlecode.protobuf.format.JsonFormat;
import htsjdk.samtools.util.FileExtensions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils;
import org.broadinstitute.hellbender.utils.Utils;
Expand All @@ -12,6 +13,7 @@
import org.genomicsdb.model.GenomicsDBVidMapProto;

import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;

Expand All @@ -28,7 +30,7 @@
* https://developers.google.com/protocol-buffers/docs/javatutorial#the-protocol-buffer-api
* https://developers.google.com/protocol-buffers/docs/reference/java-generated
*/
public class GenomicsDBUtils {
public class GATKGenomicsDBUtils {

private static final String SUM = "sum";
private static final String ELEMENT_WISE_SUM = "element_wise_sum";
Expand Down Expand Up @@ -338,5 +340,20 @@ public static String genomicsDBApppendPaths(String parentPath, String path) {
}
}

/**
 * Convenience overload for callers that have no explicit index location.
 *
 * Delegates to {@link #assertVariantFileIsCompressedAndIndexed(Path, Path)} with a null index path,
 * in which case that overload derives the index location from the VCF path itself.
 *
 * @param vcfPath path to the VCF to check
 */
public static void assertVariantFileIsCompressedAndIndexed(final Path vcfPath) {
    // A null index path means "no explicit index was supplied".
    final Path noExplicitIndex = null;
    assertVariantFileIsCompressedAndIndexed(vcfPath, noExplicitIndex);
}

/**
 * Checks that a VCF has the block-compressed VCF file extension and that its index passes
 * the readability check performed by IOUtils.assertFileIsReadable().
 *
 * The extension comparison is done on the lower-cased path string, so it is case-insensitive.
 *
 * @param vcfPath path to the VCF to check
 * @param optionalVCFindexPath explicit path to the index for the VCF, or null to use a sibling of
 *                             the VCF named (VCF file name + FileExtensions.COMPRESSED_VCF_INDEX)
 * @throws UserException if vcfPath does not end with FileExtensions.COMPRESSED_VCF
 */
public static void assertVariantFileIsCompressedAndIndexed(final Path vcfPath, final Path optionalVCFindexPath) {
    final String vcfPathString = vcfPath.toString();
    final boolean hasCompressedVcfExtension =
            vcfPathString.toLowerCase().endsWith(FileExtensions.COMPRESSED_VCF);

    if (!hasCompressedVcfExtension) {
        throw new UserException("Input variant files must be block compressed vcfs when using " +
                GenomicsDBImport.BYPASS_FEATURE_READER + ", but " + vcfPathString + " does not end with " +
                "the standard file extension " + FileExtensions.COMPRESSED_VCF);
    }

    // Prefer the explicitly supplied index; otherwise look for it next to the VCF.
    final Path indexPath;
    if (optionalVCFindexPath == null) {
        indexPath = vcfPath.resolveSibling(vcfPath.getFileName() + FileExtensions.COMPRESSED_VCF_INDEX);
    } else {
        indexPath = optionalVCFindexPath;
    }
    IOUtils.assertFileIsReadable(indexPath);
}
}

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
package org.broadinstitute.hellbender.tools.genomicsdb;

import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.io.IOUtils;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;

/**
* A class to hold the mappings of sample names to VCF / VCF index paths. Used by GenomicsDBImport.
*
* This class can be constructed from a textual file containing lines in the format:
*
* Sample\tVCF
* or:
* Sample\tVCF\tIndex
*
* The sample names may have internal whitespace, but not leading/trailing whitespace.
* The VCF and Index URIs may have leading/trailing whitespace, which is ignored.
*
* The third Index column is optional. It is permitted to specify the index for some samples
* and not others. If an index is not specified for a sample, its location is inferred from
* the VCF URI.
*
* It is also possible to construct an empty SampleNameMap using the no-arg constructor, and
* add sample mappings one at a time using addSample().
*/
public final class SampleNameMap {
    // Sorted mapping between sample names and corresponding GVCF file name
    //
    // IMPORTANT: This must be sorted or it will result in sample name swaps in the output database.
    // This happens because the callset json is generated independently from the import process
    // each imported batch is then sorted, so if we have an unsorted list we'll end up with different
    // global vs batch sorting.
    // We preemptively sort here so we will have consistent sorting.
    private final SortedMap<String, URI> sampleNameToVcfPath = new TreeMap<>();

    // Mapping between sample names and corresponding VCF index path
    //
    // This Map contains only indices specified explicitly via the sample name map file
    // or via addSample(). If an explicit index is not specified for a given sample, it
    // will not have an entry in this Map.
    //
    // The ordering of the entries in this Map does not actually matter, since it's not
    // directly exposed, and is used only for individual lookups via getVCFIndexForSample()
    private final SortedMap<String, URI> sampleNameToVcfIndexPath = new TreeMap<>();

    /**
     * Create an empty SampleNameMap. Samples can be added later using addSample()
     */
    public SampleNameMap() {
    }

    /**
     * Create a SampleNameMap from a textual file containing the sample mappings, without
     * checking that the VCFs are compressed and indexed. The lines in this file must be
     * in the format:
     *
     * Sample\tVCF
     * or:
     * Sample\tVCF\tIndex
     *
     * The sample names may have internal whitespace, but not leading/trailing whitespace.
     * The VCF and Index URIs may have leading/trailing whitespace, which is ignored.
     *
     * The third Index column is optional. It is permitted to specify the index for some samples
     * and not others.
     *
     * @param sampleMapFilePath Path to the file containing the sample name mappings to load
     */
    public SampleNameMap(final Path sampleMapFilePath) {
        this(sampleMapFilePath, false);
    }

    /**
     * Create a SampleNameMap from a textual file containing the sample mappings. The
     * lines in this file must be in the format:
     *
     * Sample\tVCF
     * or:
     * Sample\tVCF\tIndex
     *
     * The sample names may have internal whitespace, but not leading/trailing whitespace.
     * The VCF and Index URIs may have leading/trailing whitespace, which is ignored.
     *
     * The third Index column is optional. It is permitted to specify the index for some samples
     * and not others.
     *
     * @param sampleMapFilePath Path to the file containing the sample name mappings to load
     * @param checkVcfIsCompressedAndIndexed If true, check each VCF to make sure it's compressed and indexed
     * @throws UserException if the file cannot be read, is empty, contains a malformed line or URI,
     *                       or maps the same sample more than once
     */
    public SampleNameMap(final Path sampleMapFilePath, final boolean checkVcfIsCompressedAndIndexed) {
        loadSampleNameMapFile(sampleMapFilePath, checkVcfIsCompressedAndIndexed);
    }

    /**
     * Reads the sample name map file and populates both internal maps from it.
     *
     * @param sampleToFileMapPath path to the sample name map file
     * @param checkVcfIsCompressedAndIndexed if true, run the compressed-and-indexed check on every VCF
     */
    private void loadSampleNameMapFile(final Path sampleToFileMapPath, final boolean checkVcfIsCompressedAndIndexed) {
        // Only the file read can raise a checked IOException, so the try is limited to it.
        final List<String> lines;
        try {
            lines = Files.readAllLines(sampleToFileMapPath);
        } catch (final IOException e) {
            throw new UserException.CouldNotReadInputFile(sampleToFileMapPath, "exception while reading sample->filename mapping file", e);
        }

        if (lines.isEmpty()) {
            throw new UserException.BadInput( "At least 1 sample is required but none were found in the sample mapping file");
        }

        for (final String line : lines) {
            // Limit of -1 keeps trailing empty fields, so "sample\tvcf\t" is seen as 3 fields
            // and rejected below for its empty index rather than silently accepted as 2 fields.
            final String[] split = line.split("\\t",-1);
            if (split.length != 2 && split.length != 3) {
                throw new UserException.BadInput("Sample name map file must have 2 or 3 fields per line in the format:\nSample\tFile\nor:\nSample\tFile\tIndex\nbut found line: \""
                        + line +"\" with "+split.length+" fields");
            }
            if ( ! sampleNameIsLegal(split[0]) || split[1].trim().isEmpty()) {
                throw new UserException.BadInput("Sample name map file must have lines in the format:\nSample\tFile\nor:\nSample\tFile\tIndex\n but found line: '" + line + "'\nValid sample names must be non-empty strings that cannot begin or end with whitespace and valid file names must be non-empty and not all whitespace");
            }
            final String sample = split[0];
            final String vcfPath = split[1].trim();

            String vcfIndexPath = null;
            if ( split.length == 3 ) {
                vcfIndexPath = split[2].trim();

                if ( vcfIndexPath.isEmpty() ) {
                    throw new UserException.BadInput("Found a line in the sample name map file with an empty or all-whitespace value for the index:\n" + "\"" + line + "\"");
                }
            }

            final URI vcfURI = parseURI(vcfPath);
            final URI existingVCFPath = sampleNameToVcfPath.get(sample);
            if (existingVCFPath != null){
                throw new UserException.BadInput("Found two mappings for the same sample: " + sample + "\n" + vcfPath + "\n" + existingVCFPath);
            }
            sampleNameToVcfPath.put(sample, vcfURI);

            if ( vcfIndexPath != null ) {
                final URI vcfIndexURI = parseURI(vcfIndexPath);
                final URI existingVCFIndexPath = sampleNameToVcfIndexPath.get(sample);
                if (existingVCFIndexPath != null) {
                    throw new UserException.BadInput("Found two indices for the same sample: " + sample + "\n" + vcfIndexPath + "\n" + existingVCFIndexPath);
                }
                sampleNameToVcfIndexPath.put(sample, vcfIndexURI);
            }

            if (checkVcfIsCompressedAndIndexed) {
                GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(IOUtils.getPath(vcfPath), vcfIndexPath == null ? null : IOUtils.getPath(vcfIndexPath));
            }
        }
    }

    /**
     * Parses a string into a URI, converting a syntax failure into a UserException
     * that keeps the original exception as its cause.
     *
     * @param uriString text to parse
     * @return the parsed URI
     */
    private static URI parseURI(final String uriString) {
        try {
            return new URI(uriString);
        } catch (final URISyntaxException e) {
            throw new UserException("Malformed URI: " + e.toString(), e);
        }
    }

    /**
     * Tests whether the sample name is legal. Sample names must be non-empty, and
     * may have internal whitespace but not leading/trailing whitespace.
     *
     * @param sampleName sample name to test
     * @return true if sampleName is legal, otherwise false
     */
    private boolean sampleNameIsLegal(final String sampleName) {
        return sampleName != null &&
                ! sampleName.trim().isEmpty() &&
                sampleName.trim().equals(sampleName);
    }

    /**
     * Add a new sample mapping with no explicit index
     *
     * @param sampleName name of the sample
     * @param vcfPath path to the VCF for the sample
     */
    public void addSample(final String sampleName, final URI vcfPath) {
        addSample(sampleName, vcfPath, null);
    }

    /**
     * Add a new sample mapping.
     *
     * All validation happens before either map is modified, so a rejected call leaves
     * the existing mappings exactly as they were.
     *
     * @param sampleName name of the sample
     * @param vcfPath path to the VCF for the sample (not null)
     * @param vcfIndexPath path to the index for the sample (may be null)
     * @throws UserException.BadInput if the sample name is not legal, vcfPath is null,
     *                                or the sample already has a VCF or index mapping
     */
    public void addSample(final String sampleName, final URI vcfPath, final URI vcfIndexPath) {
        if ( ! sampleNameIsLegal(sampleName) ) {
            throw new UserException.BadInput("Sample name " + sampleName + " is not legal. Sample names must be non-empty and not contain leading or trailing whitespace");
        }
        if ( vcfPath == null ) {
            throw new UserException.BadInput("VCF path for sample " + sampleName + " was null");
        }

        // Look up first instead of put-then-check: put() would overwrite the existing
        // mapping before the duplicate is reported.
        final URI previousPath = sampleNameToVcfPath.get(sampleName);
        if (previousPath != null) {
            throw new UserException.BadInput("Duplicate sample: " + sampleName + ". Sample was found in both "
                    + vcfPath + " and " + previousPath + ".");
        }

        if (vcfIndexPath != null) {
            final URI previousIndexPath = sampleNameToVcfIndexPath.get(sampleName);
            if (previousIndexPath != null) {
                throw new UserException.BadInput("For sample " + sampleName + ", attempted to specify multiple indices: " + vcfIndexPath + " and " + previousIndexPath);
            }
        }

        sampleNameToVcfPath.put(sampleName, vcfPath);
        if (vcfIndexPath != null) {
            sampleNameToVcfIndexPath.put(sampleName, vcfIndexPath);
        }
    }

    /**
     * Note that this returns the backing map itself, not a copy.
     *
     * @return The full mapping of sample names -> VCF paths, with the sample names in sorted order
     */
    public SortedMap<String, URI> getSampleNameToVcfPath() {
        return sampleNameToVcfPath;
    }

    /**
     * @param sample sample name
     * @return the VCF associated with that sample name, as a URI, or null if the sample is not in this Map
     */
    public URI getVCFForSample(final String sample) {
        return sampleNameToVcfPath.get(sample);
    }

    /**
     * @param sample sample name
     * @return the VCF associated with that sample name, as a Path, or null if the sample is not in this Map
     */
    public Path getVCFForSampleAsPath(final String sample) {
        final URI vcfURI = sampleNameToVcfPath.get(sample);
        return vcfURI == null ? null : IOUtils.getPath(vcfURI.toString());
    }

    /**
     * @param sample sample name
     * @return the VCF index associated with that sample name, as a URI, or null if no index
     */
    public URI getVCFIndexForSample(final String sample) {
        return sampleNameToVcfIndexPath.get(sample);
    }

    /**
     * @param sample sample name
     * @return the VCF index associated with that sample name, as a Path, or null if no index
     */
    public Path getVCFIndexForSampleAsPath(final String sample) {
        final URI vcfIndexURI = sampleNameToVcfIndexPath.get(sample);
        return vcfIndexURI == null ? null : IOUtils.getPath(vcfIndexURI.toString());
    }

    /**
     * @return number of samples in this Map
     */
    public int getNumSamples() {
        return sampleNameToVcfPath.size();
    }

    /**
     * @return a new List of the sample names in this Map in sorted order
     */
    public List<String> getSampleNamesInSortedOrder() {
        return new ArrayList<>(sampleNameToVcfPath.keySet());
    }

    /**
     * @return true if an index was specified for at least one sample, otherwise false
     */
    public boolean indicesSpecified() {
        return ! sampleNameToVcfIndexPath.isEmpty();
    }
}
Loading

0 comments on commit 19778c1

Please sign in to comment.