Skip to content

Commit

Permalink
Change UpdateVCFSequenceDictionary to use the specified dictionary un…
Browse files Browse the repository at this point in the history
…iformly. (#5093)
  • Loading branch information
cmnbroad authored Feb 11, 2019
1 parent a84f466 commit e8069b7
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ void initializeFeatures() {
* choose the sequence dictionary from the driving source of variants.
*/
@Override
public final SAMSequenceDictionary getBestAvailableSequenceDictionary() {
public SAMSequenceDictionary getBestAvailableSequenceDictionary() {
final SAMSequenceDictionary dictFromDrivingVariants = getSequenceDictionaryForDrivingVariants();
if (dictFromDrivingVariants != null) {
//If this dictionary looks like it was synthesized from a feature index, see
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ public void onTraversalStart() {
new VCFHeader() :
new VCFHeader(inputHeader.getMetaDataInInputOrder(), inputHeader.getGenotypeSamples()) ;
getDefaultToolVCFHeaderLines().forEach(line -> outputHeader.addMetaDataLine(line));
sourceDictionary = getSequenceDictionaryFromInput(dictionarySource);
sourceDictionary = getBestAvailableSequenceDictionary();

// Warn and require opt-in via -replace if we're about to clobber a valid sequence
// dictionary. Check the input file directly via the header rather than using the
Expand Down Expand Up @@ -167,27 +167,44 @@ public void closeTool() {
}
}

// either throws or returns a valid seq dict
private SAMSequenceDictionary getSequenceDictionaryFromInput(final String source) {
SAMSequenceDictionary dictionary;
if (source == null) {
if (hasReference()) {
dictionary = getReferenceDictionary();
// Override getBestAvailableSequenceDictionary() to ensure that the new sequence dictionary specified by
// the user (and not the sequence dictionary from the VCF we're trying to update) is used uniformly by all
// callers. Otherwise, the wrong dictionary would be used when writing the index for the output vcf.
//
@Override
public SAMSequenceDictionary getBestAvailableSequenceDictionary() {

SAMSequenceDictionary resultDictionary;

final SAMSequenceDictionary masterDictionary = getMasterSequenceDictionary();
if (dictionarySource == null) {
if (masterDictionary != null) {
// We'll accept the master dictionary if one was specified. Using the master dictionary
// arg will result in sequence dictionary validation.
logger.warn("Using the dictionary supplied via the \"%s\" argument", StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME);
resultDictionary = masterDictionary;
}
else if (hasReference()) {
resultDictionary = getReferenceDictionary();
} else {
throw new CommandLineException.MissingArgument(
DICTIONARY_ARGUMENT_NAME, "A dictionary source file or reference file must be provided");
}

} else {
dictionary = SAMSequenceDictionaryExtractor.extractDictionary(IOUtils.getPath(dictionarySource));
if (dictionary == null || dictionary.getSequences().isEmpty()) {
if (masterDictionary != null) {
throw new CommandLineException(String.format("Only one of %s or %s may be specified on the command line",
StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME,
DICTIONARY_ARGUMENT_NAME));
}
resultDictionary = SAMSequenceDictionaryExtractor.extractDictionary(IOUtils.getPath(dictionarySource));
if (resultDictionary == null || resultDictionary.getSequences().isEmpty()) {
throw new CommandLineException.BadArgumentValue(
String.format(
"The specified dictionary source has an empty or invalid sequence dictionary",
dictionarySource)
String.format(
"The specified dictionary source has an empty or invalid sequence dictionary",
dictionarySource)
);
}
}
return dictionary;
return resultDictionary;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;

import org.testng.Assert;
Expand All @@ -14,26 +15,29 @@

import java.io.File;
import java.io.FileNotFoundException;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.util.stream.Collectors;

public class UpdateVCFSequenceDictionaryIntegrationTest extends CommandLineProgramTest {
private File testDir = new File(getTestDataDir(), "walkers/variantutils/UpdateVCFSequenceDictionary");

@DataProvider(name="UpdateGoodSequenceDictionaryData")
public Object[][] updateGoodSequenceDictionaryData() {
return new Object[][]{
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "variantsWithDict.vcf"), null, false},
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "variantsWithDict.vcf"), null, null, false},
// pass a reference as a reference
new Object[]{ new File(testDir, "variantsNoDict.vcf"), null, new File(testDir, "exampleFASTA.fasta"), false},
new Object[]{ new File(testDir, "variantsNoDict.vcf"), null, new File(testDir, "exampleFASTA.fasta"), null, false},
// pass a reference as a source; we need to test both to ensure the user can bypass the framework sequence
// dictionary validation that will occur if you use -R
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleFASTA.fasta"), null, false},
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, false},
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleFASTA.fasta"), null, null, false},
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, null, false},
// can't handle CRAM - see https://github.com/samtools/htsjdk/issues/731
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleBAM.bam"), null, false},
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleBAM.bam"), null, null, false},
// already has a dictionary - but force a replace
new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, true},
new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, null, true},
// already has a dictionary - but force a replace
new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, true},
new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, null, true},
};
}

Expand All @@ -42,9 +46,10 @@ private void testGoodUpdateSequenceDictionary(
final File inputVariantsFile,
final File inputSourceFile,
final File inputReferenceFile,
final boolean replace) throws FileNotFoundException {
final String masterSequenceDictionary,
final boolean replace) throws FileNotFoundException, URISyntaxException {
final SAMSequenceDictionary resultingDictionary =
updateSequenceDictionary(inputVariantsFile, inputSourceFile, inputReferenceFile, replace);
updateSequenceDictionary(inputVariantsFile, inputSourceFile, inputReferenceFile, masterSequenceDictionary, replace);

// get the original sequence dictionary from the source for comparison
SAMSequenceDictionary sourceDictionary =
Expand All @@ -69,12 +74,12 @@ private void testGoodUpdateSequenceDictionary(
public Object[][] updateBadSequenceDictionaryData() {
return new Object[][]{
// already has a dictionary
new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "variantsWithDict.vcf"), null, false},
new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "variantsWithDict.vcf"), null, null, false},
// source has no dictionary
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "variantsNoDict.vcf"), null, false},
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "variantsNoDict.vcf"), null, null, false},
// source dictionary is a mismatch for the variant records
new Object[]{ new File(testDir, "variantsNoDictWithBadContig.vcf"), new File(testDir, "variantsWithDict.vcf"), null, false},
new Object[]{ new File(testDir, "variantsNoDictWithBadContigLength.vcf"), new File(testDir, "variantsWithDict.vcf"), null, false},
new Object[]{ new File(testDir, "variantsNoDictWithBadContig.vcf"), new File(testDir, "variantsWithDict.vcf"), null, null, false},
new Object[]{ new File(testDir, "variantsNoDictWithBadContigLength.vcf"), new File(testDir, "variantsWithDict.vcf"), null, null, false},
};
}

Expand All @@ -83,14 +88,45 @@ private void testBadUpdateSequenceDictionary(
final File inputVariantsFile,
final File inputSourceFile,
final File inputReferenceFile,
final String masterSequenceDictionary,
final boolean replace) {
updateSequenceDictionary(inputVariantsFile, inputSourceFile, inputReferenceFile, replace);
updateSequenceDictionary(inputVariantsFile, inputSourceFile, inputReferenceFile, masterSequenceDictionary, replace);
}

@Test
public void testUseMasterDictionary() {
final SAMSequenceDictionary actualSequenceDictionary = updateSequenceDictionary(
new File(testDir, "variantsNoDict.vcf"),
null,
null,
new File(testDir, "exampleFASTA.dict").getAbsolutePath(),
false);
final SAMSequenceDictionary expectedSequenceDictionary = SAMSequenceDictionaryExtractor.extractDictionary(
Paths.get(new File(testDir, "exampleFASTA.dict").getAbsolutePath()));

// verify only the sequence names and lengths, since other attributes such as MD/UR will have been updated
Assert.assertEquals(actualSequenceDictionary.getSequences().stream().map(seq -> seq.getSequenceName()).collect(Collectors.toList()),
expectedSequenceDictionary.getSequences().stream().map(seq -> seq.getSequenceName()).collect(Collectors.toList()));
Assert.assertEquals(actualSequenceDictionary.getSequences().stream().map(seq -> seq.getSequenceLength()).collect(Collectors.toList()),
expectedSequenceDictionary.getSequences().stream().map(seq -> seq.getSequenceLength()).collect(Collectors.toList()));
}

@Test(expectedExceptions=CommandLineException.class)
public void testMasterDictionaryAmbiguous() {
// specifying both a source dictionary and a master dictionary is ambiguous
updateSequenceDictionary(
new File(testDir, "variantsNoDict.vcf"),
new File(testDir, "exampleFASTA.dict"),
null,
new File(testDir, "exampleFASTA.dict").getAbsolutePath(),
false);
}

private SAMSequenceDictionary updateSequenceDictionary(
final File inputVariantsFile,
final File inputSourceFile,
final File inputReferenceFile,
final String masterSequenceDictionary,
final boolean replace)
{
ArgumentsBuilder argBuilder = new ArgumentsBuilder();
Expand All @@ -105,12 +141,15 @@ private SAMSequenceDictionary updateSequenceDictionary(
if (replace) {
argBuilder.addArgument(UpdateVCFSequenceDictionary.REPLACE_ARGUMENT_NAME, Boolean.toString(replace));
}
if (masterSequenceDictionary != null) {
argBuilder.addArgument(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, masterSequenceDictionary);
}

File outFile = createTempFile("updateSequenceDictionary", ".vcf");
File outFile = createTempFile("updateSequenceDictionary", ".vcf.gz");
argBuilder.addOutput(outFile);
runCommandLine(argBuilder.getArgsList());

// Don't require an index, since the framework doesn't create one if no input sequnce
// Don't require an index, since the framework doesn't create one if no input sequence
// dictionary is available via getBestAvailableSequenceDictionary.
try (VCFFileReader vcfReader = new VCFFileReader(outFile, false)) {
return vcfReader.getFileHeader().getSequenceDictionary();
Expand Down

0 comments on commit e8069b7

Please sign in to comment.