From e8069b73a43ee7098b3d999a97addb9a1bb03841 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 11 Feb 2019 09:58:36 -0500 Subject: [PATCH] Change UpdateVCFSequenceDictionary to use the specified dictionary uniformly. (#5093) --- .../hellbender/engine/VariantWalkerBase.java | 2 +- .../UpdateVCFSequenceDictionary.java | 45 ++++++++---- ...eVCFSequenceDictionaryIntegrationTest.java | 71 ++++++++++++++----- 3 files changed, 87 insertions(+), 31 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/engine/VariantWalkerBase.java b/src/main/java/org/broadinstitute/hellbender/engine/VariantWalkerBase.java index f8cae61c9b2..3f04d02974b 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/VariantWalkerBase.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/VariantWalkerBase.java @@ -62,7 +62,7 @@ void initializeFeatures() { * choose the sequence dictionary from the driving source of variants. */ @Override - public final SAMSequenceDictionary getBestAvailableSequenceDictionary() { + public SAMSequenceDictionary getBestAvailableSequenceDictionary() { final SAMSequenceDictionary dictFromDrivingVariants = getSequenceDictionaryForDrivingVariants(); if (dictFromDrivingVariants != null) { //If this dictionary looks like it was synthesized from a feature index, see diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/UpdateVCFSequenceDictionary.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/UpdateVCFSequenceDictionary.java index c4ad92ae7f6..26bb519d2a9 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/UpdateVCFSequenceDictionary.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/UpdateVCFSequenceDictionary.java @@ -106,7 +106,7 @@ public void onTraversalStart() { new VCFHeader() : new VCFHeader(inputHeader.getMetaDataInInputOrder(), inputHeader.getGenotypeSamples()) ; getDefaultToolVCFHeaderLines().forEach(line -> outputHeader.addMetaDataLine(line)); - sourceDictionary = getSequenceDictionaryFromInput(dictionarySource); + sourceDictionary = getBestAvailableSequenceDictionary(); // Warn and require opt-in via -replace if we're about to clobber a valid sequence // dictionary. Check the input file directly via the header rather than using the @@ -167,27 +167,44 @@ public void closeTool() { } } - // either throws or returns a valid seq dict - private SAMSequenceDictionary getSequenceDictionaryFromInput(final String source) { - SAMSequenceDictionary dictionary; - if (source == null) { - if (hasReference()) { - dictionary = getReferenceDictionary(); + // Override getBestAvailableSequenceDictionary() to ensure that the new sequence dictionary specified by + // the user (and not the sequence dictionary from the VCF we're trying to update) is used uniformly by all + // callers. Otherwise, the wrong dictionary would be used when writing the index for the output vcf. + // + @Override + public SAMSequenceDictionary getBestAvailableSequenceDictionary() { + + SAMSequenceDictionary resultDictionary; + + final SAMSequenceDictionary masterDictionary = getMasterSequenceDictionary(); + if (dictionarySource == null) { + if (masterDictionary != null) { + // We'll accept the master dictionary if one was specified. Using the master dictionary + // arg will result in sequence dictionary validation. + logger.warn("Using the dictionary supplied via the \"%s\" argument", StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME); + resultDictionary = masterDictionary; + } + else if (hasReference()) { + resultDictionary = getReferenceDictionary(); } else { throw new CommandLineException.MissingArgument( DICTIONARY_ARGUMENT_NAME, "A dictionary source file or reference file must be provided"); } - } else { - dictionary = SAMSequenceDictionaryExtractor.extractDictionary(IOUtils.getPath(dictionarySource)); - if (dictionary == null || dictionary.getSequences().isEmpty()) { + if (masterDictionary != null) { + throw new CommandLineException(String.format("Only one of %s or %s may be specified on the command line", + StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, + DICTIONARY_ARGUMENT_NAME)); + } + resultDictionary = SAMSequenceDictionaryExtractor.extractDictionary(IOUtils.getPath(dictionarySource)); + if (resultDictionary == null || resultDictionary.getSequences().isEmpty()) { throw new CommandLineException.BadArgumentValue( - String.format( - "The specified dictionary source has an empty or invalid sequence dictionary", - dictionarySource) + String.format( + "The specified dictionary source has an empty or invalid sequence dictionary", + dictionarySource) ); } } - return dictionary; + return resultDictionary; } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/UpdateVCFSequenceDictionaryIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/UpdateVCFSequenceDictionaryIntegrationTest.java index 8f13793f628..0a0f5498b92 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/UpdateVCFSequenceDictionaryIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/UpdateVCFSequenceDictionaryIntegrationTest.java @@ -6,6 +6,7 @@ import htsjdk.variant.vcf.VCFHeader; import org.broadinstitute.barclay.argparser.CommandLineException; import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; import org.testng.Assert; @@ -14,6 +15,9 @@ import java.io.File; import java.io.FileNotFoundException; +import java.net.URISyntaxException; +import java.nio.file.Paths; +import java.util.stream.Collectors; public class UpdateVCFSequenceDictionaryIntegrationTest extends CommandLineProgramTest { private File testDir = new File(getTestDataDir(), "walkers/variantutils/UpdateVCFSequenceDictionary"); @@ -21,19 +25,19 @@ public class UpdateVCFSequenceDictionaryIntegrationTest extends CommandLineProgr @DataProvider(name="UpdateGoodSequenceDictionaryData") public Object[][] updateGoodSequenceDictionaryData() { return new Object[][]{ - new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "variantsWithDict.vcf"), null, false}, + new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "variantsWithDict.vcf"), null, null, false}, // pass a reference as a reference - new Object[]{ new File(testDir, "variantsNoDict.vcf"), null, new File(testDir, "exampleFASTA.fasta"), false}, + new Object[]{ new File(testDir, "variantsNoDict.vcf"), null, new File(testDir, "exampleFASTA.fasta"), null, false}, // pass a reference as a source; we need to test both to ensure the user can bypass the framework sequence // dictionary validation that will occur if you use -R - new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleFASTA.fasta"), null, false}, - new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, false}, + new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleFASTA.fasta"), null, null, false}, + new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, null, false}, // can't handle CRAM - see https://github.com/samtools/htsjdk/issues/731 - new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleBAM.bam"), null, false}, + new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleBAM.bam"), null, null, false}, // already has a dictionary - but force a replace - new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, true}, + new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, null, true}, // already has a dictionary - but force a replace - new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, true}, + new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, null, true}, }; } @@ -42,9 +46,10 @@ private void testGoodUpdateSequenceDictionary( final File inputVariantsFile, final File inputSourceFile, final File inputReferenceFile, - final boolean replace) throws FileNotFoundException { + final String masterSequenceDictionary, + final boolean replace) throws FileNotFoundException, URISyntaxException { final SAMSequenceDictionary resultingDictionary = - updateSequenceDictionary(inputVariantsFile, inputSourceFile, inputReferenceFile, replace); + updateSequenceDictionary(inputVariantsFile, inputSourceFile, inputReferenceFile, masterSequenceDictionary, replace); // get the original sequence dictionary from the source for comparison SAMSequenceDictionary sourceDictionary = @@ -69,12 +74,12 @@ private void testGoodUpdateSequenceDictionary( public Object[][] updateBadSequenceDictionaryData() { return new Object[][]{ // already has a dictionary - new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "variantsWithDict.vcf"), null, false}, + new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "variantsWithDict.vcf"), null, null, false}, // source has no dictionary - new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "variantsNoDict.vcf"), null, false}, + new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "variantsNoDict.vcf"), null, null, false}, // source dictionary is a mismatch for the variant records - new Object[]{ new File(testDir, "variantsNoDictWithBadContig.vcf"), new File(testDir, "variantsWithDict.vcf"), null, false}, - new Object[]{ new File(testDir, "variantsNoDictWithBadContigLength.vcf"), new File(testDir, "variantsWithDict.vcf"), null, false}, + new Object[]{ new File(testDir, "variantsNoDictWithBadContig.vcf"), new File(testDir, "variantsWithDict.vcf"), null, null, false}, + new Object[]{ new File(testDir, "variantsNoDictWithBadContigLength.vcf"), new File(testDir, "variantsWithDict.vcf"), null, null, false}, }; } @@ -83,14 +88,45 @@ private void testBadUpdateSequenceDictionary( final File inputVariantsFile, final File inputSourceFile, final File inputReferenceFile, + final String masterSequenceDictionary, final boolean replace) { - updateSequenceDictionary(inputVariantsFile, inputSourceFile, inputReferenceFile, replace); + updateSequenceDictionary(inputVariantsFile, inputSourceFile, inputReferenceFile, masterSequenceDictionary, replace); + } + + @Test + public void testUseMasterDictionary() { + final SAMSequenceDictionary actualSequenceDictionary = updateSequenceDictionary( + new File(testDir, "variantsNoDict.vcf"), + null, + null, + new File(testDir, "exampleFASTA.dict").getAbsolutePath(), + false); + final SAMSequenceDictionary expectedSequenceDictionary = SAMSequenceDictionaryExtractor.extractDictionary( + Paths.get(new File(testDir, "exampleFASTA.dict").getAbsolutePath())); + + // verify only the sequence names and lengths, since other attributes such as MD/UR will have been updated + Assert.assertEquals(actualSequenceDictionary.getSequences().stream().map(seq -> seq.getSequenceName()).collect(Collectors.toList()), + expectedSequenceDictionary.getSequences().stream().map(seq -> seq.getSequenceName()).collect(Collectors.toList())); + Assert.assertEquals(actualSequenceDictionary.getSequences().stream().map(seq -> seq.getSequenceLength()).collect(Collectors.toList()), + expectedSequenceDictionary.getSequences().stream().map(seq -> seq.getSequenceLength()).collect(Collectors.toList())); + } + + @Test(expectedExceptions=CommandLineException.class) + public void testMasterDictionaryAmbiguous() { + // specifying both a source dictionary and a master dictionary is ambiguous + updateSequenceDictionary( + new File(testDir, "variantsNoDict.vcf"), + new File(testDir, "exampleFASTA.dict"), + null, + new File(testDir, "exampleFASTA.dict").getAbsolutePath(), + false); } private SAMSequenceDictionary updateSequenceDictionary( final File inputVariantsFile, final File inputSourceFile, final File inputReferenceFile, + final String masterSequenceDictionary, final boolean replace) { ArgumentsBuilder argBuilder = new ArgumentsBuilder(); @@ -105,12 +141,15 @@ private SAMSequenceDictionary updateSequenceDictionary( if (replace) { argBuilder.addArgument(UpdateVCFSequenceDictionary.REPLACE_ARGUMENT_NAME, Boolean.toString(replace)); } + if (masterSequenceDictionary != null) { + argBuilder.addArgument(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, masterSequenceDictionary); + } - File outFile = createTempFile("updateSequenceDictionary", ".vcf"); + File outFile = createTempFile("updateSequenceDictionary", ".vcf.gz"); argBuilder.addOutput(outFile); runCommandLine(argBuilder.getArgsList()); - // Don't require an index, since the framework doesn't create one if no input sequnce + // Don't require an index, since the framework doesn't create one if no input sequence // dictionary is available via getBestAvailableSequenceDictionary. try (VCFFileReader vcfReader = new VCFFileReader(outFile, false)) { return vcfReader.getFileHeader().getSequenceDictionary();