-
Notifications
You must be signed in to change notification settings - Fork 594
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Change UpdateVCFSequenceDictionary to use the specified dictionary uniformly. #5093
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -106,7 +106,7 @@ public void onTraversalStart() { | |
new VCFHeader() : | ||
new VCFHeader(inputHeader.getMetaDataInInputOrder(), inputHeader.getGenotypeSamples()) ; | ||
getDefaultToolVCFHeaderLines().forEach(line -> outputHeader.addMetaDataLine(line)); | ||
sourceDictionary = getSequenceDictionaryFromInput(dictionarySource); | ||
sourceDictionary = getBestAvailableSequenceDictionary(); | ||
|
||
// Warn and require opt-in via -replace if we're about to clobber a valid sequence | ||
// dictionary. Check the input file directly via the header rather than using the | ||
|
@@ -167,27 +167,44 @@ public void closeTool() { | |
} | ||
} | ||
|
||
// either throws or returns a valid seq dict | ||
private SAMSequenceDictionary getSequenceDictionaryFromInput(final String source) { | ||
SAMSequenceDictionary dictionary; | ||
if (source == null) { | ||
if (hasReference()) { | ||
dictionary = getReferenceDictionary(); | ||
// Override getBestAvailableSequenceDictionary() to ensure that the new sequence dictionary specified by | ||
// the user (and not the sequence dictionary from the VCF we're trying to update) is used uniformly by all | ||
// callers. Otherwise, the wrong dictionary would be used when writing the index for the output vcf. | ||
// | ||
@Override | ||
public SAMSequenceDictionary getBestAvailableSequenceDictionary() { | ||
|
||
SAMSequenceDictionary resultDictionary; | ||
|
||
final SAMSequenceDictionary masterDictionary = getMasterSequenceDictionary(); | ||
if (dictionarySource == null) { | ||
if (masterDictionary != null) { | ||
// We'll accept the master dictionary if one was specified. Using the master dictionary | ||
// arg will result in sequence dictionary validation. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the case where invalid dictionaries for the data are used (such as in #5087), can we detect it and throw a warning? If the user specifies a bad sequence dictionary we should let them know that something bad could happen. (This is essentially the same request as the above comment.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The tool does have code to validate the new SD (in the first few lines of the apply method), and there are also tests for that. The problem in #5087 was that the indexer was getting the old (invalid) SD via the standard GATK machinery that uses the SD from the input. The change in this PR was to unify those so the new one is used everywhere. |
||
logger.warn("Using the dictionary supplied via the \"%s\" argument", StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME); | ||
resultDictionary = masterDictionary; | ||
} | ||
else if (hasReference()) { | ||
resultDictionary = getReferenceDictionary(); | ||
} else { | ||
throw new CommandLineException.MissingArgument( | ||
DICTIONARY_ARGUMENT_NAME, "A dictionary source file or reference file must be provided"); | ||
} | ||
|
||
} else { | ||
dictionary = SAMSequenceDictionaryExtractor.extractDictionary(IOUtils.getPath(dictionarySource)); | ||
if (dictionary == null || dictionary.getSequences().isEmpty()) { | ||
if (masterDictionary != null) { | ||
throw new CommandLineException(String.format("Only one of %s or %s may be specified on the command line", | ||
StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, | ||
DICTIONARY_ARGUMENT_NAME)); | ||
} | ||
resultDictionary = SAMSequenceDictionaryExtractor.extractDictionary(IOUtils.getPath(dictionarySource)); | ||
if (resultDictionary == null || resultDictionary.getSequences().isEmpty()) { | ||
throw new CommandLineException.BadArgumentValue( | ||
String.format( | ||
"The specified dictionary source has an empty or invalid sequence dictionary", | ||
dictionarySource) | ||
String.format( | ||
"The specified dictionary source has an empty or invalid sequence dictionary", | ||
dictionarySource) | ||
); | ||
} | ||
} | ||
return dictionary; | ||
return resultDictionary; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
import htsjdk.variant.vcf.VCFHeader; | ||
import org.broadinstitute.barclay.argparser.CommandLineException; | ||
import org.broadinstitute.hellbender.CommandLineProgramTest; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; | ||
|
||
import org.testng.Assert; | ||
|
@@ -14,26 +15,29 @@ | |
|
||
import java.io.File; | ||
import java.io.FileNotFoundException; | ||
import java.net.URISyntaxException; | ||
import java.nio.file.Paths; | ||
import java.util.stream.Collectors; | ||
|
||
public class UpdateVCFSequenceDictionaryIntegrationTest extends CommandLineProgramTest { | ||
private File testDir = new File(getTestDataDir(), "walkers/variantutils/UpdateVCFSequenceDictionary"); | ||
|
||
@DataProvider(name="UpdateGoodSequenceDictionaryData") | ||
public Object[][] updateGoodSequenceDictionaryData() { | ||
return new Object[][]{ | ||
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "variantsWithDict.vcf"), null, false}, | ||
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "variantsWithDict.vcf"), null, null, false}, | ||
// pass a reference as a reference | ||
new Object[]{ new File(testDir, "variantsNoDict.vcf"), null, new File(testDir, "exampleFASTA.fasta"), false}, | ||
new Object[]{ new File(testDir, "variantsNoDict.vcf"), null, new File(testDir, "exampleFASTA.fasta"), null, false}, | ||
// pass a reference as a source; we need to test both to ensure the user can bypass the framework sequence | ||
// dictionary validation that will occur if you use -R | ||
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleFASTA.fasta"), null, false}, | ||
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, false}, | ||
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleFASTA.fasta"), null, null, false}, | ||
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, null, false}, | ||
// can't handle CRAM - see https://github.com/samtools/htsjdk/issues/731 | ||
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleBAM.bam"), null, false}, | ||
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "exampleBAM.bam"), null, null, false}, | ||
// already has a dictionary - but force a replace | ||
new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, true}, | ||
new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, null, true}, | ||
// already has a dictionary - but force a replace | ||
new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, true}, | ||
new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "exampleFASTA.dict"), null, null, true}, | ||
}; | ||
} | ||
|
||
|
@@ -42,9 +46,10 @@ private void testGoodUpdateSequenceDictionary( | |
final File inputVariantsFile, | ||
final File inputSourceFile, | ||
final File inputReferenceFile, | ||
final boolean replace) throws FileNotFoundException { | ||
final String masterSequenceDictionary, | ||
final boolean replace) throws FileNotFoundException, URISyntaxException { | ||
final SAMSequenceDictionary resultingDictionary = | ||
updateSequenceDictionary(inputVariantsFile, inputSourceFile, inputReferenceFile, replace); | ||
updateSequenceDictionary(inputVariantsFile, inputSourceFile, inputReferenceFile, masterSequenceDictionary, replace); | ||
|
||
// get the original sequence dictionary from the source for comparison | ||
SAMSequenceDictionary sourceDictionary = | ||
|
@@ -69,12 +74,12 @@ private void testGoodUpdateSequenceDictionary( | |
public Object[][] updateBadSequenceDictionaryData() { | ||
return new Object[][]{ | ||
// already has a dictionary | ||
new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "variantsWithDict.vcf"), null, false}, | ||
new Object[]{ new File(testDir, "variantsWithDict.vcf"), new File(testDir, "variantsWithDict.vcf"), null, null, false}, | ||
// source has no dictionary | ||
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "variantsNoDict.vcf"), null, false}, | ||
new Object[]{ new File(testDir, "variantsNoDict.vcf"), new File(testDir, "variantsNoDict.vcf"), null, null, false}, | ||
// source dictionary is a mismatch for the variant records | ||
new Object[]{ new File(testDir, "variantsNoDictWithBadContig.vcf"), new File(testDir, "variantsWithDict.vcf"), null, false}, | ||
new Object[]{ new File(testDir, "variantsNoDictWithBadContigLength.vcf"), new File(testDir, "variantsWithDict.vcf"), null, false}, | ||
new Object[]{ new File(testDir, "variantsNoDictWithBadContig.vcf"), new File(testDir, "variantsWithDict.vcf"), null, null, false}, | ||
new Object[]{ new File(testDir, "variantsNoDictWithBadContigLength.vcf"), new File(testDir, "variantsWithDict.vcf"), null, null, false}, | ||
}; | ||
} | ||
|
||
|
@@ -83,14 +88,45 @@ private void testBadUpdateSequenceDictionary( | |
final File inputVariantsFile, | ||
final File inputSourceFile, | ||
final File inputReferenceFile, | ||
final String masterSequenceDictionary, | ||
final boolean replace) { | ||
updateSequenceDictionary(inputVariantsFile, inputSourceFile, inputReferenceFile, replace); | ||
updateSequenceDictionary(inputVariantsFile, inputSourceFile, inputReferenceFile, masterSequenceDictionary, replace); | ||
} | ||
|
||
@Test | ||
public void testUseMasterDictionary() { | ||
final SAMSequenceDictionary actualSequenceDictionary = updateSequenceDictionary( | ||
new File(testDir, "variantsNoDict.vcf"), | ||
null, | ||
null, | ||
new File(testDir, "exampleFASTA.dict").getAbsolutePath(), | ||
false); | ||
final SAMSequenceDictionary expectedSequenceDictionary = SAMSequenceDictionaryExtractor.extractDictionary( | ||
Paths.get(new File(testDir, "exampleFASTA.dict").getAbsolutePath())); | ||
|
||
// verify only the sequence names and lengths, since other attributes such as MD/UR will have been updated | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cmnbroad Is it worth verifying that the other attributes are different? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think it is. In real life usage, whether or not these attributes these attributes change depends on the state of the original inputs. Since this test uses a starting vcf that has no sequence dictionary at all, the MD5 and UR attributes get added on, but we just need to validate that the correct SD got propagated. |
||
Assert.assertEquals(actualSequenceDictionary.getSequences().stream().map(seq -> seq.getSequenceName()).collect(Collectors.toList()), | ||
expectedSequenceDictionary.getSequences().stream().map(seq -> seq.getSequenceName()).collect(Collectors.toList())); | ||
Assert.assertEquals(actualSequenceDictionary.getSequences().stream().map(seq -> seq.getSequenceLength()).collect(Collectors.toList()), | ||
expectedSequenceDictionary.getSequences().stream().map(seq -> seq.getSequenceLength()).collect(Collectors.toList())); | ||
} | ||
|
||
@Test(expectedExceptions=CommandLineException.class) | ||
public void testMasterDictionaryAmbiguous() { | ||
// specifying both a source dictionary and a master dictionary is ambiguous | ||
updateSequenceDictionary( | ||
new File(testDir, "variantsNoDict.vcf"), | ||
new File(testDir, "exampleFASTA.dict"), | ||
null, | ||
new File(testDir, "exampleFASTA.dict").getAbsolutePath(), | ||
false); | ||
} | ||
|
||
private SAMSequenceDictionary updateSequenceDictionary( | ||
final File inputVariantsFile, | ||
final File inputSourceFile, | ||
final File inputReferenceFile, | ||
final String masterSequenceDictionary, | ||
final boolean replace) | ||
{ | ||
ArgumentsBuilder argBuilder = new ArgumentsBuilder(); | ||
|
@@ -105,12 +141,15 @@ private SAMSequenceDictionary updateSequenceDictionary( | |
if (replace) { | ||
argBuilder.addArgument(UpdateVCFSequenceDictionary.REPLACE_ARGUMENT_NAME, Boolean.toString(replace)); | ||
} | ||
if (masterSequenceDictionary != null) { | ||
argBuilder.addArgument(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, masterSequenceDictionary); | ||
} | ||
|
||
File outFile = createTempFile("updateSequenceDictionary", ".vcf"); | ||
File outFile = createTempFile("updateSequenceDictionary", ".vcf.gz"); | ||
argBuilder.addOutput(outFile); | ||
runCommandLine(argBuilder.getArgsList()); | ||
|
||
// Don't require an index, since the framework doesn't create one if no input sequnce | ||
// Don't require an index, since the framework doesn't create one if no input sequence | ||
// dictionary is available via getBestAvailableSequenceDictionary. | ||
try (VCFFileReader vcfReader = new VCFFileReader(outFile, false)) { | ||
return vcfReader.getFileHeader().getSequenceDictionary(); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you throw in a logging statement that the dictionary is being overridden? I know it'll be captured in the arguments used to call a tool, but it seems like another explicit warning would be good (particularly in cases where the specified dictionary is different from the embedded dictionary in the file / etc).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, done.