From 5648af96f499bd389943602b86057be803191426 Mon Sep 17 00:00:00 2001 From: Orli Cohen Date: Wed, 27 Jul 2022 13:19:33 -0400 Subject: [PATCH] responding to review comments, also added unit test for generateReferencePairsAgainstDictionary() --- .../CheckReferenceCompatibility.java | 134 ++++++++++-------- .../reference/ReferenceSequenceTable.java | 6 +- ...ReferenceCompatibilityIntegrationTest.java | 45 +++--- .../ReferenceSequenceTableUnitTest.java | 35 ++++- ...eCompatibilityBAMWithMD5s_exactmatch.table | 4 +- ...mpatibilityBAMWithMD5s_notcompatible.table | 4 +- ...renceCompatibilityBAMWithMD5s_subset.table | 4 +- ...mpatibilityBAMWithoutMD5s_compatible.table | 4 +- ...tibilityBAMWithoutMD5s_notcompatible.table | 4 +- ...ceCompatibilityBAMWithoutMD5s_subset.table | 4 +- ...ibilityMultipleReferencesBAMWithMD5s.table | 9 +- ...lityMultipleReferencesBAMWithoutMD5s.table | 8 +- ...estReferenceCompatibilityVCFWithMD5s.table | 6 +- ...nceCompatibilityVCFWithoutMD5s_match.table | 4 +- ...tibilityVCFWithoutMD5s_notcompatible.table | 4 +- ...ceCompatibilityVCFWithoutMD5s_subset.table | 4 +- ...renceCompatibilityWithoutMD5Mismatch.table | 3 + 17 files changed, 167 insertions(+), 115 deletions(-) create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityWithoutMD5Mismatch.table diff --git a/src/main/java/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility.java b/src/main/java/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility.java index 27c065c064c..3aeec046177 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility.java @@ -6,6 +6,8 @@ import htsjdk.variant.vcf.VCFHeader; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.argparser.ExperimentalFeature; +import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection; import org.broadinstitute.hellbender.engine.FeatureDataSource; @@ -30,8 +32,9 @@ * *

This tool generates a table analyzing the compatibility of a sequence file against provided references. The tool works to compare * BAM/CRAMs (specified using the -I argument) as well as VCFs (specified using the -V argument) against provided - * reference(s), specified using the -references-to-compare argument. The table can be directed to a file or standard - * output using provided command-line arguments. + * reference(s), specified using the -references-to-compare argument. When MD5s are present, the tool decides compatibility based on all sequence + * information (MD5, name, length); when MD5s are missing, the tool makes compatibility calls based only on sequence name + * and length. The table can be directed to a file or standard output using provided command-line arguments. *

* *

Input

@@ -45,9 +48,9 @@ *
  * #Current Reference: reads_data_source_test1_withmd5s_missingchr1.bam
  * Reference	Compatibility
- * hg19mini.fasta	Compatible, the sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini.fasta reference sequence dictionary. Missing sequence(s): [1]
- * hg19mini_1renamed.fasta	Compatible, the sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini_1renamed.fasta reference sequence dictionary. Missing sequence(s): [chr1]
- * hg19mini_chr2snp.fasta	Not compatible. Status: [DIFFER_IN_SEQUENCE, DIFFER_IN_SEQUENCES_PRESENT]. Run CompareReferences tool for more information on reference differences.
+ * hg19mini.fasta	COMPATIBLE the sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini.fasta reference sequence dictionary. Missing sequence(s): [1]
+ * hg19mini_1renamed.fasta	COMPATIBLE_SUBSET the sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini_1renamed.fasta reference sequence dictionary. Missing sequence(s): [chr1]
+ * hg19mini_chr2snp.fasta	NOT_COMPATIBLE Status: [DIFFER_IN_SEQUENCE, DIFFER_IN_SEQUENCES_PRESENT]. Run CompareReferences tool for more information on reference differences.
  * 
*

* @@ -62,11 +65,13 @@ */ @CommandLineProgramProperties( - summary = "", - oneLineSummary = "", + summary = "Check a BAM/VCF for compatibility against specified references and output a tab-delimited table detailing the compatibility status and relevant information about the status.", + oneLineSummary = "Check a BAM/VCF for compatibility against specified references.", programGroup = ReferenceProgramGroup.class ) +@DocumentedFeature +@ExperimentalFeature public class CheckReferenceCompatibility extends GATKTool { @Argument(fullName = "references-to-compare", shortName = "refcomp", doc = "Reference sequence file(s) to compare.") @@ -83,12 +88,14 @@ public class CheckReferenceCompatibility extends GATKTool { @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "If specified, output reference sequence table in TSV format to this file. Otherwise print it to stdout.", optional = true) private GATKPath output; - private SAMSequenceDictionary dictionaryToCompare; - private GATKPath dictionaryPath; - private String dictionaryName; + private SAMSequenceDictionary queryDictionary; + private GATKPath queryDictionaryPath; + private String queryDictionaryName; private Map dictionaries = new LinkedHashMap<>(); private List referenceCompatibilities = new ArrayList<>(); + private boolean md5sPresent; + // sequence dictionary validation disabled since dictionaries may not be valid before running tool @Override protected SequenceDictionaryValidationArgumentCollection getSequenceDictionaryValidationArgumentCollection() { return new SequenceDictionaryValidationArgumentCollection.NoValidationCollection(); @@ -100,51 +107,59 @@ public void onTraversalStart() { } private void initializeSequenceDictionaryForInput(){ - // BAMs/CRAMs - if(hasReads() ^ vcfPath != null){ + if(hasReads() && vcfPath != null){ + throw new UserException.BadInput("Both BAM and VCF specified. Tool analyzes one input at a time."); + } + else{ + // BAMs/CRAMs if (hasReads()) { if (readArguments.getReadPathSpecifiers().size() > 1) { - throw new UserException.BadInput("Tool analyzes one BAM at a time."); + throw new UserException.BadInput("Tool analyzes one reads input at a time."); } - dictionaryToCompare = getHeaderForReads().getSequenceDictionary(); - dictionaryPath = readArguments.getReadPathSpecifiers().get(0); - dictionaryName = dictionaryPath.toPath().getFileName().toString(); + queryDictionary = getHeaderForReads().getSequenceDictionary(); + queryDictionaryPath = readArguments.getReadPathSpecifiers().get(0); + queryDictionaryName = queryDictionaryPath.toPath().getFileName().toString(); + md5sPresent = dictionaryHasMD5s(queryDictionary); } // VCFs else { - try(final FeatureDataSource vcfReader = new FeatureDataSource<>(vcfPath.toString())){ + try (final FeatureDataSource vcfReader = new FeatureDataSource<>(vcfPath.toString())) { VCFHeader header = (VCFHeader) vcfReader.getHeader(); - dictionaryToCompare = header.getSequenceDictionary(); - dictionaryPath = vcfPath; - dictionaryName = dictionaryPath.toPath().getFileName().toString(); + queryDictionary = header.getSequenceDictionary(); + queryDictionaryPath = vcfPath; + queryDictionaryName = queryDictionaryPath.toPath().getFileName().toString(); + md5sPresent = dictionaryHasMD5s(queryDictionary); } } - dictionaries.put(dictionaryPath, dictionaryToCompare); - for(GATKPath path : references){ + dictionaries.put(queryDictionaryPath, queryDictionary); + for (GATKPath path : references) { dictionaries.put(path, ReferenceDataSource.of(path.toPath()).getSequenceDictionary()); } + if (!hasReads() && vcfPath == null) { + throw new UserException.BadInput("No input provided."); + } } } @Override public void traverse() { - if(dictionaryHasMD5s(dictionaryToCompare)){ + if(md5sPresent){ ReferenceSequenceTable table = new ReferenceSequenceTable(dictionaries); table.build(); - List refPairs = table.compareAgainstKeyReference(dictionaryPath); + List refPairs = table.compareAgainstKeyReference(queryDictionaryPath); for(ReferencePair pair : refPairs){ - evaluateCompatibility(pair, table); + referenceCompatibilities.add(evaluateCompatibilityWithMD5Table(pair, table)); } } else{ - logger.warn("*************************************************************************************************************************"); + logger.warn("************************************************************************************************************************"); logger.warn("* Comparison lacking MD5. All comparisons based on sequence name and length, which could hide mismatching references. *"); - logger.warn("*************************************************************************************************************************"); + logger.warn("************************************************************************************************************************"); for(Map.Entry entry : dictionaries.entrySet()){ - if(!entry.getValue().equals(dictionaryToCompare)){ - evaluateCompatibility(entry.getValue(), entry.getKey()); + if(!entry.getValue().equals(queryDictionary)){ + referenceCompatibilities.add(evaluateCompatibilityWithoutMD5(entry.getValue(), entry.getKey())); } } } @@ -158,12 +173,12 @@ public void traverse() { * */ private void writeOutput(){ - TableColumnCollection columns = new TableColumnCollection(Arrays.asList("Reference", "Compatibility")); + TableColumnCollection columns = new TableColumnCollection(Arrays.asList("Reference", "Compatibility", "Summary")); try(CheckReferenceCompatibilityTableWriter writer = output == null ? new CheckReferenceCompatibilityTableWriter(new OutputStreamWriter(System.out), columns) : new CheckReferenceCompatibilityTableWriter(output.toPath(), columns) ){ - writer.writeComment(String.format("Current Reference: %s", dictionaryName)); + writer.writeComment(String.format("Current Reference: %s", queryDictionaryName)); writer.writeHeaderIfApplies(); for(CompatibilityRecord record : referenceCompatibilities){ writer.writeRecord(record); @@ -189,16 +204,16 @@ private boolean dictionaryHasMD5s(SAMSequenceDictionary dictionary){ * @param refPair ReferencePair containing the dictionary file and a reference it is being compared against * @param table */ - private void evaluateCompatibility(ReferencePair refPair, ReferenceSequenceTable table){ + private CompatibilityRecord evaluateCompatibilityWithMD5Table(ReferencePair refPair, ReferenceSequenceTable table){ EnumSet status = refPair.getAnalysis(); if(status.contains(ReferencePair.Status.EXACT_MATCH)){ - referenceCompatibilities.add(new CompatibilityRecord(refPair.getRef2(), "Compatible, the sequence dictionaries exactly match")); + return new CompatibilityRecord(refPair.getRef2(), CompatibilityRecord.Compatibility.COMPATIBLE, "The sequence dictionaries exactly match"); } else if(status.contains(ReferencePair.Status.SUBSET) && status.size() == 1){ - referenceCompatibilities.add(new CompatibilityRecord(refPair.getRef2(),String.format("Compatible, the sequence dictionary in %s is a subset of the %s reference sequence dictionary. Missing sequence(s): %s", refPair.getRef1AsString(), refPair.getRef2AsString(), getMissingSequencesIfSubset(dictionaries.get(refPair.getRef2()))))); + return new CompatibilityRecord(refPair.getRef2(), CompatibilityRecord.Compatibility.COMPATIBLE_SUBSET, String.format("The sequence dictionary in %s is a subset of the %s reference sequence dictionary. Missing sequence(s): %s", refPair.getRef1AsString(), refPair.getRef2AsString(), getMissingSequencesIfSubset(dictionaries.get(refPair.getRef2())))); } else{ - referenceCompatibilities.add(new CompatibilityRecord(refPair.getRef2(), String.format("Not compatible. Status: %s. Run CompareReferences tool for more information on reference differences.", status))); + return new CompatibilityRecord(refPair.getRef2(), CompatibilityRecord.Compatibility.NOT_COMPATIBLE, String.format("Status: %s. Run CompareReferences tool for more information on reference differences.", status)); } } @@ -209,56 +224,58 @@ else if(status.contains(ReferencePair.Status.SUBSET) && status.size() == 1){ * @param referenceDict SAMSequenceDictionary to be compared against the key sequence file * @param referenceDictPath path to provided dictionary */ - private void evaluateCompatibility(SAMSequenceDictionary referenceDict, GATKPath referenceDictPath){ + private CompatibilityRecord evaluateCompatibilityWithoutMD5(SAMSequenceDictionary referenceDict, GATKPath referenceDictPath){ String referenceDictName = referenceDictPath.toPath().getFileName().toString(); - SequenceDictionaryUtils.SequenceDictionaryCompatibility compatibilityStatus = SequenceDictionaryUtils.compareDictionaries(referenceDict, dictionaryToCompare, false); + SequenceDictionaryUtils.SequenceDictionaryCompatibility compatibilityStatus = SequenceDictionaryUtils.compareDictionaries(referenceDict, queryDictionary, false); if(compatibilityStatus.equals(SequenceDictionaryUtils.SequenceDictionaryCompatibility.IDENTICAL)){ - referenceCompatibilities.add(new CompatibilityRecord(referenceDictPath, "All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.")); + return new CompatibilityRecord(referenceDictPath, CompatibilityRecord.Compatibility.COMPATIBLE, "All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references."); } else if(compatibilityStatus.equals(SequenceDictionaryUtils.SequenceDictionaryCompatibility.SUPERSET)){ - referenceCompatibilities.add(new CompatibilityRecord(referenceDictPath, String.format("All sequence names and lengths present in the sequence dictionaries match, but %s is a subset of %s. Missing sequence(s): %s. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.", dictionaryName, referenceDictName, getMissingSequencesIfSubset(referenceDict)))); + return new CompatibilityRecord(referenceDictPath, CompatibilityRecord.Compatibility.COMPATIBLE_SUBSET, String.format("All sequence names and lengths present in the sequence dictionaries match, but %s is a subset of %s. Missing sequence(s): %s. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.", queryDictionaryName, referenceDictName, getMissingSequencesIfSubset(referenceDict))); } else{ - referenceCompatibilities.add(new CompatibilityRecord(referenceDictPath, String.format("Not compatible. Status: %s. Run CompareReferences tool for more information on reference differences.", compatibilityStatus))); + return new CompatibilityRecord(referenceDictPath, CompatibilityRecord.Compatibility.NOT_COMPATIBLE, String.format("Status: %s. Run CompareReferences tool for more information on reference differences.", compatibilityStatus)); } } private List getMissingSequencesIfSubset(SAMSequenceDictionary referenceDict){ - Set commonSequences = SequenceDictionaryUtils.getCommonContigsByName(dictionaryToCompare, referenceDict); + Set commonSequences = SequenceDictionaryUtils.getCommonContigsByName(queryDictionary, referenceDict); List missingSequences = SequenceDictionaryUtils.getContigNamesList(referenceDict); missingSequences.removeAll(commonSequences); return missingSequences; } - @Override - public Object onTraversalSuccess() { - return super.onTraversalSuccess(); - } - - @Override - public void closeTool() { - super.closeTool(); - } - /** * Minimal class representing a record in the compatibility output table. */ private static class CompatibilityRecord { - GATKPath ref; - String compatibilityOutput; + private final GATKPath ref; + private final Compatibility status; + + private enum Compatibility{ + COMPATIBLE, + COMPATIBLE_SUBSET, + NOT_COMPATIBLE + } + private final String summary; - CompatibilityRecord(GATKPath ref, String compatibility){ + CompatibilityRecord(GATKPath ref, Compatibility status, String summary){ this.ref = ref; - compatibilityOutput = compatibility; + this.status = status; + this.summary = summary; } public String getRefAsString(){ return ref.toPath().getFileName().toString(); } - public String getCompatibility(){ - return compatibilityOutput; + public Compatibility getCompatibilityStatus(){ + return status; + } + + public String getSummary(){ + return summary; } } @@ -278,7 +295,8 @@ public CheckReferenceCompatibilityTableWriter(final Writer writer, TableColumnCo @Override protected void composeLine(final CompatibilityRecord record, final DataLine dataLine){ dataLine.set("Reference", record.getRefAsString()) - .set("Compatibility", record.getCompatibility()); + .set("Compatibility", record.getCompatibilityStatus().toString()) + .set("Summary", record.getSummary()); } } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/reference/ReferenceSequenceTable.java b/src/main/java/org/broadinstitute/hellbender/tools/reference/ReferenceSequenceTable.java index aaf6116155b..607bbc6cb5d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/reference/ReferenceSequenceTable.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/reference/ReferenceSequenceTable.java @@ -233,7 +233,7 @@ private String getMD5ForRecord(SAMSequenceRecord record, GATKPath referencePath) * * @return the list of ReferencePairs for every pair of references */ - public List generateReferencePairs(){ + public final List generateReferencePairs(){ List referencePairs = new ArrayList<>(); for(int i = 0; i < references.size(); i++){ for(int j = i + 1; j < references.size(); j++){ @@ -248,7 +248,7 @@ public List generateReferencePairs(){ * * @return the list of ReferencePairs */ - public List generateReferencePairs(GATKPath dictionary){ + public final List generateReferencePairsAgainstDictionary(GATKPath dictionary){ List referencePairs = new ArrayList<>(); for(int i = 0; i < references.size(); i++){ GATKPath currRef = references.get(i); @@ -274,7 +274,7 @@ public List compareAllReferences(){ * @return list of ReferencePairs keyed against the provided dictionary with updated status sets */ public List compareAgainstKeyReference(GATKPath dictionary){ - List refPairs = generateReferencePairs(dictionary); + List refPairs = generateReferencePairsAgainstDictionary(dictionary); return analyzeTable(refPairs); } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibilityIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibilityIntegrationTest.java index 6a0d31b4c82..1b14301087a 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibilityIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibilityIntegrationTest.java @@ -11,7 +11,7 @@ public class CheckReferenceCompatibilityIntegrationTest extends CommandLineProgramTest { - private final String COMPARE_REFERENCES_TEST_FILES_DIRECTORY = "src/test/resources/org/broadinstitute/hellbender/tools/reference/CompareReferences/"; + private final String COMPARE_REFERENCES_TEST_FILES_DIRECTORY = toolsTestDir + "/reference/CompareReferences/"; @DataProvider(name = "testReferenceCompatibilityBAMWithMD5sData") public Object[][] testReferenceCompatibilityBAMWithMD5sData() { @@ -97,13 +97,14 @@ public void testReferenceCompatibilityMultipleBAMs() throws IOException { public void testReferenceCompatibilityMultipleReferencesBAMWithMD5s() throws IOException { final File ref1 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini.fasta"); final File ref2 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_1renamed.fasta"); - final File ref3 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_chr2snp.fasta"); + final File ref3 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_missingchr1.fasta"); + final File ref4 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_chr2snp.fasta"); final File dict = new File(getToolTestDataDir() + "reads_data_source_test1_withmd5s_missingchr1.bam"); final File output = createTempFile("testReferenceCompatibilityMultipleReferencesWithMD5s", ".table"); final File expectedOutput = new File(getToolTestDataDir(), "expected.testReferenceCompatibilityMultipleReferencesBAMWithMD5s.table"); final String[] args = new String[] {"-refcomp", ref1.getAbsolutePath(), "-refcomp", ref2.getAbsolutePath(), - "-refcomp", ref3.getAbsolutePath(), "-I", dict.getAbsolutePath(), "-O", output.getAbsolutePath()}; + "-refcomp", ref3.getAbsolutePath(), "-refcomp", ref4.getAbsolutePath(), "-I", dict.getAbsolutePath(), "-O", output.getAbsolutePath()}; runCommandLine(args); IntegrationTestSpec.assertEqualTextFiles(output, expectedOutput); @@ -133,23 +134,27 @@ public void testReferenceCompatibilityMultipleReferencesBAMWithoutMD5s() throws public void testReferenceCompatibilityWithoutMD5sMismatch() throws IOException { final File ref1 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_chr2snp.fasta"); final File dict = new File(getToolTestDataDir() + "reads_data_source_test1_withoutmd5s.bam"); + final File output = createTempFile("testReferenceCompatibilityVCFWithMD5s", ".table"); + final File expectedOutput = new File(getToolTestDataDir(), "expected.testReferenceCompatibilityWithoutMD5Mismatch.table"); - final String[] args = new String[] {"-refcomp", ref1.getAbsolutePath() , "-I", dict.getAbsolutePath()}; + final String[] args = new String[] {"-refcomp", ref1.getAbsolutePath() , "-I", dict.getAbsolutePath(), "-O", output.getAbsolutePath()}; runCommandLine(args); + + IntegrationTestSpec.assertEqualTextFiles(output, expectedOutput); } - // TODO: compatibility based on MD5 faulty since MD5s not in sequence dictionary (see ticket #730 "VCFHeader drops sequence dictionary attributes") - @Test(enabled = false) + // TODO: compatibility based on MD5 faulty since MD5s not in sequence dictionary (https://github.com/samtools/htsjdk/issues/730) + @Test public void testReferenceCompatibilityVCFWithMD5s() throws IOException { - final File ref1 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_missingchr1.fasta"); - final File dict = new File(getToolTestDataDir() + "example_variants_withSequenceDict.vcf"); - //final File output = createTempFile("testReferenceCompatibilityVCFWithMD5s", ".table"); + final File ref1 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini.fasta"); + final File dict = new File(getToolTestDataDir() + "example_variants_withSequenceDict_withmd5.vcf"); + final File output = createTempFile("testReferenceCompatibilityVCFWithMD5s", ".table"); final File expectedOutput = new File(getToolTestDataDir(), "expected.testReferenceCompatibilityVCFWithMD5s.table"); - final String[] args = new String[] {"-refcomp", ref1.getAbsolutePath() , "-V", dict.getAbsolutePath()/*, "-O", expectedOutput.getAbsolutePath()*/}; + final String[] args = new String[] {"-refcomp", ref1.getAbsolutePath() , "-V", dict.getAbsolutePath(), "-O", output.getAbsolutePath()}; runCommandLine(args); - //IntegrationTestSpec.assertEqualTextFiles(output, expectedOutput); + IntegrationTestSpec.assertEqualTextFiles(output, expectedOutput); } @DataProvider(name = "testReferenceCompatibilityVCFWithoutMD5sData") @@ -187,19 +192,15 @@ public void testReferenceCompatibilityVCFWithoutMD5s(File ref1, File dict, File IntegrationTestSpec.assertEqualTextFiles(output, expectedOutput); } - // for quick stdout testing - @Test(enabled = false) - public void testStdOutput() throws IOException{ - final File ref1 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini.fasta"); - final File ref2 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_1renamed.fasta"); - final File ref3 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_chr2snp.fasta"); - final File dict = new File(getToolTestDataDir() + "reads_data_source_test1_withmd5s_missingchr1.bam"); - //final File output = createTempFile("testReferenceCompatibilityMultipleReferencesWithMD5s", ".table"); - final File expectedOutput = new File(getToolTestDataDir(), "expected.testReferenceCompatibilityMultipleReferencesBAMWithMD5s.table"); + @Test(expectedExceptions = UserException.BadInput.class) + public void testReferenceCompatibilityBAMandVCF() throws IOException { + final File ref1 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_chr2snp.fasta"); + final File bam = new File(getToolTestDataDir() + "reads_data_source_test1_withoutmd5s.bam"); + final File vcf = new File(getToolTestDataDir() + "example_variants_withSequenceDict_withoutmd5s.vcf"); - final String[] args = new String[] {"-refcomp", ref1.getAbsolutePath(), "-refcomp", ref2.getAbsolutePath(), - "-refcomp", ref3.getAbsolutePath(), "-I", dict.getAbsolutePath(), "-O", expectedOutput.getAbsolutePath()}; + final String[] args = new String[] {"-refcomp", ref1.getAbsolutePath() , "-I", bam.getAbsolutePath(), "-V", vcf.getAbsolutePath()}; runCommandLine(args); } + } \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/reference/ReferenceSequenceTableUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/reference/ReferenceSequenceTableUnitTest.java index cf3589826e0..50f0ff64986 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/reference/ReferenceSequenceTableUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/reference/ReferenceSequenceTableUnitTest.java @@ -3,6 +3,8 @@ import htsjdk.samtools.SAMSequenceDictionary; import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.engine.GATKPath; +import org.broadinstitute.hellbender.engine.ReadsDataSource; +import org.broadinstitute.hellbender.engine.ReadsPathDataSource; import org.broadinstitute.hellbender.engine.ReferenceDataSource; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -24,9 +26,11 @@ private ReferenceSequenceTable tableGenerator(List references, Compare return table; } - private ReferenceSequenceTable tableGenerator(List references){ + private ReferenceSequenceTable tableGenerator(GATKPath dictionaryPath, SAMSequenceDictionary dictionary, List references){ Map dictionaries = new LinkedHashMap<>(); - for(GATKPath path : references){ + + dictionaries.put(dictionaryPath, dictionary); + for (GATKPath path : references) { dictionaries.put(path, ReferenceDataSource.of(path.toPath()).getSequenceDictionary()); } @@ -128,7 +132,6 @@ public Object[][] testGenerateReferencePairsData() { }, }; } - @Test(dataProvider = "testGenerateReferencePairsData") public void testGenerateReferencePairs(List references, int expectedPairs){ ReferenceSequenceTable table = tableGenerator(references, CompareReferences.MD5CalculationMode.USE_DICT); @@ -136,6 +139,32 @@ public void testGenerateReferencePairs(List references, int expectedPa Assert.assertEquals(pairs.size(), expectedPairs); } + @DataProvider(name = "testGenerateReferencePairsAgainstDictionaryData") + public Object[][] testGenerateReferencePairsAgainstDictionaryData() { + return new Object[][]{ + // references, dictionary, expected number of pairs + new Object[]{ Arrays.asList(new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/reference/CompareReferences/hg19mini.fasta"), + new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/reference/CompareReferences/hg19mini_chr2snp.fasta")), + new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/reads_data_source_test1_withmd5s.bam"), + 2 + }, + new Object[]{ Arrays.asList(new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/reference/CompareReferences/hg19mini.fasta"), + new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/reference/CompareReferences/hg19mini_chr2snp.fasta"), + new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/reference/CompareReferences/hg19mini_1renamed.fasta")), + new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/reads_data_source_test1_withmd5s.bam"), + 3 + }, + }; + } + @Test(dataProvider = "testGenerateReferencePairsAgainstDictionaryData") + public void testGenerateReferencePairsAgainstDictionary(List references, GATKPath dictPath, int expectedPairs){ + ReadsDataSource readsDataSource = new ReadsPathDataSource(dictPath.toPath()); + SAMSequenceDictionary dictionary = readsDataSource.getSequenceDictionary(); + ReferenceSequenceTable table = tableGenerator(dictPath, dictionary, references); + List pairs = table.generateReferencePairsAgainstDictionary(dictPath); + Assert.assertEquals(pairs.size(), expectedPairs); + } + @DataProvider(name = "testAnalyzeTableTwoRefsData") public Object[][] testAnalyzeTableTwoReferencesData(){ return new Object[][]{ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithMD5s_exactmatch.table b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithMD5s_exactmatch.table index 99cfa09587a..bcafd1ee67a 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithMD5s_exactmatch.table +++ b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithMD5s_exactmatch.table @@ -1,3 +1,3 @@ #Current Reference: reads_data_source_test1_withmd5s.bam -Reference Compatibility -hg19mini.fasta Compatible, the sequence dictionaries exactly match +Reference Compatibility Summary +hg19mini.fasta COMPATIBLE The sequence dictionaries exactly match diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithMD5s_notcompatible.table b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithMD5s_notcompatible.table index 3ab355d2b8f..768aa521ac2 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithMD5s_notcompatible.table +++ b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithMD5s_notcompatible.table @@ -1,3 +1,3 @@ #Current Reference: reads_data_source_test1_withmd5s.bam -Reference Compatibility -hg19mini_missingchr3.fasta Not compatible. Status: [SUPERSET]. Run CompareReferences tool for more information on reference differences. +Reference Compatibility Summary +hg19mini_missingchr3.fasta NOT_COMPATIBLE Status: [SUPERSET]. Run CompareReferences tool for more information on reference differences. diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithMD5s_subset.table b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithMD5s_subset.table index 72e7eabb4d0..ced76d5e140 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithMD5s_subset.table +++ b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithMD5s_subset.table @@ -1,3 +1,3 @@ #Current Reference: reads_data_source_test1_withmd5s_missingchr1.bam -Reference Compatibility -hg19mini.fasta Compatible, the sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini.fasta reference sequence dictionary. Missing sequence(s): [1] +Reference Compatibility Summary +hg19mini.fasta COMPATIBLE_SUBSET The sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini.fasta reference sequence dictionary. Missing sequence(s): [1] diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithoutMD5s_compatible.table b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithoutMD5s_compatible.table index 75912a580f0..242e07e75ec 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithoutMD5s_compatible.table +++ b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithoutMD5s_compatible.table @@ -1,3 +1,3 @@ #Current Reference: reads_data_source_test1_withoutmd5s.bam -Reference Compatibility -hg19mini.fasta All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references. +Reference Compatibility Summary +hg19mini.fasta COMPATIBLE All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references. diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithoutMD5s_notcompatible.table b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithoutMD5s_notcompatible.table index 09d6931695d..f817aa9f883 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithoutMD5s_notcompatible.table +++ b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithoutMD5s_notcompatible.table @@ -1,3 +1,3 @@ #Current Reference: reads_data_source_test1_withoutmd5s.bam -Reference Compatibility -hg19mini_missingchr3.fasta Not compatible. Status: COMMON_SUBSET. Run CompareReferences tool for more information on reference differences. +Reference Compatibility Summary +hg19mini_missingchr3.fasta NOT_COMPATIBLE Status: COMMON_SUBSET. Run CompareReferences tool for more information on reference differences. diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithoutMD5s_subset.table b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithoutMD5s_subset.table index 0b2df3ab98c..c1f41e754c8 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithoutMD5s_subset.table +++ b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityBAMWithoutMD5s_subset.table @@ -1,3 +1,3 @@ #Current Reference: reads_data_source_test1_withoutmd5s_missingchr1.bam -Reference Compatibility -hg19mini.fasta All sequence names and lengths present in the sequence dictionaries match, but reads_data_source_test1_withoutmd5s_missingchr1.bam is a subset of hg19mini.fasta. Missing sequence(s): [1]. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references. +Reference Compatibility Summary +hg19mini.fasta COMPATIBLE_SUBSET All sequence names and lengths present in the sequence dictionaries match, but reads_data_source_test1_withoutmd5s_missingchr1.bam is a subset of hg19mini.fasta. Missing sequence(s): [1]. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references. diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityMultipleReferencesBAMWithMD5s.table b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityMultipleReferencesBAMWithMD5s.table index 670ae47a905..a3fd1405b0d 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityMultipleReferencesBAMWithMD5s.table +++ b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityMultipleReferencesBAMWithMD5s.table @@ -1,5 +1,6 @@ #Current Reference: reads_data_source_test1_withmd5s_missingchr1.bam -Reference Compatibility -hg19mini.fasta Compatible, the sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini.fasta reference sequence dictionary. Missing sequence(s): [1] -hg19mini_1renamed.fasta Compatible, the sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini_1renamed.fasta reference sequence dictionary. Missing sequence(s): [chr1] -hg19mini_chr2snp.fasta Not compatible. Status: [DIFFER_IN_SEQUENCE, DIFFER_IN_SEQUENCES_PRESENT]. Run CompareReferences tool for more information on reference differences. +Reference Compatibility Summary +hg19mini.fasta COMPATIBLE_SUBSET The sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini.fasta reference sequence dictionary. Missing sequence(s): [1] +hg19mini_1renamed.fasta COMPATIBLE_SUBSET The sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini_1renamed.fasta reference sequence dictionary. Missing sequence(s): [chr1] +hg19mini_missingchr1.fasta COMPATIBLE The sequence dictionaries exactly match +hg19mini_chr2snp.fasta NOT_COMPATIBLE Status: [DIFFER_IN_SEQUENCE, DIFFER_IN_SEQUENCES_PRESENT]. Run CompareReferences tool for more information on reference differences. diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityMultipleReferencesBAMWithoutMD5s.table b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityMultipleReferencesBAMWithoutMD5s.table index 3ef60afbe4b..a7955bd5f8a 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityMultipleReferencesBAMWithoutMD5s.table +++ b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityMultipleReferencesBAMWithoutMD5s.table @@ -1,5 +1,5 @@ #Current Reference: reads_data_source_test1_withoutmd5s.bam -Reference Compatibility -hg19mini.fasta All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references. -hg19mini_1renamed.fasta Not compatible. Status: COMMON_SUBSET. Run CompareReferences tool for more information on reference differences. -hg19mini_chr2snp.fasta All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references. +Reference Compatibility Summary +hg19mini.fasta COMPATIBLE All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references. +hg19mini_1renamed.fasta NOT_COMPATIBLE Status: COMMON_SUBSET. Run CompareReferences tool for more information on reference differences. +hg19mini_chr2snp.fasta COMPATIBLE All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references. diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithMD5s.table b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithMD5s.table index 2f952ef30cf..d2d3c994c87 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithMD5s.table +++ b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithMD5s.table @@ -1,3 +1,3 @@ -#Current Reference: example_variants_withSequenceDict.vcf -Reference Compatibility -hg19mini.fasta All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references. +#Current Reference: example_variants_withSequenceDict_withmd5.vcf +Reference Compatibility Summary +hg19mini.fasta COMPATIBLE All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references. diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithoutMD5s_match.table b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithoutMD5s_match.table index 384d5f32154..353715c5671 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithoutMD5s_match.table +++ b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithoutMD5s_match.table @@ -1,3 +1,3 @@ #Current Reference: example_variants_withSequenceDict_withoutmd5s.vcf -Reference Compatibility -hg19mini.fasta All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references. +Reference Compatibility Summary +hg19mini.fasta COMPATIBLE All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references. diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithoutMD5s_notcompatible.table b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithoutMD5s_notcompatible.table index c731f0731b5..7190d8d95ee 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithoutMD5s_notcompatible.table +++ b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithoutMD5s_notcompatible.table @@ -1,3 +1,3 @@ #Current Reference: example_variants_withSequenceDict_withoutmd5s.vcf -Reference Compatibility -hg19mini_missingchr3.fasta Not compatible. Status: COMMON_SUBSET. Run CompareReferences tool for more information on reference differences. +Reference Compatibility Summary +hg19mini_missingchr3.fasta NOT_COMPATIBLE Status: COMMON_SUBSET. Run CompareReferences tool for more information on reference differences. diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithoutMD5s_subset.table b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithoutMD5s_subset.table index 544be416685..0b7f650166a 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithoutMD5s_subset.table +++ b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityVCFWithoutMD5s_subset.table @@ -1,3 +1,3 @@ #Current Reference: example_variants_withSequenceDict_withoutmd5s_missingchr1.vcf -Reference Compatibility -hg19mini.fasta All sequence names and lengths present in the sequence dictionaries match, but example_variants_withSequenceDict_withoutmd5s_missingchr1.vcf is a subset of hg19mini.fasta. Missing sequence(s): [1]. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references. +Reference Compatibility Summary +hg19mini.fasta COMPATIBLE_SUBSET All sequence names and lengths present in the sequence dictionaries match, but example_variants_withSequenceDict_withoutmd5s_missingchr1.vcf is a subset of hg19mini.fasta. Missing sequence(s): [1]. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references. diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityWithoutMD5Mismatch.table b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityWithoutMD5Mismatch.table new file mode 100644 index 00000000000..d0b1d42ba79 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/expected.testReferenceCompatibilityWithoutMD5Mismatch.table @@ -0,0 +1,3 @@ +#Current Reference: reads_data_source_test1_withoutmd5s.bam +Reference Compatibility Summary +hg19mini_chr2snp.fasta COMPATIBLE All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.