Skip to content

Commit

Permalink
responding to review comments, also added unit test for generateRefer…
Browse files Browse the repository at this point in the history
…encePairsAgainstDictionary()
  • Loading branch information
orlicohen committed Jul 27, 2022
1 parent 7e3e1fc commit 5648af9
Show file tree
Hide file tree
Showing 17 changed files with 167 additions and 115 deletions.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ private String getMD5ForRecord(SAMSequenceRecord record, GATKPath referencePath)
*
* @return the list of ReferencePairs for every pair of references
*/
public List<ReferencePair> generateReferencePairs(){
public final List<ReferencePair> generateReferencePairs(){
List<ReferencePair> referencePairs = new ArrayList<>();
for(int i = 0; i < references.size(); i++){
for(int j = i + 1; j < references.size(); j++){
Expand All @@ -248,7 +248,7 @@ public List<ReferencePair> generateReferencePairs(){
*
* @return the list of ReferencePairs
*/
public List<ReferencePair> generateReferencePairs(GATKPath dictionary){
public final List<ReferencePair> generateReferencePairsAgainstDictionary(GATKPath dictionary){
List<ReferencePair> referencePairs = new ArrayList<>();
for(int i = 0; i < references.size(); i++){
GATKPath currRef = references.get(i);
Expand All @@ -274,7 +274,7 @@ public List<ReferencePair> compareAllReferences(){
* @return list of ReferencePairs keyed against the provided dictionary with updated status sets
*/
public List<ReferencePair> compareAgainstKeyReference(GATKPath dictionary){
List<ReferencePair> refPairs = generateReferencePairs(dictionary);
List<ReferencePair> refPairs = generateReferencePairsAgainstDictionary(dictionary);
return analyzeTable(refPairs);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

public class CheckReferenceCompatibilityIntegrationTest extends CommandLineProgramTest {

private final String COMPARE_REFERENCES_TEST_FILES_DIRECTORY = "src/test/resources/org/broadinstitute/hellbender/tools/reference/CompareReferences/";
private final String COMPARE_REFERENCES_TEST_FILES_DIRECTORY = toolsTestDir + "/reference/CompareReferences/";

@DataProvider(name = "testReferenceCompatibilityBAMWithMD5sData")
public Object[][] testReferenceCompatibilityBAMWithMD5sData() {
Expand Down Expand Up @@ -97,13 +97,14 @@ public void testReferenceCompatibilityMultipleBAMs() throws IOException {
public void testReferenceCompatibilityMultipleReferencesBAMWithMD5s() throws IOException {
final File ref1 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini.fasta");
final File ref2 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_1renamed.fasta");
final File ref3 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_chr2snp.fasta");
final File ref3 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_missingchr1.fasta");
final File ref4 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_chr2snp.fasta");
final File dict = new File(getToolTestDataDir() + "reads_data_source_test1_withmd5s_missingchr1.bam");
final File output = createTempFile("testReferenceCompatibilityMultipleReferencesWithMD5s", ".table");
final File expectedOutput = new File(getToolTestDataDir(), "expected.testReferenceCompatibilityMultipleReferencesBAMWithMD5s.table");

final String[] args = new String[] {"-refcomp", ref1.getAbsolutePath(), "-refcomp", ref2.getAbsolutePath(),
"-refcomp", ref3.getAbsolutePath(), "-I", dict.getAbsolutePath(), "-O", output.getAbsolutePath()};
"-refcomp", ref3.getAbsolutePath(), "-refcomp", ref4.getAbsolutePath(), "-I", dict.getAbsolutePath(), "-O", output.getAbsolutePath()};
runCommandLine(args);

IntegrationTestSpec.assertEqualTextFiles(output, expectedOutput);
Expand Down Expand Up @@ -133,23 +134,27 @@ public void testReferenceCompatibilityMultipleReferencesBAMWithoutMD5s() throws
public void testReferenceCompatibilityWithoutMD5sMismatch() throws IOException {
final File ref1 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_chr2snp.fasta");
final File dict = new File(getToolTestDataDir() + "reads_data_source_test1_withoutmd5s.bam");
final File output = createTempFile("testReferenceCompatibilityVCFWithMD5s", ".table");
final File expectedOutput = new File(getToolTestDataDir(), "expected.testReferenceCompatibilityWithoutMD5Mismatch.table");

final String[] args = new String[] {"-refcomp", ref1.getAbsolutePath() , "-I", dict.getAbsolutePath()};
final String[] args = new String[] {"-refcomp", ref1.getAbsolutePath() , "-I", dict.getAbsolutePath(), "-O", output.getAbsolutePath()};
runCommandLine(args);

IntegrationTestSpec.assertEqualTextFiles(output, expectedOutput);
}

// TODO: compatibility based on MD5 faulty since MD5s not in sequence dictionary (see ticket #730 "VCFHeader drops sequence dictionary attributes")
@Test(enabled = false)
// TODO: compatibility based on MD5 faulty since MD5s not in sequence dictionary (https://github.com/samtools/htsjdk/issues/730)
@Test
public void testReferenceCompatibilityVCFWithMD5s() throws IOException {
final File ref1 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_missingchr1.fasta");
final File dict = new File(getToolTestDataDir() + "example_variants_withSequenceDict.vcf");
//final File output = createTempFile("testReferenceCompatibilityVCFWithMD5s", ".table");
final File ref1 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini.fasta");
final File dict = new File(getToolTestDataDir() + "example_variants_withSequenceDict_withmd5.vcf");
final File output = createTempFile("testReferenceCompatibilityVCFWithMD5s", ".table");
final File expectedOutput = new File(getToolTestDataDir(), "expected.testReferenceCompatibilityVCFWithMD5s.table");

final String[] args = new String[] {"-refcomp", ref1.getAbsolutePath() , "-V", dict.getAbsolutePath()/*, "-O", expectedOutput.getAbsolutePath()*/};
final String[] args = new String[] {"-refcomp", ref1.getAbsolutePath() , "-V", dict.getAbsolutePath(), "-O", output.getAbsolutePath()};
runCommandLine(args);

//IntegrationTestSpec.assertEqualTextFiles(output, expectedOutput);
IntegrationTestSpec.assertEqualTextFiles(output, expectedOutput);
}

@DataProvider(name = "testReferenceCompatibilityVCFWithoutMD5sData")
Expand Down Expand Up @@ -187,19 +192,15 @@ public void testReferenceCompatibilityVCFWithoutMD5s(File ref1, File dict, File
IntegrationTestSpec.assertEqualTextFiles(output, expectedOutput);
}

// for quick stdout testing
@Test(enabled = false)
public void testStdOutput() throws IOException{
final File ref1 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini.fasta");
final File ref2 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_1renamed.fasta");
final File ref3 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_chr2snp.fasta");
final File dict = new File(getToolTestDataDir() + "reads_data_source_test1_withmd5s_missingchr1.bam");
//final File output = createTempFile("testReferenceCompatibilityMultipleReferencesWithMD5s", ".table");
final File expectedOutput = new File(getToolTestDataDir(), "expected.testReferenceCompatibilityMultipleReferencesBAMWithMD5s.table");
@Test(expectedExceptions = UserException.BadInput.class)
public void testReferenceCompatibilityBAMandVCF() throws IOException {
final File ref1 = new File(COMPARE_REFERENCES_TEST_FILES_DIRECTORY + "hg19mini_chr2snp.fasta");
final File bam = new File(getToolTestDataDir() + "reads_data_source_test1_withoutmd5s.bam");
final File vcf = new File(getToolTestDataDir() + "example_variants_withSequenceDict_withoutmd5s.vcf");

final String[] args = new String[] {"-refcomp", ref1.getAbsolutePath(), "-refcomp", ref2.getAbsolutePath(),
"-refcomp", ref3.getAbsolutePath(), "-I", dict.getAbsolutePath(), "-O", expectedOutput.getAbsolutePath()};
final String[] args = new String[] {"-refcomp", ref1.getAbsolutePath() , "-I", bam.getAbsolutePath(), "-V", vcf.getAbsolutePath()};
runCommandLine(args);
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import htsjdk.samtools.SAMSequenceDictionary;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReadsDataSource;
import org.broadinstitute.hellbender.engine.ReadsPathDataSource;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
Expand All @@ -24,9 +26,11 @@ private ReferenceSequenceTable tableGenerator(List<GATKPath> references, Compare
return table;
}

private ReferenceSequenceTable tableGenerator(List<GATKPath> references){
private ReferenceSequenceTable tableGenerator(GATKPath dictionaryPath, SAMSequenceDictionary dictionary, List<GATKPath> references){
Map<GATKPath, SAMSequenceDictionary> dictionaries = new LinkedHashMap<>();
for(GATKPath path : references){

dictionaries.put(dictionaryPath, dictionary);
for (GATKPath path : references) {
dictionaries.put(path, ReferenceDataSource.of(path.toPath()).getSequenceDictionary());
}

Expand Down Expand Up @@ -128,14 +132,39 @@ public Object[][] testGenerateReferencePairsData() {
},
};
}

@Test(dataProvider = "testGenerateReferencePairsData")
public void testGenerateReferencePairs(List<GATKPath> references, int expectedPairs){
ReferenceSequenceTable table = tableGenerator(references, CompareReferences.MD5CalculationMode.USE_DICT);
List<ReferencePair> pairs = table.generateReferencePairs();
Assert.assertEquals(pairs.size(), expectedPairs);
}

@DataProvider(name = "testGenerateReferencePairsAgainstDictionaryData")
public Object[][] testGenerateReferencePairsAgainstDictionaryData() {
return new Object[][]{
// references, dictionary, expected number of pairs
new Object[]{ Arrays.asList(new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/reference/CompareReferences/hg19mini.fasta"),
new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/reference/CompareReferences/hg19mini_chr2snp.fasta")),
new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/reads_data_source_test1_withmd5s.bam"),
2
},
new Object[]{ Arrays.asList(new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/reference/CompareReferences/hg19mini.fasta"),
new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/reference/CompareReferences/hg19mini_chr2snp.fasta"),
new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/reference/CompareReferences/hg19mini_1renamed.fasta")),
new GATKPath("src/test/resources/org/broadinstitute/hellbender/tools/reference/CheckReferenceCompatibility/reads_data_source_test1_withmd5s.bam"),
3
},
};
}
@Test(dataProvider = "testGenerateReferencePairsAgainstDictionaryData")
public void testGenerateReferencePairsAgainstDictionary(List<GATKPath> references, GATKPath dictPath, int expectedPairs){
ReadsDataSource readsDataSource = new ReadsPathDataSource(dictPath.toPath());
SAMSequenceDictionary dictionary = readsDataSource.getSequenceDictionary();
ReferenceSequenceTable table = tableGenerator(dictPath, dictionary, references);
List<ReferencePair> pairs = table.generateReferencePairsAgainstDictionary(dictPath);
Assert.assertEquals(pairs.size(), expectedPairs);
}

@DataProvider(name = "testAnalyzeTableTwoRefsData")
public Object[][] testAnalyzeTableTwoReferencesData(){
return new Object[][]{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#Current Reference: reads_data_source_test1_withmd5s.bam
Reference Compatibility
hg19mini.fasta Compatible, the sequence dictionaries exactly match
Reference Compatibility Summary
hg19mini.fasta COMPATIBLE The sequence dictionaries exactly match
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#Current Reference: reads_data_source_test1_withmd5s.bam
Reference Compatibility
hg19mini_missingchr3.fasta Not compatible. Status: [SUPERSET]. Run CompareReferences tool for more information on reference differences.
Reference Compatibility Summary
hg19mini_missingchr3.fasta NOT_COMPATIBLE Status: [SUPERSET]. Run CompareReferences tool for more information on reference differences.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#Current Reference: reads_data_source_test1_withmd5s_missingchr1.bam
Reference Compatibility
hg19mini.fasta Compatible, the sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini.fasta reference sequence dictionary. Missing sequence(s): [1]
Reference Compatibility Summary
hg19mini.fasta COMPATIBLE_SUBSET The sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini.fasta reference sequence dictionary. Missing sequence(s): [1]
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#Current Reference: reads_data_source_test1_withoutmd5s.bam
Reference Compatibility
hg19mini.fasta All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.
Reference Compatibility Summary
hg19mini.fasta COMPATIBLE All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#Current Reference: reads_data_source_test1_withoutmd5s.bam
Reference Compatibility
hg19mini_missingchr3.fasta Not compatible. Status: COMMON_SUBSET. Run CompareReferences tool for more information on reference differences.
Reference Compatibility Summary
hg19mini_missingchr3.fasta NOT_COMPATIBLE Status: COMMON_SUBSET. Run CompareReferences tool for more information on reference differences.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#Current Reference: reads_data_source_test1_withoutmd5s_missingchr1.bam
Reference Compatibility
hg19mini.fasta All sequence names and lengths present in the sequence dictionaries match, but reads_data_source_test1_withoutmd5s_missingchr1.bam is a subset of hg19mini.fasta. Missing sequence(s): [1]. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.
Reference Compatibility Summary
hg19mini.fasta COMPATIBLE_SUBSET All sequence names and lengths present in the sequence dictionaries match, but reads_data_source_test1_withoutmd5s_missingchr1.bam is a subset of hg19mini.fasta. Missing sequence(s): [1]. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#Current Reference: reads_data_source_test1_withmd5s_missingchr1.bam
Reference Compatibility
hg19mini.fasta Compatible, the sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini.fasta reference sequence dictionary. Missing sequence(s): [1]
hg19mini_1renamed.fasta Compatible, the sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini_1renamed.fasta reference sequence dictionary. Missing sequence(s): [chr1]
hg19mini_chr2snp.fasta Not compatible. Status: [DIFFER_IN_SEQUENCE, DIFFER_IN_SEQUENCES_PRESENT]. Run CompareReferences tool for more information on reference differences.
Reference Compatibility Summary
hg19mini.fasta COMPATIBLE_SUBSET The sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini.fasta reference sequence dictionary. Missing sequence(s): [1]
hg19mini_1renamed.fasta COMPATIBLE_SUBSET The sequence dictionary in reads_data_source_test1_withmd5s_missingchr1.bam is a subset of the hg19mini_1renamed.fasta reference sequence dictionary. Missing sequence(s): [chr1]
hg19mini_missingchr1.fasta COMPATIBLE The sequence dictionaries exactly match
hg19mini_chr2snp.fasta NOT_COMPATIBLE Status: [DIFFER_IN_SEQUENCE, DIFFER_IN_SEQUENCES_PRESENT]. Run CompareReferences tool for more information on reference differences.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#Current Reference: reads_data_source_test1_withoutmd5s.bam
Reference Compatibility
hg19mini.fasta All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.
hg19mini_1renamed.fasta Not compatible. Status: COMMON_SUBSET. Run CompareReferences tool for more information on reference differences.
hg19mini_chr2snp.fasta All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.
Reference Compatibility Summary
hg19mini.fasta COMPATIBLE All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.
hg19mini_1renamed.fasta NOT_COMPATIBLE Status: COMMON_SUBSET. Run CompareReferences tool for more information on reference differences.
hg19mini_chr2snp.fasta COMPATIBLE All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#Current Reference: example_variants_withSequenceDict.vcf
Reference Compatibility
hg19mini.fasta All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.
#Current Reference: example_variants_withSequenceDict_withmd5.vcf
Reference Compatibility Summary
hg19mini.fasta COMPATIBLE All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#Current Reference: example_variants_withSequenceDict_withoutmd5s.vcf
Reference Compatibility
hg19mini.fasta All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.
Reference Compatibility Summary
hg19mini.fasta COMPATIBLE All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#Current Reference: example_variants_withSequenceDict_withoutmd5s.vcf
Reference Compatibility
hg19mini_missingchr3.fasta Not compatible. Status: COMMON_SUBSET. Run CompareReferences tool for more information on reference differences.
Reference Compatibility Summary
hg19mini_missingchr3.fasta NOT_COMPATIBLE Status: COMMON_SUBSET. Run CompareReferences tool for more information on reference differences.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#Current Reference: example_variants_withSequenceDict_withoutmd5s_missingchr1.vcf
Reference Compatibility
hg19mini.fasta All sequence names and lengths present in the sequence dictionaries match, but example_variants_withSequenceDict_withoutmd5s_missingchr1.vcf is a subset of hg19mini.fasta. Missing sequence(s): [1]. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.
Reference Compatibility Summary
hg19mini.fasta COMPATIBLE_SUBSET All sequence names and lengths present in the sequence dictionaries match, but example_variants_withSequenceDict_withoutmd5s_missingchr1.vcf is a subset of hg19mini.fasta. Missing sequence(s): [1]. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#Current Reference: reads_data_source_test1_withoutmd5s.bam
Reference Compatibility Summary
hg19mini_chr2snp.fasta COMPATIBLE All sequence names and lengths match in the sequence dictionaries. Since the MD5s are lacking, we can't confirm there aren't mismatching bases in the references.

0 comments on commit 5648af9

Please sign in to comment.