Skip to content

Commit

Permalink
GenomicsDB matches CombineGVCFs with input spanning deletions (#5397)
Browse files Browse the repository at this point in the history
* The newest release of GenomicsDB treats spanning deletions (deletions that
start at an earlier position and span the current one) as deletions when
computing the minimum PL value. This now matches the behavior of CombineGVCFs.

A more detailed description of the issue is provided in
#4963

* Deleted a couple of files which are no longer necessary.

* Fixed the index of newMQcalc.combined.g.vcf

* Fix for #5300, which occurs when multiple reader threads are used in the
importer. This is not a race condition in GenomicsDB; rather,
InitializedQueryWrapper wasn't written to handle multiple intervals.
  • Loading branch information
kgururaj authored and droazen committed Jan 16, 2019
1 parent 9b3ee90 commit 05aa6b2
Show file tree
Hide file tree
Showing 8 changed files with 139 additions and 11,650 deletions.
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ final sparkVersion = System.getProperty('spark.version', '2.2.0')
final hadoopVersion = System.getProperty('hadoop.version', '2.8.2')
final disqVersion = System.getProperty('disq.version','0.1.0')
final tensorflowVersion = System.getProperty('tensorflow.version','1.9.0')
final genomicsdbVersion = System.getProperty('genomicsdb.version','0.10.0-proto-3.0.0-beta-1+bdce8be25b873')
final genomicsdbVersion = System.getProperty('genomicsdb.version','0.10.2-proto-3.0.0-beta-1+90dad1af8ce0e4d')
final testNGVersion = '6.11'
// Using the shaded version to avoid conflicts between its protobuf dependency
// and that of Hadoop/Spark (either the one we reference explicitly, or the one
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ public final class GenomicsDBImport extends GATKTool {
@Argument(fullName = VCF_INITIALIZER_THREADS_LONG_NAME,
shortName = VCF_INITIALIZER_THREADS_LONG_NAME,
doc = "How many simultaneous threads to use when opening VCFs in batches; higher values may improve performance " +
"when network latency is an issue",
"when network latency is an issue. Multiple reader threads are not supported when running with multiple intervals.",
optional = true,
minValue = 1)
private int vcfInitializerThreads = 1;
Expand Down Expand Up @@ -499,11 +499,18 @@ public void onTraversalStart() {

private void initializeInputPreloadExecutorService() {
if( vcfInitializerThreads > 1) {
final ThreadFactory threadFactory = new ThreadFactoryBuilder()
if( intervals.size() == 1) {
final ThreadFactory threadFactory = new ThreadFactoryBuilder()
.setNameFormat("readerInitializer-thread-%d")
.setDaemon(true)
.build();
this.inputPreloadExecutorService = Executors.newFixedThreadPool(vcfInitializerThreads, threadFactory);
this.inputPreloadExecutorService = Executors.newFixedThreadPool(vcfInitializerThreads, threadFactory);
}
else {
logger.warn("GenomicsDBImport cannot use multiple VCF reader threads for initialization when the "
+ "number of intervals is greater than 1. Falling back to serial VCF reader initialization.");
inputPreloadExecutorService = null;
}
} else {
inputPreloadExecutorService = null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ public final class GenomicsDBImportIntegrationTest extends CommandLineProgramTes
private static final String NA_12878_PHASED = largeFileTestDir + "NA12878.phasedData.Chr20.vcf"; //NOTE: this is not phased according to the vcf spec but it reflects phasing currently produced by haplotype caller
private static final String MULTIPLOID_DATA_HG37 = largeFileTestDir + "gvcfs/HapMap5plex.ploidy10.b37.g.vcf";
private static final String NA12878_HG37 = toolsTestDir + "GenomicsDBImport/expected.testGVCFMode.gatk4.g.vcf";
//This file was generated by running CombineGVCFs on the input files
//./gatk CombineGVCFs -V src/test/resources/org/broadinstitute/hellbender/tools/GenomicsDBImport/expected.testGVCFMode.gatk4.g.vcf -V src/test/resources/large/gvcfs/HapMap5plex.ploidy10.b37.g.vcf -R src/test/resources/large/human_g1k_v37.20.21.fasta -L 20:10000000-10100000 -O src/test/resources/org/broadinstitute/hellbender/tools/GenomicsDBImport/expected.testGenomicsDBImportWithNonDiploidData.vcf
private static final String MULTIPLOID_EXPECTED_RESULT = toolsTestDir + "GenomicsDBImport/expected.testGenomicsDBImportWithNonDiploidData.vcf";
private static final String MNP_GVCF = toolsTestDir + "GenomicsDBImport/mnp.input.g.vcf";
private static final String ARTIFICIAL_PHASED = getTestDataDir() + "/ArtificalPhasedData.1.g.vcf";
Expand Down Expand Up @@ -112,8 +114,8 @@ public final class GenomicsDBImportIntegrationTest extends CommandLineProgramTes
private static final String SAMPLE_NAME_KEY = "SN";
private static final String ANOTHER_ATTRIBUTE_KEY = "AA";

private static final List<String> GVCFS_WITH_NEW_MQ = Arrays.asList(toolsTestDir + "/GenomicsDBImport/expected.testGVCFMode.gatk4.g.vcf", getTestDataDir() + "/walkers/CombineGVCFs/YRIoffspring.chr20snippet.g.vcf");
private static final String COMBINED_WITH_NEW_MQ = toolsTestDir + "/walkers/GenomicsDBImport/newMQcalc.combined.g.vcf";
private static final List<String> GVCFS_WITH_NEW_MQ = Arrays.asList(NA12878_HG37, getTestDataDir() + "/walkers/CombineGVCFs/YRIoffspring.chr20snippet.g.vcf");
private static final String COMBINED_WITH_NEW_MQ = toolsTestDir + "/walkers/CombineGVCFs/newMQcalc.combined.g.vcf";
private static final List<SimpleInterval> INTERVAL2 = Arrays.asList(new SimpleInterval("20", 1, 11_000_000));
private static final List<String> ATTRIBUTES_TO_IGNORE = Arrays.asList("RAW_MQ","RAW_MQandDP"); //CombineGVCFs doesn't support the old RAW_MQ anymore

Expand Down Expand Up @@ -170,6 +172,12 @@ public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleInterval
testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS, b38_reference_20_21, new String[0]);
}

/**
 * Compares GenomicsDB output against CombineGVCFs for the local GVCF inputs over
 * multiple intervals, asking the importer for more than one VCF reader thread.
 *
 * @throws IOException propagated from the shared comparison helper
 */
@Test
public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleIntervalsWithMultipleThreads() throws IOException {
    // No additional arguments are passed to CombineGVCFs for this comparison.
    final String[] extraCombineGVCFArgs = new String[0];
    // Number of VCF reader threads requested from the importer.
    final int readerThreads = 4;
    testGenomicsDBAgainstCombineGVCFs(
            LOCAL_GVCFS,
            MULTIPLE_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS,
            b38_reference_20_21,
            extraCombineGVCFArgs,
            readerThreads);
}

@Test
public void testGenomicsDBImportFileInputsAgainstCombineGVCFWithMultipleNonAdjacentIntervals() throws IOException {
testGenomicsDBAgainstCombineGVCFs(LOCAL_GVCFS, MULTIPLE_NON_ADJACENT_INTERVALS_THAT_WORK_WITH_COMBINE_GVCFS,
Expand Down Expand Up @@ -288,9 +296,15 @@ private File runCombineGVCFs(final List<String> inputs, final List<SimpleInterva

/**
 * Convenience overload that runs the GenomicsDB-vs-CombineGVCFs comparison with a
 * single VCF reader thread in the importer.
 *
 * @param vcfInputs       paths of the input VCFs
 * @param intervals       intervals to import and compare
 * @param referenceFile   reference passed through to the comparison
 * @param CombineGVCFArgs extra arguments forwarded to the five-argument overload
 * @throws IOException propagated from the five-argument overload
 */
private void testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals,
                                               final String referenceFile, final String[] CombineGVCFArgs) throws IOException {
    // Default: one reader thread, i.e. no parallel VCF reader initialization is requested.
    final int singleReaderThread = 1;
    testGenomicsDBAgainstCombineGVCFs(vcfInputs, intervals, referenceFile, CombineGVCFArgs, singleReaderThread);
}

private void testGenomicsDBAgainstCombineGVCFs(final List<String> vcfInputs, final List<SimpleInterval> intervals,
final String referenceFile, final String[] CombineGVCFArgs,
final int numVCFReaderThreadsInImporter) throws IOException {
final String workspace = createTempDir("genomicsdb-tests-").getAbsolutePath() + "/workspace";

writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, 1);
writeToGenomicsDB(vcfInputs, intervals, workspace, 0, false, 0, numVCFReaderThreadsInImporter);
checkJSONFilesAreWritten(workspace);
for(SimpleInterval currInterval : intervals) {
List<SimpleInterval> tmpList = new ArrayList<SimpleInterval>(Arrays.asList(currInterval));
Expand Down
Loading

0 comments on commit 05aa6b2

Please sign in to comment.