From 1ac0038b0b4d36769b74518db2eb20102956441f Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Fri, 2 Nov 2018 17:48:14 -0400 Subject: [PATCH] ScatterIntervals produces interval_list instead of intervals * This matches what picard expects an interval list to be named. * Added a new --extension argument to allow changing the extension. * Fixes https://github.com/broadinstitute/gatk/issues/5390 --- .../cnn_variant_common_tasks.wdl | 2 +- scripts/mutect2_wdl/mutect2.wdl | 4 +- scripts/mutect2_wdl/mutect2_nio.wdl | 4 +- .../tools/walkers/SplitIntervals.java | 15 ++++-- .../SplitIntervalsIntegrationTest.java | 52 +++++++++++++------ 5 files changed, 51 insertions(+), 26 deletions(-) diff --git a/scripts/cnn_variant_wdl/cnn_variant_common_tasks.wdl b/scripts/cnn_variant_wdl/cnn_variant_common_tasks.wdl index f0b13165809..ddf690f8574 100644 --- a/scripts/cnn_variant_wdl/cnn_variant_common_tasks.wdl +++ b/scripts/cnn_variant_wdl/cnn_variant_common_tasks.wdl @@ -234,7 +234,7 @@ task SplitIntervals { } output { - Array[File] interval_files = glob("*.intervals") + Array[File] interval_files = glob("*.interval_list") } } diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index c238768fedd..dce0e8d52a3 100755 --- a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -488,7 +488,7 @@ task SplitIntervals { -scatter ${scatter_count} \ -O interval-files \ ${split_intervals_extra_args} - cp interval-files/*.intervals . + cp interval-files/*.interval_list . 
} runtime { @@ -502,7 +502,7 @@ task SplitIntervals { } output { - Array[File] interval_files = glob("*.intervals") + Array[File] interval_files = glob("*.interval_list") } } diff --git a/scripts/mutect2_wdl/mutect2_nio.wdl b/scripts/mutect2_wdl/mutect2_nio.wdl index 2f30bfd1b75..9ed7ad34524 100755 --- a/scripts/mutect2_wdl/mutect2_nio.wdl +++ b/scripts/mutect2_wdl/mutect2_nio.wdl @@ -433,7 +433,7 @@ task SplitIntervals { -scatter ${scatter_count} \ -O interval-files \ ${split_intervals_extra_args} - cp interval-files/*.intervals . + cp interval-files/*.interval_list . } runtime { @@ -447,7 +447,7 @@ task SplitIntervals { } output { - Array[File] interval_files = glob("*.intervals") + Array[File] interval_files = glob("*.interval_list") } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/SplitIntervals.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/SplitIntervals.java index 3fa46920299..a8221c60b9a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/SplitIntervals.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/SplitIntervals.java @@ -41,9 +41,9 @@ * * *

- * The -O argument specifies a directory name for the scatter intervals files. Each file will be named, e.g 0000-scattered.intervals, - * 0001-scattered.intervals, 0002-scattered.intervals and so on. - * The default --scatter_count is 1 and so this value should be changed to utilize the tool's functionality. + * The -O argument specifies a directory name for the scatter intervals files. By default each file will be named, e.g., 0000-scattered.interval_list, + * 0001-scattered.interval_list, 0002-scattered.interval_list and so on; the --extension argument replaces the -scattered.interval_list suffix. + * The default --scatter-count is 1 and so this value should be changed to utilize the tool's functionality. * Specify --subdivision-mode BALANCING_WITHOUT_INTERVAL_SUBDIVISION to avoid splitting input intervals -- that is, the set * of input intervals is split, but individual intervals are left intact. This may affect results when using assembly-based callers downstream. *

@@ -63,6 +63,10 @@ public class SplitIntervals extends GATKTool { public static final String SUBDIVISION_MODE_SHORT_NAME = "mode"; public static final String SUBDIVISION_MODE_lONG_NAME = "subdivision-mode"; + public static final String INTERVAL_FILE_EXTENSION_FULL_NAME = "extension"; + + public static final String PICARD_INTERVAL_FILE_EXTENSION = "interval_list"; + public static final String DEFAULT_EXTENSION = "-scattered." + PICARD_INTERVAL_FILE_EXTENSION; @Argument(fullName = SCATTER_COUNT_LONG_NAME, shortName = SCATTER_COUNT_SHORT_NAME, doc = "scatter count: number of output interval files to split into", optional = true) @@ -76,6 +80,9 @@ public class SplitIntervals extends GATKTool { shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME) public File outputDir; + @Argument(doc = "Extension to use when writing interval files", fullName = INTERVAL_FILE_EXTENSION_FULL_NAME, optional = true) + public String extension = DEFAULT_EXTENSION; + @Override public void onTraversalStart() { ParamUtils.isPositive(scatterCount, "scatter-count must be > 0."); @@ -97,7 +104,7 @@ public void onTraversalStart() { final List scattered = scatterer.scatter(intervalList, scatterCount, false); final DecimalFormat formatter = new DecimalFormat("0000"); - IntStream.range(0, scattered.size()).forEach(n -> scattered.get(n).write(new File(outputDir, formatter.format(n) + "-scattered.intervals"))); + IntStream.range(0, scattered.size()).forEach(n -> scattered.get(n).write(new File(outputDir, formatter.format(n) + extension))); } @Override diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/SplitIntervalsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/SplitIntervalsIntegrationTest.java index 765c507ebaf..a11dde00b94 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/SplitIntervalsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/SplitIntervalsIntegrationTest.java @@ -1,8 +1,6 @@ 
package org.broadinstitute.hellbender.tools.walkers; import htsjdk.samtools.SAMSequenceRecord; -import java.nio.file.Path; -import java.nio.file.Paths; import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.engine.ReferenceDataSource; import org.broadinstitute.hellbender.utils.GenomeLocParser; @@ -12,6 +10,8 @@ import org.testng.annotations.Test; import java.io.File; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.List; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -26,6 +26,7 @@ public class SplitIntervalsIntegrationTest extends CommandLineProgramTest { private static final Path REFERENCE = Paths.get(b37_reference_20_21); private static final GenomeLocParser GLP = new GenomeLocParser(ReferenceDataSource.of(REFERENCE).getSequenceDictionary()); + @Test public void testOneInterval() { final int scatterCount = 5; @@ -37,8 +38,25 @@ public void testOneInterval() { "-O", outputDir.getAbsolutePath() }; runCommandLine(args); - verifyScatteredFilesExist(scatterCount, outputDir); - checkIntervalSizes(scatterCount, outputDir, 1000000); + verifyScatteredFilesExist(scatterCount, outputDir, SplitIntervals.DEFAULT_EXTENSION); + checkIntervalSizes(scatterCount, outputDir, 1000000, SplitIntervals.DEFAULT_EXTENSION); + } + + @Test + public void testOneIntervalAlternateExtension() { + final int scatterCount = 5; + final File outputDir = createTempDir("output"); + final String extension = "-scattered.with.a.wierd.extension"; + final String[] args = { + "-L", "20:1000000-2000000", + "-R", REFERENCE.toAbsolutePath().toString(), + "-" + SplitIntervals.SCATTER_COUNT_SHORT_NAME, Integer.toString(scatterCount), + "-O", outputDir.getAbsolutePath(), + "--extension", extension + }; + runCommandLine(args); + verifyScatteredFilesExist(scatterCount, outputDir, extension); + checkIntervalSizes(scatterCount, outputDir, 1000000, extension); } @Test @@ -52,8 +70,8 @@ public void testSingleScatter() { 
"-O", outputDir.getAbsolutePath() }; runCommandLine(args); - verifyScatteredFilesExist(scatterCount, outputDir); - checkIntervalSizes(scatterCount, outputDir, 1000000); + verifyScatteredFilesExist(scatterCount, outputDir, SplitIntervals.DEFAULT_EXTENSION); + checkIntervalSizes(scatterCount, outputDir, 1000000, SplitIntervals.DEFAULT_EXTENSION); } @@ -69,8 +87,8 @@ public void testTwoIntervals() { "-O", outputDir.getAbsolutePath() }; runCommandLine(args); - verifyScatteredFilesExist(scatterCount, outputDir); - checkIntervalSizes(scatterCount, outputDir, 2000000); + verifyScatteredFilesExist(scatterCount, outputDir, SplitIntervals.DEFAULT_EXTENSION); + checkIntervalSizes(scatterCount, outputDir, 2000000, SplitIntervals.DEFAULT_EXTENSION); } @@ -84,28 +102,28 @@ public void testNoIntervals() { "-O", outputDir.getAbsolutePath() }; runCommandLine(args); - verifyScatteredFilesExist(scatterCount, outputDir); + verifyScatteredFilesExist(scatterCount, outputDir, SplitIntervals.DEFAULT_EXTENSION); final int totalLengthInRef = GLP.getSequenceDictionary().getSequences().stream().mapToInt(SAMSequenceRecord::getSequenceLength).sum(); - checkIntervalSizes(scatterCount, outputDir, totalLengthInRef); + checkIntervalSizes(scatterCount, outputDir, totalLengthInRef, SplitIntervals.DEFAULT_EXTENSION); } - private static Stream getScatteredFiles(final int scatterCount, final File outputDir) { - return IntStream.range(0, scatterCount).mapToObj(n -> new File(outputDir, "000" + n + "-scattered.intervals")); + private static Stream getScatteredFiles(final int scatterCount, final File outputDir, String extension) { + return IntStream.range(0, scatterCount).mapToObj(n -> new File(outputDir, "000" + n + extension)); } - private static void verifyScatteredFilesExist(final int scatterCount, final File outputDir) { - getScatteredFiles(scatterCount, outputDir).forEach(f -> Assert.assertTrue(f.exists())); - Assert.assertFalse(new File(outputDir, "000" + scatterCount + 
"-scattered.intervals").exists()); + private static void verifyScatteredFilesExist(final int scatterCount, final File outputDir, String extension) { + getScatteredFiles(scatterCount, outputDir, extension).forEach(f -> Assert.assertTrue(f.exists())); + Assert.assertFalse(new File(outputDir, "000" + scatterCount + extension).exists()); } private static List readIntervals(final File intervalsFile) { return IntervalUtils.intervalFileToList(GLP, intervalsFile.getAbsolutePath()).stream().map(SimpleInterval::new).collect(Collectors.toList()); } - private static void checkIntervalSizes(final int scatterCount, final File outputDir, final int expectedTotalLength) { + private static void checkIntervalSizes(final int scatterCount, final File outputDir, final int expectedTotalLength, String extension) { final int splitLength = expectedTotalLength / scatterCount; - getScatteredFiles(scatterCount, outputDir).forEach(f -> Assert.assertEquals(readIntervals(f).stream().mapToInt(SimpleInterval::size).sum(), splitLength, 100)); + getScatteredFiles(scatterCount, outputDir, extension).forEach(f -> Assert.assertEquals(readIntervals(f).stream().mapToInt(SimpleInterval::size).sum(), splitLength, 100)); } } \ No newline at end of file