Skip to content

Commit

Permalink
ScatterIntervals produces interval_list instead of intervals
Browse files Browse the repository at this point in the history
* This matches what picard expects an interval list to be named.
* Added a new --extension argument to allow changing the extension.
* Fixes #5390
  • Loading branch information
lbergelson committed Nov 2, 2018
1 parent 1db9cd5 commit 8c508a0
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@
* </pre>
*
* <p>
* The -O argument specifies a directory name for the scatter intervals files. Each file will be named, e.g 0000-scattered.intervals,
* 0001-scattered.intervals, 0002-scattered.intervals and so on.
* The -O argument specifies a directory name for the scatter intervals files. Each file will be named, e.g 0000-scattered.interval_list,
* 0001-scattered.interval_list, 0002-scattered.interval_list and so on.

This comment has been minimized.

Copy link
@samuelklee

samuelklee Nov 3, 2018

Contributor

Some vestigial snake_case in the line below this.

This comment has been minimized.

Copy link
@lbergelson

lbergelson Nov 5, 2018

Author Member

good catch

* The default --scatter_count is 1 and so this value should be changed to utilize the tool's functionality.
* Specify --subdivision-mode BALANCING_WITHOUT_INTERVAL_SUBDIVISION to avoid splitting input intervals -- that is, the set
* of input intervals is split, but individual intervals are left intact. This may affect results when using assembly-based callers downstream.
Expand All @@ -63,6 +63,10 @@ public class SplitIntervals extends GATKTool {
public static final String SUBDIVISION_MODE_SHORT_NAME = "mode";
public static final String SUBDIVISION_MODE_lONG_NAME = "subdivision-mode";

public static final String INTERVAL_FILE_EXTENSION_FULL_NAME = "extension";

public static final String PICARD_INTERVAL_FILE_EXTENSION = "interval_list";
public static final String DEFAULT_EXTENSION = "-scattered." + PICARD_INTERVAL_FILE_EXTENSION;

@Argument(fullName = SCATTER_COUNT_LONG_NAME, shortName = SCATTER_COUNT_SHORT_NAME,
doc = "scatter count: number of output interval files to split into", optional = true)
Expand All @@ -76,6 +80,9 @@ public class SplitIntervals extends GATKTool {
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME)
public File outputDir;

@Argument(doc = "Extension to use when writing interval files", fullName = INTERVAL_FILE_EXTENSION_FULL_NAME, optional = true)
public String extension = DEFAULT_EXTENSION;

@Override
public void onTraversalStart() {
ParamUtils.isPositive(scatterCount, "scatter-count must be > 0.");
Expand All @@ -97,7 +104,7 @@ public void onTraversalStart() {
final List<IntervalList> scattered = scatterer.scatter(intervalList, scatterCount, false);

final DecimalFormat formatter = new DecimalFormat("0000");
IntStream.range(0, scattered.size()).forEach(n -> scattered.get(n).write(new File(outputDir, formatter.format(n) + "-scattered.intervals")));
IntStream.range(0, scattered.size()).forEach(n -> scattered.get(n).write(new File(outputDir, formatter.format(n) + extension)));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package org.broadinstitute.hellbender.tools.walkers;

import htsjdk.samtools.SAMSequenceRecord;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.utils.GenomeLocParser;
Expand All @@ -12,6 +10,8 @@
import org.testng.annotations.Test;

import java.io.File;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
Expand All @@ -26,6 +26,7 @@ public class SplitIntervalsIntegrationTest extends CommandLineProgramTest {
private static final Path REFERENCE = Paths.get(b37_reference_20_21);
private static final GenomeLocParser GLP = new GenomeLocParser(ReferenceDataSource.of(REFERENCE).getSequenceDictionary());


@Test
public void testOneInterval() {
final int scatterCount = 5;
Expand All @@ -37,8 +38,25 @@ public void testOneInterval() {
"-O", outputDir.getAbsolutePath()
};
runCommandLine(args);
verifyScatteredFilesExist(scatterCount, outputDir);
checkIntervalSizes(scatterCount, outputDir, 1000000);
verifyScatteredFilesExist(scatterCount, outputDir, SplitIntervals.DEFAULT_EXTENSION);
checkIntervalSizes(scatterCount, outputDir, 1000000, SplitIntervals.DEFAULT_EXTENSION);
}

@Test
public void testOneIntervalAlternateExtension() {
final int scatterCount = 5;
final File outputDir = createTempDir("output");
final String extension = "-scattered.with.a.wierd.extension";
final String[] args = {
"-L", "20:1000000-2000000",
"-R", REFERENCE.toAbsolutePath().toString(),
"-" + SplitIntervals.SCATTER_COUNT_SHORT_NAME, Integer.toString(scatterCount),
"-O", outputDir.getAbsolutePath(),
"--extension", extension
};
runCommandLine(args);
verifyScatteredFilesExist(scatterCount, outputDir, extension);
checkIntervalSizes(scatterCount, outputDir, 1000000, extension);
}

@Test
Expand All @@ -52,8 +70,8 @@ public void testSingleScatter() {
"-O", outputDir.getAbsolutePath()
};
runCommandLine(args);
verifyScatteredFilesExist(scatterCount, outputDir);
checkIntervalSizes(scatterCount, outputDir, 1000000);
verifyScatteredFilesExist(scatterCount, outputDir, SplitIntervals.DEFAULT_EXTENSION);
checkIntervalSizes(scatterCount, outputDir, 1000000, SplitIntervals.DEFAULT_EXTENSION);

}

Expand All @@ -69,8 +87,8 @@ public void testTwoIntervals() {
"-O", outputDir.getAbsolutePath()
};
runCommandLine(args);
verifyScatteredFilesExist(scatterCount, outputDir);
checkIntervalSizes(scatterCount, outputDir, 2000000);
verifyScatteredFilesExist(scatterCount, outputDir, SplitIntervals.DEFAULT_EXTENSION);
checkIntervalSizes(scatterCount, outputDir, 2000000, SplitIntervals.DEFAULT_EXTENSION);

}

Expand All @@ -84,28 +102,28 @@ public void testNoIntervals() {
"-O", outputDir.getAbsolutePath()
};
runCommandLine(args);
verifyScatteredFilesExist(scatterCount, outputDir);
verifyScatteredFilesExist(scatterCount, outputDir, SplitIntervals.DEFAULT_EXTENSION);
final int totalLengthInRef = GLP.getSequenceDictionary().getSequences().stream().mapToInt(SAMSequenceRecord::getSequenceLength).sum();
checkIntervalSizes(scatterCount, outputDir, totalLengthInRef);
checkIntervalSizes(scatterCount, outputDir, totalLengthInRef, SplitIntervals.DEFAULT_EXTENSION);

}

private static Stream<File> getScatteredFiles(final int scatterCount, final File outputDir) {
return IntStream.range(0, scatterCount).mapToObj(n -> new File(outputDir, "000" + n + "-scattered.intervals"));
private static Stream<File> getScatteredFiles(final int scatterCount, final File outputDir, String extension) {
return IntStream.range(0, scatterCount).mapToObj(n -> new File(outputDir, "000" + n + extension));
}

private static void verifyScatteredFilesExist(final int scatterCount, final File outputDir) {
getScatteredFiles(scatterCount, outputDir).forEach(f -> Assert.assertTrue(f.exists()));
Assert.assertFalse(new File(outputDir, "000" + scatterCount + "-scattered.intervals").exists());
private static void verifyScatteredFilesExist(final int scatterCount, final File outputDir, String extension) {
getScatteredFiles(scatterCount, outputDir, extension).forEach(f -> Assert.assertTrue(f.exists()));
Assert.assertFalse(new File(outputDir, "000" + scatterCount + extension).exists());
}

private static List<SimpleInterval> readIntervals(final File intervalsFile) {
return IntervalUtils.intervalFileToList(GLP, intervalsFile.getAbsolutePath()).stream().map(SimpleInterval::new).collect(Collectors.toList());
}

private static void checkIntervalSizes(final int scatterCount, final File outputDir, final int expectedTotalLength) {
private static void checkIntervalSizes(final int scatterCount, final File outputDir, final int expectedTotalLength, String extension) {
final int splitLength = expectedTotalLength / scatterCount;
getScatteredFiles(scatterCount, outputDir).forEach(f -> Assert.assertEquals(readIntervals(f).stream().mapToInt(SimpleInterval::size).sum(), splitLength, 100));
getScatteredFiles(scatterCount, outputDir, extension).forEach(f -> Assert.assertEquals(readIntervals(f).stream().mapToInt(SimpleInterval::size).sum(), splitLength, 100));
}

}

0 comments on commit 8c508a0

Please sign in to comment.