Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ScatterIntervals produces interval_list instead of intervals #5392

Merged
merged 1 commit into from
Nov 6, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/cnn_variant_wdl/cnn_variant_common_tasks.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ task SplitIntervals {
}

output {
Array[File] interval_files = glob("*.intervals")
Array[File] interval_files = glob("*.interval_list")
}
}

Expand Down
4 changes: 2 additions & 2 deletions scripts/mutect2_wdl/mutect2.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ task SplitIntervals {
-scatter ${scatter_count} \
-O interval-files \
${split_intervals_extra_args}
cp interval-files/*.intervals .
cp interval-files/*.interval_list .
}

runtime {
Expand All @@ -502,7 +502,7 @@ task SplitIntervals {
}

output {
Array[File] interval_files = glob("*.intervals")
Array[File] interval_files = glob("*.interval_list")
}
}

Expand Down
4 changes: 2 additions & 2 deletions scripts/mutect2_wdl/mutect2_nio.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,7 @@ task SplitIntervals {
-scatter ${scatter_count} \
-O interval-files \
${split_intervals_extra_args}
cp interval-files/*.intervals .
cp interval-files/*.interval_list .
}

runtime {
Expand All @@ -447,7 +447,7 @@ task SplitIntervals {
}

output {
Array[File] interval_files = glob("*.intervals")
Array[File] interval_files = glob("*.interval_list")
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@
* </pre>
*
* <p>
* The -O argument specifies a directory name for the scatter intervals files. Each file will be named, e.g 0000-scattered.intervals,
* 0001-scattered.intervals, 0002-scattered.intervals and so on.
* The default --scatter_count is 1 and so this value should be changed to utilize the tool's functionality.
* The -O argument specifies a directory name for the scatter intervals files. Each file will be named, e.g 0000-scattered.interval_list,
* 0001-scattered.interval_list, 0002-scattered.interval_list and so on.
* The default --scatter-count is 1 and so this value should be changed to utilize the tool's functionality.
* Specify --subdivision-mode BALANCING_WITHOUT_INTERVAL_SUBDIVISION to avoid splitting input intervals -- that is, the set
* of input intervals is split, but individual intervals are left intact. This may affect results when using assembly-based callers downstream.
* </p>
Expand All @@ -63,6 +63,10 @@ public class SplitIntervals extends GATKTool {
public static final String SUBDIVISION_MODE_SHORT_NAME = "mode";
public static final String SUBDIVISION_MODE_lONG_NAME = "subdivision-mode";

public static final String INTERVAL_FILE_EXTENSION_FULL_NAME = "extension";

public static final String PICARD_INTERVAL_FILE_EXTENSION = "interval_list";
public static final String DEFAULT_EXTENSION = "-scattered." + PICARD_INTERVAL_FILE_EXTENSION;

@Argument(fullName = SCATTER_COUNT_LONG_NAME, shortName = SCATTER_COUNT_SHORT_NAME,
doc = "scatter count: number of output interval files to split into", optional = true)
Expand All @@ -76,6 +80,9 @@ public class SplitIntervals extends GATKTool {
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME)
public File outputDir;

@Argument(doc = "Extension to use when writing interval files", fullName = INTERVAL_FILE_EXTENSION_FULL_NAME, optional = true)
public String extension = DEFAULT_EXTENSION;

@Override
public void onTraversalStart() {
ParamUtils.isPositive(scatterCount, "scatter-count must be > 0.");
Expand All @@ -97,7 +104,7 @@ public void onTraversalStart() {
final List<IntervalList> scattered = scatterer.scatter(intervalList, scatterCount, false);

final DecimalFormat formatter = new DecimalFormat("0000");
IntStream.range(0, scattered.size()).forEach(n -> scattered.get(n).write(new File(outputDir, formatter.format(n) + "-scattered.intervals")));
IntStream.range(0, scattered.size()).forEach(n -> scattered.get(n).write(new File(outputDir, formatter.format(n) + extension)));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package org.broadinstitute.hellbender.tools.walkers;

import htsjdk.samtools.SAMSequenceRecord;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.utils.GenomeLocParser;
Expand All @@ -12,6 +10,8 @@
import org.testng.annotations.Test;

import java.io.File;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
Expand All @@ -26,6 +26,7 @@ public class SplitIntervalsIntegrationTest extends CommandLineProgramTest {
private static final Path REFERENCE = Paths.get(b37_reference_20_21);
private static final GenomeLocParser GLP = new GenomeLocParser(ReferenceDataSource.of(REFERENCE).getSequenceDictionary());


@Test
public void testOneInterval() {
final int scatterCount = 5;
Expand All @@ -37,8 +38,25 @@ public void testOneInterval() {
"-O", outputDir.getAbsolutePath()
};
runCommandLine(args);
verifyScatteredFilesExist(scatterCount, outputDir);
checkIntervalSizes(scatterCount, outputDir, 1000000);
verifyScatteredFilesExist(scatterCount, outputDir, SplitIntervals.DEFAULT_EXTENSION);
checkIntervalSizes(scatterCount, outputDir, 1000000, SplitIntervals.DEFAULT_EXTENSION);
}

@Test
public void testOneIntervalAlternateExtension() {
final int scatterCount = 5;
final File outputDir = createTempDir("output");
final String extension = "-scattered.with.a.wierd.extension";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Obviously Picard won't read this in, but would the GATK engine? I'm a little worried about giving users the (BB) gun to shoot themselves in the foot.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I figured we'd want a way to let users specify the old pattern if they want.

final String[] args = {
"-L", "20:1000000-2000000",
"-R", REFERENCE.toAbsolutePath().toString(),
"-" + SplitIntervals.SCATTER_COUNT_SHORT_NAME, Integer.toString(scatterCount),
"-O", outputDir.getAbsolutePath(),
"--extension", extension
};
runCommandLine(args);
verifyScatteredFilesExist(scatterCount, outputDir, extension);
checkIntervalSizes(scatterCount, outputDir, 1000000, extension);
}

@Test
Expand All @@ -52,8 +70,8 @@ public void testSingleScatter() {
"-O", outputDir.getAbsolutePath()
};
runCommandLine(args);
verifyScatteredFilesExist(scatterCount, outputDir);
checkIntervalSizes(scatterCount, outputDir, 1000000);
verifyScatteredFilesExist(scatterCount, outputDir, SplitIntervals.DEFAULT_EXTENSION);
checkIntervalSizes(scatterCount, outputDir, 1000000, SplitIntervals.DEFAULT_EXTENSION);

}

Expand All @@ -69,8 +87,8 @@ public void testTwoIntervals() {
"-O", outputDir.getAbsolutePath()
};
runCommandLine(args);
verifyScatteredFilesExist(scatterCount, outputDir);
checkIntervalSizes(scatterCount, outputDir, 2000000);
verifyScatteredFilesExist(scatterCount, outputDir, SplitIntervals.DEFAULT_EXTENSION);
checkIntervalSizes(scatterCount, outputDir, 2000000, SplitIntervals.DEFAULT_EXTENSION);

}

Expand All @@ -84,28 +102,28 @@ public void testNoIntervals() {
"-O", outputDir.getAbsolutePath()
};
runCommandLine(args);
verifyScatteredFilesExist(scatterCount, outputDir);
verifyScatteredFilesExist(scatterCount, outputDir, SplitIntervals.DEFAULT_EXTENSION);
final int totalLengthInRef = GLP.getSequenceDictionary().getSequences().stream().mapToInt(SAMSequenceRecord::getSequenceLength).sum();
checkIntervalSizes(scatterCount, outputDir, totalLengthInRef);
checkIntervalSizes(scatterCount, outputDir, totalLengthInRef, SplitIntervals.DEFAULT_EXTENSION);

}

private static Stream<File> getScatteredFiles(final int scatterCount, final File outputDir) {
return IntStream.range(0, scatterCount).mapToObj(n -> new File(outputDir, "000" + n + "-scattered.intervals"));
private static Stream<File> getScatteredFiles(final int scatterCount, final File outputDir, String extension) {
return IntStream.range(0, scatterCount).mapToObj(n -> new File(outputDir, "000" + n + extension));
}

private static void verifyScatteredFilesExist(final int scatterCount, final File outputDir) {
getScatteredFiles(scatterCount, outputDir).forEach(f -> Assert.assertTrue(f.exists()));
Assert.assertFalse(new File(outputDir, "000" + scatterCount + "-scattered.intervals").exists());
private static void verifyScatteredFilesExist(final int scatterCount, final File outputDir, String extension) {
getScatteredFiles(scatterCount, outputDir, extension).forEach(f -> Assert.assertTrue(f.exists()));
Assert.assertFalse(new File(outputDir, "000" + scatterCount + extension).exists());
}

private static List<SimpleInterval> readIntervals(final File intervalsFile) {
return IntervalUtils.intervalFileToList(GLP, intervalsFile.getAbsolutePath()).stream().map(SimpleInterval::new).collect(Collectors.toList());
}

private static void checkIntervalSizes(final int scatterCount, final File outputDir, final int expectedTotalLength) {
private static void checkIntervalSizes(final int scatterCount, final File outputDir, final int expectedTotalLength, String extension) {
final int splitLength = expectedTotalLength / scatterCount;
getScatteredFiles(scatterCount, outputDir).forEach(f -> Assert.assertEquals(readIntervals(f).stream().mapToInt(SimpleInterval::size).sum(), splitLength, 100));
getScatteredFiles(scatterCount, outputDir, extension).forEach(f -> Assert.assertEquals(readIntervals(f).stream().mapToInt(SimpleInterval::size).sum(), splitLength, 100));
}

}