Write splitting-bai files when writing (non-sharded) BAM files from Spark. (#2169)

Splitting-bai files record the split points in a BAM file so that it can be
read in parallel without having to heuristically search for suitable split
points.
tomwhite authored and lbergelson committed Sep 15, 2016
1 parent 7882c82 commit a30af5a
Showing 2 changed files with 16 additions and 2 deletions.
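
For orientation, a minimal usage sketch of what this change enables (not part
of the commit). The class and method names below are hypothetical; the calls
they combine — ReadsSparkSink.writeReads, IOUtils.isBamFileName,
IOUtils.getPath, and SplittingBAMIndexer.OUTPUT_FILE_EXTENSION — all appear in
the diff below, and the parameter types are assumed from the test code:

    import java.io.IOException;
    import java.nio.file.Files;

    import htsjdk.samtools.SAMFileHeader;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSink;
    import org.broadinstitute.hellbender.utils.io.IOUtils;
    import org.broadinstitute.hellbender.utils.read.GATKRead;
    import org.broadinstitute.hellbender.utils.read.ReadsWriteFormat;
    import org.seqdoop.hadoop_bam.SplittingBAMIndexer;

    final class SplittingBaiSketch {
        // Hypothetical helper: write reads as a single (non-sharded) BAM and
        // report whether the companion splitting-bai index appeared next to it.
        static boolean writeAndCheckSplittingBai(final JavaSparkContext ctx,
                                                 final String outputPath,
                                                 final String referenceFile,
                                                 final JavaRDD<GATKRead> reads,
                                                 final SAMFileHeader header) throws IOException {
            ReadsSparkSink.writeReads(ctx, outputPath, referenceFile, reads, header, ReadsWriteFormat.SINGLE);
            // With this commit, a SINGLE-format ".bam" output also gets a
            // companion index at outputPath + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION.
            return IOUtils.isBamFileName(outputPath)
                    && Files.exists(IOUtils.getPath(outputPath + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION));
        }
    }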
@@ -141,7 +141,7 @@ public static void writeReads(
         String absoluteReferenceFile = referenceFile != null ?
                 BucketUtils.makeFilePathAbsolute(referenceFile) :
                 referenceFile;
-        setHadoopBAMConfigurationProperties(ctx, absoluteOutputFile, absoluteReferenceFile);
+        setHadoopBAMConfigurationProperties(ctx, absoluteOutputFile, absoluteReferenceFile, format);
 
         // The underlying reads are required to be in SAMRecord format in order to be
         // written out, so we convert them to SAMRecord explicitly here. If they're already
@@ -293,11 +293,17 @@ private static void deleteHadoopFile(String fileToObliterate, Configuration conf
      * from passing a stale value through to htsjdk when multiple calls are made serially
      * with different outputs but the same Spark context
      */
-    private static void setHadoopBAMConfigurationProperties(final JavaSparkContext ctx, final String outputName, final String referenceName) {
+    private static void setHadoopBAMConfigurationProperties(final JavaSparkContext ctx, final String outputName,
+                                                            final String referenceName, final ReadsWriteFormat format) {
         final Configuration conf = ctx.hadoopConfiguration();
 
         if (!IOUtils.isCramFileName(outputName)) { // only set the reference for CRAM output
             conf.unset(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY);
+            if (format == ReadsWriteFormat.SINGLE && IOUtils.isBamFileName(outputName)) {
+                conf.setBoolean(BAMOutputFormat.WRITE_SPLITTING_BAI, true);
+            } else {
+                conf.setBoolean(BAMOutputFormat.WRITE_SPLITTING_BAI, false);
+            }
         }
         else {
             if (null == referenceName) {
@@ -14,11 +14,13 @@
 import org.broadinstitute.hellbender.engine.spark.SparkContextFactory;
 import org.broadinstitute.hellbender.exceptions.GATKException;
 import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
+import org.broadinstitute.hellbender.utils.io.IOUtils;
 import org.broadinstitute.hellbender.utils.read.GATKRead;
 import org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator;
 import org.broadinstitute.hellbender.utils.read.ReadsWriteFormat;
 import org.broadinstitute.hellbender.utils.test.BaseTest;
 import org.broadinstitute.hellbender.utils.test.MiniClusterUtils;
+import org.seqdoop.hadoop_bam.SplittingBAMIndexer;
 import org.testng.Assert;
 import org.testng.annotations.AfterClass;
 import org.testng.annotations.BeforeClass;
@@ -27,6 +29,7 @@
 
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Files;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
@@ -122,6 +125,11 @@ private void assertSingleShardedWritingWorks(String inputBam, String referenceFi
 
         ReadsSparkSink.writeReads(ctx, outputPath, referenceFile, rddParallelReads, header, ReadsWriteFormat.SINGLE);
 
+        // check that a splitting bai file is created
+        if (IOUtils.isBamFileName(outputPath)) {
+            Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION)));
+        }
+
         JavaRDD<GATKRead> rddParallelReads2 = readSource.getParallelReads(outputPath, referenceFile);
         final List<GATKRead> writtenReads = rddParallelReads2.collect();
 
