Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cloud support for BuildBamIndex outputs. #1925

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions src/main/java/picard/nio/PicardHtsPath.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ public PicardHtsPath(final String rawInputString) {
super(rawInputString);
}

public PicardHtsPath(final String directory, final String file) {
super(directory + file);
}

/**
* Create a PicardHtsPath from an existing {@link HtsPath} or subclass.
*
Expand Down Expand Up @@ -179,10 +183,26 @@ public static PicardHtsPath replaceExtension(final IOPath path, final String new
return PicardHtsPath.fromPath(path.toPath().resolveSibling(newFileName));
}

public static PicardHtsPath replaceExtension(final IOPath path, final String newExtension){
return replaceExtension(path, newExtension, false);
}


/**
* Wrapper for Path.resolve()
*/
public static PicardHtsPath resolve(final PicardHtsPath absPath, final String relativePath){
return PicardHtsPath.fromPath(absPath.toPath().resolve(relativePath));
}

/**
* Wrapper for Path.resolveSibling()
*/
public static PicardHtsPath resolveSibling(final PicardHtsPath absPath, final String other){
return PicardHtsPath.fromPath(absPath.toPath().resolveSibling(other));
}

public boolean isLocalPath(){
return getScheme().equals(PicardBucketUtils.FILE_SCHEME);
}
}
62 changes: 31 additions & 31 deletions src/main/java/picard/sam/BuildBamIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,32 @@
package picard.sam;

import htsjdk.samtools.BAMIndexer;
import htsjdk.samtools.CRAMCRAIIndexer;
import htsjdk.samtools.CRAMIndexer;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.FileExtensions;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.utils.ValidationUtils;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import picard.nio.PicardHtsPath;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

/**
Expand All @@ -69,58 +77,50 @@ public class BuildBamIndex extends CommandLineProgram {
private static final Log log = Log.getInstance(BuildBamIndex.class);

@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
doc = "A BAM file or GA4GH URL to process. Must be sorted in coordinate order.")
doc = "A BAM file or GA4GH URL to process. Must be sorted in coordinate order.") // tsato: Is the GA4GH bit accurate?
public PicardHtsPath INPUT;

@Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
doc = "The BAM index file. Defaults to x.bai if INPUT is x.bam, otherwise INPUT.bai.\n" +
"If INPUT is a URL and OUTPUT is unspecified, defaults to a file in the current directory.", optional = true)
public File OUTPUT;
public PicardHtsPath OUTPUT;

/**
* Main method for the program. Checks that all input files are present and
* readable and that the output file can be written to. Then iterates through
* all the records generating a BAM Index, then writes the bai file.
*/
protected int doWork() {
final Path inputPath = INPUT.toPath();
ValidationUtils.validateArg(INPUT.hasExtension(FileExtensions.BAM) || INPUT.hasExtension(FileExtensions.CRAM),
"only BAM and CRAM files are supported. INPUT = " + INPUT);

// set default output file - input-file.bai
if (OUTPUT == null) {

final String baseFileName = inputPath.getFileName().toString();

// only BAI indices can be created for now, although CSI indices can be read as well
if (baseFileName.endsWith(FileExtensions.BAM)) {

final int index = baseFileName.lastIndexOf('.');
OUTPUT = new File(baseFileName.substring(0, index) + FileExtensions.BAI_INDEX);

} else {
OUTPUT = new File(baseFileName + FileExtensions.BAI_INDEX);
}
}

IOUtil.assertFileIsWritable(OUTPUT);
final SamReader bam;


bam = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE)
bam = SamReaderFactory.makeDefault().referenceSequence(referenceSequence.getReferencePath())
.disable(SamReaderFactory.Option.EAGERLY_DECODE)
.enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS)
.open(SamInputResource.of(inputPath));
.open(SamInputResource.of(INPUT.toPath()));
ValidationUtils.validateArg(bam.getFileHeader().getSortOrder().equals(SAMFileHeader.SortOrder.coordinate),
"Input bam file must be sorted by coordinates");

if (bam.type() != SamReader.Type.BAM_TYPE && bam.type() != SamReader.Type.BAM_CSI_TYPE) {
throw new SAMException("Input file must be bam file, not sam file.");
// set default output file - <input-file>.bai or <input-file>.crai
if (OUTPUT == null) {
final String extension = INPUT.hasExtension(FileExtensions.BAM) ? FileExtensions.BAI_INDEX : FileExtensions.CRAM_INDEX;
OUTPUT = PicardHtsPath.replaceExtension(INPUT, extension);
}

if (!bam.getFileHeader().getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
throw new SAMException("Input bam file must be sorted by coordinate");
if (bam.type() == SamReader.Type.BAM_TYPE || bam.type() == SamReader.Type.BAM_CSI_TYPE){
BAMIndexer.createIndex(bam, OUTPUT.toPath());
} else if (bam.type() == SamReader.Type.CRAM_TYPE){
try (SeekableStream seekableStream = new SeekablePathStream(INPUT.toPath())){
CRAMCRAIIndexer.writeIndex(seekableStream, Files.newOutputStream(OUTPUT.toPath()));
} catch (IOException e){
throw new SAMException("Unable to write the CRAM Index " + OUTPUT);
}
} else {
throw new PicardException("Unsupported file type: " + INPUT);
}

BAMIndexer.createIndex(bam, OUTPUT);

log.info("Successfully wrote bam index file " + OUTPUT);
log.info("Successfully wrote bam index file " + OUTPUT); // tsato: bam -> SAM
CloserUtil.close(bam);
return 0;
}
Expand Down
20 changes: 13 additions & 7 deletions src/main/java/picard/sam/SortSam.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloserUtil;
Expand All @@ -40,6 +41,8 @@
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import picard.nio.PicardBucketUtils;
import picard.nio.PicardHtsPath;

import java.io.File;

Expand Down Expand Up @@ -101,10 +104,10 @@ public class SortSam extends CommandLineProgram {

"<hr />";
@Argument(doc = "The SAM, BAM or CRAM file to sort.", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME)
public File INPUT;
public PicardHtsPath INPUT;

@Argument(doc = "The sorted SAM, BAM or CRAM output file. ", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME)
public File OUTPUT;
public PicardHtsPath OUTPUT;

// note that SortOrder here is a local enum, not the SamFileHeader version.
@Argument(shortName = StandardOptionDefinitions.SORT_ORDER_SHORT_NAME, doc = "Sort order of output file. ")
Expand Down Expand Up @@ -149,12 +152,15 @@ public String getHelpDoc() {


protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
;
IOUtil.assertFileIsReadable(INPUT.toPath());
if (INPUT.getScheme().equals(PicardBucketUtils.FILE_SCHEME)){
IOUtil.assertFileIsWritable(OUTPUT.toPath());
}

final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(referenceSequence.getReferencePath()).open(SamInputResource.of(INPUT.toPath())); // tsato: needs to be wrapped in SamInputResource.of() ??

reader.getFileHeader().setSortOrder(SORT_ORDER.getSortOrder());
final SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(reader.getFileHeader(), false, OUTPUT, REFERENCE_SEQUENCE);
final SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(reader.getFileHeader(), false, OUTPUT.toPath(), referenceSequence.getReferencePath());
writer.setProgressLogger(
new ProgressLogger(log, (int) 1e7, "Wrote", "records from a sorting collection"));

Expand Down
147 changes: 120 additions & 27 deletions src/test/java/picard/sam/BuildBamIndexTest.java
Original file line number Diff line number Diff line change
@@ -1,72 +1,165 @@
package picard.sam;

import htsjdk.beta.io.IOPathUtils;
import htsjdk.io.IOPath;
import htsjdk.samtools.CRAMCRAIIndexer;
import htsjdk.samtools.SAMException;
import org.apache.commons.io.FileUtils;
import htsjdk.samtools.cram.CRAIEntry;
import htsjdk.samtools.cram.CRAIIndex;
import org.testng.Assert;
import org.testng.annotations.AfterTest;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import picard.cmdline.CommandLineProgramTest;
import picard.nio.PicardBucketUtils;
import picard.nio.PicardHtsPath;
import picard.nio.PicardIOUtils;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

public class BuildBamIndexTest extends CommandLineProgramTest {

private static final File TEST_DATA_DIR = new File("testdata/picard/indices/");
private static final PicardHtsPath INPUT_FILE = new PicardHtsPath(new File(TEST_DATA_DIR, "index_test.sam").getPath());
private static final File OUTPUT_SORTED_FILE = new File(TEST_DATA_DIR, "index_test_sorted.bam");
private static final File OUTPUT_UNSORTED_FILE = new File(TEST_DATA_DIR, "/index_test_unsorted.bam");
private static final File OUTPUT_INDEX_FILE = new File(TEST_DATA_DIR, "/index_test.bam.bai");
private static final PicardHtsPath INPUT_UNSORTED_SAM = new PicardHtsPath(new File(TEST_DATA_DIR, "index_test.sam"));
private static final File EXPECTED_BAI_FILE = new File(TEST_DATA_DIR, "index_test_b.bam.bai");

private static final String CLOUD_TEST_DATA_DIR = "gs://hellbender/test/resources/picard/BuildBamIndex/";
private static final String CLOUD_TEST_OUTPUT_DIR = "gs://hellbender-test-logs/staging/picard/test/BuildBamIndex/";
// tsato: shouldn't we have a constructor for this?
private static final PicardHtsPath INPUT_UNSORTED_SAM_CLOUD = new PicardHtsPath(CLOUD_TEST_DATA_DIR, "index_test.sam");

// tsato: replace with variables defined in the other branches once they merge
private static final PicardHtsPath SORTED_CRAM_CLOUD = new PicardHtsPath("gs://hellbender/test/resources/picard/BuildBamIndex/CEUTrio.HiSeq.WGS.b37.NA12878.20.21_n10000.cram");
// tsato: this directory missing a crai...
private static final PicardHtsPath SORTED_CRAM = new PicardHtsPath("/Users/tsato/workspace/picard/testdata/picard/test/CEUTrio.HiSeq.WGS.b37.NA12878.20.21_n10000.cram");
private static final PicardHtsPath SORTED_CRAM2 = new PicardHtsPath("/Users/tsato/workspace/picard/testdata/picard/test/CEUTrio.HiSeq.WGS.b37.NA12878.20.21_n10000.cram");



public String getCommandLineProgramName() { return BuildBamIndex.class.getSimpleName(); }

@DataProvider(name = "buildBamIndexTestData")
public Object[][] getBuildBamIndexTestData(){
return new Object[][]{
{INPUT_UNSORTED_SAM, true},
{INPUT_UNSORTED_SAM, false},
{INPUT_UNSORTED_SAM_CLOUD, true},
{INPUT_UNSORTED_SAM_CLOUD, false}};
}

// tsato: must add cram test...

// Test that the index file for a sorted BAM is created
@Test
public void testBuildBamIndexOK() throws IOException {
final List<String> args = new ArrayList<>();
/* First sort, before indexing */
@Test(dataProvider = "buildBamIndexTestData")
public void testBuildBamIndexOK(final PicardHtsPath inputUnsortedSam, final boolean specifyOutput) throws IOException {
final boolean cloud = ! inputUnsortedSam.isLocalPath();
final String prefix = "index_test_sorted";
final PicardHtsPath sortedBAM = cloud ? PicardBucketUtils.getTempFilePath(CLOUD_TEST_OUTPUT_DIR, prefix,".bam") :
PicardBucketUtils.getTempFilePath(null, prefix, ".bam");

/* First sort, before indexing */ // tsato: do we need to do this dynamically?
new SortSam().instanceMain(new String[]{
"I=" + INPUT_FILE,
"O=" + OUTPUT_SORTED_FILE,
"I=" + inputUnsortedSam,
"O=" + sortedBAM,
"SORT_ORDER=coordinate"});

args.add("INPUT=" + OUTPUT_SORTED_FILE);
args.add("OUTPUT=" + OUTPUT_INDEX_FILE);
final List<String> args = new ArrayList<>();
args.add("INPUT=" + sortedBAM);
final PicardHtsPath indexOutput;

if (specifyOutput) {
indexOutput = cloud ? PicardBucketUtils.getTempFilePath(CLOUD_TEST_OUTPUT_DIR, prefix, ".bai") :
PicardBucketUtils.getTempFilePath(null, prefix,".bai");
args.add("OUTPUT=" + indexOutput);
} else {
indexOutput = PicardHtsPath.replaceExtension(sortedBAM, ".bai", false);
PicardIOUtils.deleteOnExit(indexOutput.toPath());
}

runPicardCommandLine(args);
Assert.assertEquals(FileUtils.readFileToByteArray(OUTPUT_INDEX_FILE), FileUtils.readFileToByteArray(EXPECTED_BAI_FILE));
Assert.assertEquals(Files.readAllBytes(indexOutput.toPath()), Files.readAllBytes(EXPECTED_BAI_FILE.toPath()));
}

// Test that the index creation fails when presented with a SAM file
@Test(expectedExceptions = SAMException.class)
public void testBuildSamIndexFail() {
final List<String> args = new ArrayList<>();
args.add("INPUT=" + INPUT_FILE);
args.add("OUTPUT=" + OUTPUT_INDEX_FILE);
args.add("INPUT=" + INPUT_UNSORTED_SAM);
runPicardCommandLine(args);
}

// Test that the index creation fails when presented with an unsorted BAM file
@Test(expectedExceptions = SAMException.class)
public void testBuildBamIndexFail() {
final List<String> args = new ArrayList<>();
final IOPath unsortedBAM = IOPathUtils.createTempPath("index_test_sorted", ".bam");
new SamFormatConverter().instanceMain(new String[]{
"INPUT=" + INPUT_FILE,
"OUTPUT=" + OUTPUT_UNSORTED_FILE});
"INPUT=" + INPUT_UNSORTED_SAM,
"OUTPUT=" + unsortedBAM});

args.add("INPUT=" + OUTPUT_UNSORTED_FILE);
args.add("OUTPUT=" + OUTPUT_INDEX_FILE);
final List<String> args = new ArrayList<>();
args.add("INPUT=" + unsortedBAM);
runPicardCommandLine(args);
}

@AfterTest
public void cleanup() throws IOException {
FileUtils.forceDeleteOnExit(OUTPUT_INDEX_FILE);
FileUtils.forceDeleteOnExit(OUTPUT_SORTED_FILE);
FileUtils.forceDeleteOnExit(OUTPUT_UNSORTED_FILE);
@DataProvider(name = "cramTestData")
public Object[][] getCramTestData(){
// tsato: replace with variable
final PicardHtsPath localRef = new PicardHtsPath("/Users/tsato/workspace/picard/testdata/picard/reference/human_g1k_v37.20.21.fasta");
final PicardHtsPath cloudRef = new PicardHtsPath("gs://hellbender/test/resources/picard/references/human_g1k_v37.20.21.fasta");

return new Object[][]{
{SORTED_CRAM, localRef}};
// {SORTED_CRAM, cloudRef}}; // ,
// {SORTED_CRAM_CLOUD, localRef}, // tsato: these are too slow, disable for now.
// {SORTED_CRAM_CLOUD, cloudRef},
}

@Test(dataProvider = "cramTestData")
public void testCram(final PicardHtsPath cram, final PicardHtsPath reference) throws IOException {
final List<String> args = new ArrayList<>();
args.add("INPUT=" + cram);
args.add("REFERENCE_SEQUENCE=" + reference);

final PicardHtsPath indexOutput;
final String prefix = "BuildBamIndex_cram_test";

final boolean specifyOutput = false; // for now
if (specifyOutput) {
indexOutput = PicardBucketUtils.getTempFilePath(CLOUD_TEST_OUTPUT_DIR, prefix, ".crai");
args.add("OUTPUT=" + indexOutput);
} else {
indexOutput = PicardHtsPath.replaceExtension(cram, ".crai", false);
// tsato: temporarily disable while investigating cram index created this way
// PicardIOUtils.deleteOnExit(indexOutput.toPath());
}

runPicardCommandLine(args);

final CRAIIndex craiIndex = CRAMCRAIIndexer.readIndex(Files.newInputStream(indexOutput.toPath()));
final List<CRAIEntry> entries = craiIndex.getCRAIEntries();
// Let's start with this

// *** CORRECT ONES ***
final CRAIIndex craiIndex2 = CRAMCRAIIndexer.readIndex(
Files.newInputStream(new File("/Users/tsato/workspace/picard/testdata/picard/test/CEUTrio.HiSeq.WGS.b37.NA12878.20.21_n10000.cram.crai.samtools_save").toPath()));
final List<CRAIEntry> entries2 = craiIndex2.getCRAIEntries();
// *** END CORRECT ONES ***

Assert.assertEquals(entries, entries2); // better test but uses a premade samtools index
Assert.assertEquals(entries.size(), 2); // tsato: more crude test;

Files.delete(indexOutput.toPath()); // tsato: temporary
}

// From https://github.com/samtools/htsjdk/issues/1084
@Test
public void testStdin() throws IOException {
final InputStream inputStream = Files.newInputStream(Paths.get("/dev/stdin/"));
inputStream.available();
}
}
Loading
Loading