From 8140531a090d6dbbca91f97ae9e8794cb5ecf66d Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Wed, 6 Mar 2019 11:06:00 -0500 Subject: [PATCH] Updating htsjdk, disq, and Picard. * Updating htsjdk 2.18.2 -> 2.19.0 * Updating disq 0.2.0 -> 0.3.0 * Disabling test that was relying on broken behavior that was fixed in htsjdk * Removing code that has migrated to htsjdk --- build.gradle | 6 +- .../spark/CreateHadoopBamSplittingIndex.java | 4 +- .../utils/SingleSequenceReferenceAligner.java | 9 +- .../walkers/fasta/FastaReferenceMaker.java | 8 +- .../walkers/genotyper/GenotypingEngine.java | 3 +- .../io/DeleteRecursivelyOnExitPathHook.java | 2 +- .../hellbender/utils/io/IOUtils.java | 13 +- .../utils/reference/FastaReferenceWriter.java | 743 ------------------ .../datasources/ReadsSparkSinkUnitTest.java | 2 +- .../testers/MarkDuplicatesSparkTester.java | 7 +- .../IndexFeatureFileIntegrationTest.java | 3 +- ...adoopBamSplittingIndexIntegrationTest.java | 2 +- .../FastaReferenceWriterUnitTest.java | 518 ------------ 13 files changed, 25 insertions(+), 1295 deletions(-) delete mode 100644 src/main/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriter.java delete mode 100644 src/test/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriterUnitTest.java diff --git a/build.gradle b/build.gradle index e5c0b2d7d9a..2689b797d21 100644 --- a/build.gradle +++ b/build.gradle @@ -58,12 +58,12 @@ repositories { } final requiredJavaVersion = "8" -final htsjdkVersion = System.getProperty('htsjdk.version','2.18.2') -final picardVersion = System.getProperty('picard.version','2.18.25') +final htsjdkVersion = System.getProperty('htsjdk.version','2.19.0') +final picardVersion = System.getProperty('picard.version','2.19.0') final barclayVersion = System.getProperty('barclay.version','2.1.0') final sparkVersion = System.getProperty('spark.version', '2.2.0') final hadoopVersion = System.getProperty('hadoop.version', '2.8.2') -final disqVersion = System.getProperty('disq.version','0.2.0') +final disqVersion = System.getProperty('disq.version','0.3.0') final genomicsdbVersion = System.getProperty('genomicsdb.version','1.0.0-rc2') final testNGVersion = '6.11' // Using the shaded version to avoid conflicts between its protobuf dependency diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java index 3841f9ca313..5942b209d5f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java @@ -125,7 +125,7 @@ private static void createBaiAndSplittingIndex(final File inputBam, final File i assertBamIsCoordinateSorted(header); final SBIIndexWriter indexer = new SBIIndexWriter(out, granularity); - final BAMIndexer bamIndexer = new BAMIndexer(IOUtils.replaceExtension(index, BAMIndex.BAMIndexSuffix), header); + final BAMIndexer bamIndexer = new BAMIndexer(IOUtils.replaceExtension(index, BAMIndex.BAI_INDEX_SUFFIX), header); BAMFileSpan lastFilePointer = null; for(final SAMRecord read : reader){ BAMFileSpan filePointer = (BAMFileSpan) read.getFileSource().getFilePointer(); @@ -149,7 +149,7 @@ private static void createBaiAndSplittingIndex(final File inputBam, final File i private static void assertBamIsCoordinateSorted(final SAMFileHeader header) { if( header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) { - throw new UserException.BadInput("Cannot create a " + BAMIndex.BAMIndexSuffix + " index for a file " + + throw new UserException.BadInput("Cannot create a " + BAMIndex.BAI_INDEX_SUFFIX + " index for a file " + "that isn't coordinate sorted."); } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java index 80804acd73b..be7336caee3 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SingleSequenceReferenceAligner.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.tools.spark.sv.utils; import htsjdk.samtools.SAMFlag; +import htsjdk.samtools.reference.FastaReferenceWriter; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContig; import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval; @@ -8,16 +9,10 @@ import org.broadinstitute.hellbender.utils.bwa.BwaMemAligner; import org.broadinstitute.hellbender.utils.bwa.BwaMemAlignment; import org.broadinstitute.hellbender.utils.bwa.BwaMemIndex; -import org.broadinstitute.hellbender.utils.reference.FastaReferenceWriter; import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.function.BiFunction; import java.util.function.Function; import java.util.function.Predicate; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/fasta/FastaReferenceMaker.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/fasta/FastaReferenceMaker.java index 2ce549a3791..82d1c22425b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/fasta/FastaReferenceMaker.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/fasta/FastaReferenceMaker.java @@ -1,6 +1,8 @@ package org.broadinstitute.hellbender.tools.walkers.fasta; import com.google.common.primitives.Bytes; +import htsjdk.samtools.reference.FastaReferenceWriter; +import htsjdk.samtools.reference.FastaReferenceWriterBuilder; import it.unimi.dsi.fastutil.bytes.ByteArrayList; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; @@ -13,7 +15,6 @@ import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.io.IOUtils; -import org.broadinstitute.hellbender.utils.reference.FastaReferenceWriter; import picard.cmdline.programgroups.ReferenceProgramGroup; import java.io.IOException; @@ -82,7 +83,10 @@ public class FastaReferenceMaker extends ReferenceWalker { public void onTraversalStart() { final Path path = IOUtils.getPath(output); try { - writer = new FastaReferenceWriter(path, basesPerLine, true, true); + writer = new FastaReferenceWriterBuilder() + .setFastaFile(path) + .setBasesPerLine(basesPerLine) + .build(); } catch (IOException e) { throw new UserException.CouldNotCreateOutputFile("Couldn't create " + output + ", encountered exception: " + e.getMessage(), e); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingEngine.java index 05c5648b4a3..0eccc4fb789 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingEngine.java @@ -384,8 +384,7 @@ private OutputAlleleSubset calculateOutputAlleleSubset(final AFCalculationResult } else { // we want to keep the NON_REF symbolic allele but only in the absence of a non-symbolic allele, e.g. // if we combined a ref / NON_REF gVCF with a ref / alt gVCF - final boolean isNonRefWhichIsLoneAltAllele = alternativeAlleleCount == 1 && allele.equals( - Allele.NON_REF_ALLELE); + final boolean isNonRefWhichIsLoneAltAllele = alternativeAlleleCount == 1 && allele.equals(Allele.NON_REF_ALLELE); final boolean isPlausible = afCalculationResult.isPolymorphicPhredScaledQual(allele, configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING); siteIsMonomorphic &= !isPlausible; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/io/DeleteRecursivelyOnExitPathHook.java b/src/main/java/org/broadinstitute/hellbender/utils/io/DeleteRecursivelyOnExitPathHook.java index d1de3e5efa4..9a1080eebd1 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/io/DeleteRecursivelyOnExitPathHook.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/io/DeleteRecursivelyOnExitPathHook.java @@ -54,7 +54,7 @@ static void runHooks() { for (Path path : toBeDeleted) { try { IOUtils.deleteRecursively(path); - } catch (IOException | SecurityException e) { + } catch (SecurityException e) { // do nothing if cannot be deleted, because it is a shutdown hook } } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/io/IOUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/io/IOUtils.java index e0be1bc7e71..c0d2b51606e 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/io/IOUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/io/IOUtils.java @@ -5,6 +5,7 @@ import htsjdk.samtools.BamFileIoUtils; import htsjdk.samtools.cram.build.CramIO; import htsjdk.samtools.util.BlockCompressedInputStream; +import htsjdk.samtools.util.IOUtil; import htsjdk.tribble.Tribble; import htsjdk.tribble.util.TabixUtils; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; @@ -702,7 +703,7 @@ public static Path createTempPath(String name, String extension) { final String filename = path.getFileName().toString(); IOUtils.deleteOnExit(path.resolveSibling(filename + Tribble.STANDARD_INDEX_EXTENSION)); IOUtils.deleteOnExit(path.resolveSibling(filename + TabixUtils.STANDARD_INDEX_EXTENSION)); - IOUtils.deleteOnExit(path.resolveSibling(filename + BAMIndex.BAMIndexSuffix)); + IOUtils.deleteOnExit(path.resolveSibling(filename + BAMIndex.BAI_INDEX_SUFFIX)); IOUtils.deleteOnExit(path.resolveSibling(filename.replaceAll(extension + "$", ".bai"))); IOUtils.deleteOnExit(path.resolveSibling(filename + ".md5")); @@ -1021,14 +1022,8 @@ public static void deleteOnExit(final Path fileToDelete){ * Delete rootPath recursively * @param rootPath is the file/directory to be deleted */ - public static void deleteRecursively(final Path rootPath) throws IOException { - final List pathsToDelete = Files.walk(rootPath) - .sorted(Comparator.reverseOrder()) - .collect(Collectors.toList()); - - for (Path path : pathsToDelete) { - Files.deleteIfExists(path); - } + public static void deleteRecursively(final Path rootPath) { + IOUtil.recursiveDelete(rootPath); } /** diff --git a/src/main/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriter.java b/src/main/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriter.java deleted file mode 100644 index 146039c2d22..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriter.java +++ /dev/null @@ -1,743 +0,0 @@ -package org.broadinstitute.hellbender.utils.reference; - -import com.google.common.io.CountingOutputStream; -import htsjdk.samtools.SAMSequenceDictionaryCodec; -import htsjdk.samtools.SAMSequenceRecord; -import htsjdk.samtools.reference.ReferenceSequenceFileFactory; -import org.apache.commons.io.output.NullWriter; -import org.broadinstitute.hellbender.utils.Nucleotide; -import org.broadinstitute.hellbender.utils.Utils; -import org.broadinstitute.hellbender.utils.param.ParamUtils; -import org.codehaus.plexus.util.StringUtils; - -import java.io.*; -import java.nio.charset.Charset; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.LinkedHashMap; -import java.util.Map; - -/** - * Writes a FASTA formatted reference file. - *

- * In addition it can also compose the index and dictionary files for the newly written reference file. - *

- *

- * Example: - * - * String[] seqNames = ...; - * byte[][] seqBases = ...; - * ... - * try (final FastaReferenceWriter writer = new FastaReferenceFileWriter(outputFile)) { - * for (int i = 0; i < seqNames.length; i++) { - * writer.startSequence(seqNames[i]).appendBases(seqBases[i]); - * } - * } - * - *

- *

- * The two main operations that one can invoke on a opened writer is {@link #startSequence} and {@link #appendBases}. - * The former indicates that we are going to append a new sequence to the output and is invoked once per sequence. - * The latter adds bases to the current sequence and can be called as many times as is needed. - *

- *

- * The writer will make sure that the output adheres to the FASTA reference sequence file format restrictions: - *

- *

- */ -public final class FastaReferenceWriter implements AutoCloseable { - - /** - * Default number of bases per line. - */ - public static final int DEFAULT_BASES_PER_LINE = 60; - - /** - * Sequence header start character. - */ - public static final char HEADER_START_CHAR = '>'; - - /** - * Character used to separate the sequence name and the description if any. - */ - public static final char HEADER_NAME_AND_DESCRIPTION_SEPARATOR = ' '; - - /** - * Charset used for all outputs; fixed to UTF-8. - */ - private static final Charset CHARSET = Charset.forName("UTF-8"); - - /** - * The line separator string. - */ - private static final char LINE_SEPARATOR_CHR = '\n'; - - /** - * Character used to separate the fields in a index file line. - */ - private static final char INDEX_FIELD_SEPARATOR_CHR = '\t'; - - /** - * Convenient cached {@code byte[]} representation of the line separator. - */ - private static final byte[] LINE_SEPARATOR = String.valueOf(LINE_SEPARATOR_CHR).getBytes(CHARSET); - - /** - * Output stream to the main FASTA output. - *

- * We use it also to count the number of bytes so far outputted thus the offset included in - * the index file entry. - *

- */ - private final CountingOutputStream fastaStream; - - /** - * Writer for the index file. - */ - private final Writer indexWriter; - - /** - * Output writer to the output dictionary. - */ - private final Writer dictWriter; - - /** - * Output codec for the dictionary. - */ - private final SAMSequenceDictionaryCodec dictCodec; - - /** - * Default number of bases per line to be applied unless one is - */ - private final int defaultBasePerLine; - - /** - * Records the sequences that have been already fully appended to this writer. - *

- * The key is the sequence name. - *

- *

- * The value is the sequence length in bases. - *

- */ - private final Map sequenceNamesAndSizes; - - /** - * Bases per line to be applied to the sequence that is been currently appended to the output. - */ - private int currentBasesPerLine; - - /** - * Holds the number of bases in the current output line. - */ - private int currentLineBasesCount; - - /** - * Holds the number of bases so far appended for the current sequence. - */ - private long currentBasesCount; - - /** - * Holds the FASTA output file offset for the current sequence. - */ - private long currentSequenceOffset; - - /** - * Holds the name of the sequence that is been appended currently. - */ - private String currentSequenceName; - - /** - * Flag indicating whether this writer has been already closed. - */ - private boolean closed; - - /** - * Creates a reference FASTA file writer. - *

- * The default bases-per-line is set to {@link #DEFAULT_BASES_PER_LINE}. - *

- *

- * Names for the fasta index and dictionary are constructed from the FASTA output file using common practices - * as resolved by {@link ReferenceSequenceFileFactory#getFastaIndexFileName(Path)} - * and {@link ReferenceSequenceFileFactory#getDefaultDictionaryForReferenceSequence(Path)} - * respectively. - *

- * - * @param fastaFile the output fasta file path. - * @param makeFaiOutput whether an index must be generated. - * @param makeDictOutput whether a dictionary must be generated. - * @throws IllegalArgumentException if {@code fastaFile} is {@code null}. - * @throws IOException if such exception is thrown when accessing the output path resources. - */ - public FastaReferenceWriter(final Path fastaFile, final boolean makeFaiOutput, final boolean makeDictOutput) - throws IOException - { - this(fastaFile, DEFAULT_BASES_PER_LINE, makeFaiOutput, makeDictOutput); - } - - /** - * Creates a reference FASTA file writer. - *

- * Names for the fasta index and dictionary are constructed from the FASTA output file using common practices - * as resolved by {@link ReferenceSequenceFileFactory#getFastaIndexFileName(Path)} - * and {@link ReferenceSequenceFileFactory#getDefaultDictionaryForReferenceSequence(Path)} - * respectively. - *

- * - * @param fastaFile the output fasta file path. - * @param basesPerLine default bases per line. - * @param makeFaiOutput whether an index must be generated. - * @param makeDictOutput whether a dictionary must be generated. - * @throws IllegalArgumentException if {@code fastaFile} is {@code null} or {@code basesPerLine} is 0 or negative. - * @throws IOException if such exception is thrown when accessing the output path resources. - */ - public FastaReferenceWriter(final Path fastaFile, final int basesPerLine, final boolean makeFaiOutput, - final boolean makeDictOutput) - throws IOException - { - this(Utils.nonNull(fastaFile, "the output fasta-file cannot be null"), - basesPerLine, - defaultFaiFile(makeFaiOutput, fastaFile), - defaultDictFile(makeDictOutput, fastaFile)); - } - - /** - * Creates a reference FASTA file writer. - *

- * The default bases-per-line is set to {@link #DEFAULT_BASES_PER_LINE}. - *

- *

- * You can specify a specific path for the index and dictionary file. If either set to {@code null} such - * a file won't be generated. - *

- * - * @param fastaFile the output fasta file path. - * @param indexFile the path of the index file, if requested, {@code null} if none should be generated. - * @param dictFile the path of the dictFile, if requested, {@code null} if nono should be generated. - * @throws IllegalArgumentException if {@code fastaFile} is {@code null}. - * @throws IOException if such exception is thrown when accessing the output path resources. - */ - public FastaReferenceWriter(final Path fastaFile, final Path indexFile, final Path dictFile) - throws IOException - { - this(fastaFile, DEFAULT_BASES_PER_LINE, indexFile, dictFile); - } - - /** - * Creates a reference FASTA file writer. - *

- * You can specify a specific path for the index and dictionary file. If either set to {@code null} such - * a file won't be generated. - *

- * - * @param fastaFile the output fasta file path. - * @param indexFile the path of the index file, if requested, {@code null} if none should be generated. - * @param dictFile the path of the dictFile, if requested, {@code null} if nono should be generated. - * @throws IllegalArgumentException if {@code fastaFile} is {@code null} or {@code basesPerLine} is 0 or negative. - * @throws IOException if such exception is thrown when accessing the output path resources. - */ - public FastaReferenceWriter(final Path fastaFile, final int basesPerLine, final Path indexFile, final Path dictFile) - throws IOException - { - // This code is a slight repeat of {@link #FastaReferenceWriter(OutputStream,int,OutputStream,OutputStream) - // for the sake of avoiding creating output if basesPerLine is invalid. - this.defaultBasePerLine = checkBasesPerLine(basesPerLine); - - this.fastaStream = new CountingOutputStream(new BufferedOutputStream(Files.newOutputStream(Utils.nonNull(fastaFile)))); - this.indexWriter = indexFile == null ? new NullWriter() : new OutputStreamWriter(Files.newOutputStream(indexFile), CHARSET); - final BufferedWriter dictWriter = new BufferedWriter(dictFile == null ? new NullWriter() : new OutputStreamWriter(Files.newOutputStream(dictFile), CHARSET)); - this.dictWriter = dictWriter; - this.dictCodec = new SAMSequenceDictionaryCodec(dictWriter); - this.dictCodec.encodeHeaderLine(false); - this.sequenceNamesAndSizes = new LinkedHashMap<>(); - } - - /** - * Creates a reference FASTA file writer. - *

- * You can specify a specific output stream to each file: the main fasta output, its index and its dictionary. - *

- * - * @param fastaOutput the output fasta file path. - * @param indexOutput the output stream to the index file, if requested, {@code null} if none should be generated. - * @param dictOutput the output stream to the dictFile, if requested, {@code null} if none should be generated. - * @throws IllegalArgumentException if {@code fastaFile} is {@code null} or {@code basesPerLine} is 0 or negative. - */ - public FastaReferenceWriter(final OutputStream fastaOutput, - final int basesPerLine, - final OutputStream indexOutput, - final OutputStream dictOutput) { - this.defaultBasePerLine = checkBasesPerLine(basesPerLine); - this.fastaStream = new CountingOutputStream(Utils.nonNull(fastaOutput)); - this.indexWriter = indexOutput == null ? new NullWriter() : new OutputStreamWriter(indexOutput, CHARSET); - final BufferedWriter dictWriter = new BufferedWriter(dictOutput == null ? new NullWriter() : new OutputStreamWriter(dictOutput, CHARSET)); - this.dictWriter = dictWriter; - this.dictCodec = new SAMSequenceDictionaryCodec(dictWriter); - this.dictCodec.encodeHeaderLine(false); - this.sequenceNamesAndSizes = new LinkedHashMap<>(); - } - - private static Path defaultFaiFile(final boolean makeFaiFile, final Path fastaFile) { - return makeFaiFile ? ReferenceSequenceFileFactory.getFastaIndexFileName(fastaFile) : null; - } - - private static Path defaultDictFile(final boolean makeDictFile, final Path fastaFile) { - return makeDictFile ? ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(fastaFile) : null; - } - - // checks that a sequence name is valid. - private static void checkSequenceName(final String name) { - Utils.nonNull(name, "the sequence name cannot be null"); - Utils.validateArg(!name.isEmpty(), "the input sequence name cannot be null"); - for (int i = 0; i < name.length(); i++) { - final char ch = name.charAt(i); - if (Character.isWhitespace(ch)) { - throw new IllegalArgumentException("the input name contains blank characters: '" + StringUtils.escape(name) + "'"); - } else if (Character.isISOControl(ch)) { - throw new IllegalArgumentException("the input name contains control characters: '" + StringUtils.escape(name) + "'"); - } - } - } - - private static void checkSequenceBases(final byte[] bases, final int offset, final int length) { - Utils.nonNull(bases, "the input bases array cannot be null"); - final int to = offset + length; - for (int i = offset; i < to; i++) { - final byte b = bases[i]; - if (!Nucleotide.decode(b).isValid()) { - throw new IllegalArgumentException( "the input sequence contains invalid base calls like: " - + StringUtils.escape(""+ (char) b)); - } - } - } - - private static String checkDescription(final String description) { - if (description == null || description.isEmpty()) { - return ""; - } else { - for (int i = 0; i < description.length(); i++) { - final char c = description.charAt(i); - if (Character.isISOControl(c) && c != '\t') { // tab is the only valid control char in the description. - throw new IllegalArgumentException("the input name contains non-tab control characters: '" - + StringUtils.escape(description) + "'"); - } - } - return description; - } - } - - private static int checkBasesPerLine(final int value) { - return ParamUtils.isPositive(value, "base per line must be 1 or greater"); - } - - /** - * Starts the input of the bases of a new sequence. - *

- * This operation automatically closes the previous sequence base input if any. - *

- *

- * The sequence name cannot contain any blank characters (as determined by {@link Character#isWhitespace(char)}), - * control characters (as determined by {@link Character#isISOControl(char)}) or the the FASTA header star character - * {@value #HEADER_START_CHAR}. It cannot be the empty string either (""). - *

- *

- * No description is included in the output. - *

- *

- * The input bases-per-line is set to the default provided at construction or {@link #DEFAULT_BASES_PER_LINE} - * if none was provided. - *

- *

- * This method cannot be called after the writer has been closed. - *

- *

- * It also will fail if no base was added to the previous sequence if any. - *

- * @param sequenceName the name of the new sequence. - * @return this instance. - * @throws IllegalArgumentException if any argument does not comply with requirements listed above or if a sequence - * with the same name has already been added to the writer. - * @throws IllegalStateException if no base was added to the previous sequence or the writer is already closed. - * @throws IOException if such exception is thrown when writing into the output resources. - */ - public FastaReferenceWriter startSequence(final String sequenceName) - throws IOException - { - return startSequence(sequenceName, "", defaultBasePerLine); - } - - /** - * Starts the input of the bases of a new sequence. - *

- * This operation automatically closes the previous sequence base input if any. - *

- *

- * The sequence name cannot contain any blank characters (as determined by {@link Character#isWhitespace(char)}), - * control characters (as determined by {@link Character#isISOControl(char)}) or the the FASTA header star character - * {@value #HEADER_START_CHAR}. It cannot be the empty string either (""). - *

- *

- * The input bases-per-line must be 1 or greater. - *

- *

- * This method cannot be called after the writer has been closed. - *

- *

- * It also will fail if no base was added to the previous sequence if any. - *

- * @param sequenceName the name of the new sequence. - * @param basesPerLine number of bases per line for this sequence. - * @return this instance. - * @throws IllegalArgumentException if any argument does not comply with requirements listed above or if a sequence - * with the same name has already been added to the writer. - * @throws IllegalStateException if no base was added to the previous sequence or the writer is already closed. - * @throws IOException if such exception is thrown when writing into the output resources. - */ - public FastaReferenceWriter startSequence(final String sequenceName, final int basesPerLine) - throws IOException - { - return startSequence(sequenceName, "", checkBasesPerLine(basesPerLine)); - } - - /** - * Starts the input of the bases of a new sequence. - *

- * This operation automatically closes the previous sequence base input if any. - *

- *

- * The sequence name cannot contain any blank characters (as determined by {@link Character#isWhitespace(char)}), - * control characters (as determined by {@link Character#isISOControl(char)}) or the the FASTA header star character - * {@value #HEADER_START_CHAR}. It cannot be the empty string either (""). - *

- *

- * The description cannot contain {@link Character#isISOControl(char)}. If set to {@code null} or the empty - * string ("") no description will be outputted. - *

- *

- * The input bases-per-line is set to the default provided at construction or {@link #DEFAULT_BASES_PER_LINE} - * if none was provided. - *

- *

- * This method cannot be called after the writer has been closed. - *

- *

- * It also will fail if no base was added to the previous sequence if any. - *

- * @param sequenceName the name of the new sequence. - * @param description optional description for that sequence. - * @return this instance. - * @throws IllegalArgumentException if any argument does not comply with requirements listed above or if a sequence - * with the same name has already been added to the writer. - * @throws IllegalStateException if no base was added to the previous sequence or the writer is already closed. - * @throws IOException if such exception is thrown when writing into the output resources. - */ - public FastaReferenceWriter startSequence(final String sequenceName, final String description) - throws IOException - { - return startSequence(sequenceName, description, defaultBasePerLine); - } - - /** - * Starts the input of the bases of a new sequence. - *

- * This operation automatically closes the previous sequence base input if any. - *

- *

- * The sequence name cannot contain any blank characters (as determined by {@link Character#isWhitespace(char)}), - * control characters (as determined by {@link Character#isISOControl(char)}) or the the FASTA header star character - * {@value #HEADER_START_CHAR}. It cannot be the empty string either (""). - *

- *

- * The description cannot contain {@link Character#isISOControl(char)}. If set to {@code null} or the empty - * string ("") no description will be outputted. - *

- *

- * The input bases-per-line must be 1 or greater. - *

- *

- * This method cannot be called after the writer has been closed. - *

- *

- * It also will fail if no base was added to the previous sequence if any. - *

- * @param sequenceName the name of the new sequence. - * @param description optional description for that sequence. - * @param basesPerLine number of bases per line for this sequence. - * @return this instance. - * @throws IllegalArgumentException if any argument does not comply with requirements listed above. - * @throws IllegalStateException if no base was added to the previous sequence or the writer is already closed of - * the sequence has been already added. - * @throws IOException if such exception is thrown when writing into the output resources. - */ - public FastaReferenceWriter startSequence(final String sequenceName, final String description, final int basesPerLine) - throws IOException - { - assertIsNotClosed(); - checkSequenceName(sequenceName); - final String nonNullDescription = checkDescription(description); - checkBasesPerLine(basesPerLine); - closeSequence(); - if (sequenceNamesAndSizes.containsKey(sequenceName)) { - throw new IllegalStateException("the input sequence name '" + sequenceName + "' has already been added"); - } - currentSequenceName = sequenceName; - currentBasesPerLine = basesPerLine; - final StringBuilder builder = new StringBuilder(sequenceName.length() + nonNullDescription.length() + 10); - builder.append(HEADER_START_CHAR).append(sequenceName); - if (!nonNullDescription.isEmpty()) { - builder.append(HEADER_NAME_AND_DESCRIPTION_SEPARATOR).append(nonNullDescription); - } - fastaStream.write(builder.toString().getBytes(CHARSET)); - fastaStream.write(LINE_SEPARATOR); - currentSequenceOffset = fastaStream.getCount(); - return this; - } - - private void closeSequence() - throws IOException - { - if (currentSequenceName != null) { - if (currentBasesCount == 0) { - throw new IllegalStateException("startSequence was called but no base was added"); - } - sequenceNamesAndSizes.put(currentSequenceName, currentBasesCount); - writeIndexEntry(); - writeDictEntry(); - fastaStream.write(LINE_SEPARATOR); - currentBasesCount = 0; - currentLineBasesCount = 0; - currentSequenceName = null; - } - } - - private void writeIndexEntry() - throws IOException - { - indexWriter.append(currentSequenceName).append(INDEX_FIELD_SEPARATOR_CHR) - .append(String.valueOf(currentBasesCount)).append(INDEX_FIELD_SEPARATOR_CHR) - .append(String.valueOf(currentSequenceOffset)).append(INDEX_FIELD_SEPARATOR_CHR) - .append(String.valueOf(currentBasesPerLine)).append(INDEX_FIELD_SEPARATOR_CHR) - .append(String.valueOf(currentBasesPerLine + LINE_SEPARATOR.length)).append(LINE_SEPARATOR_CHR); - } - - private void writeDictEntry() { - dictCodec.encodeSequenceRecord(new SAMSequenceRecord(currentSequenceName, (int) currentBasesCount)); - } - - /** - * Adds bases to current sequence from a {@code byte} array. - * - * @param bases array containing the bases to be added. - * @return this instance. - * @throws IllegalArgumentException if {@bases} is {@code null} or - * the input array contains invalid bases (as assessed by: {@link Nucleotide#decode(byte)}). - * @throws IllegalStateException if no sequence was started or the writer is already closed. - * @throws IOException if such exception is throw when writing in any of the outputs. - */ - public FastaReferenceWriter appendBases(final byte[] bases) - throws IOException - { - return appendBases(bases, 0, bases.length); - } - - /** - * Adds bases to current sequence from a range in a {@code byte} array. - * - * @param bases array containing the bases to be added. - * @param offset the position of the first base to add. - * @param length how many bases to be added starting from position {@code offset}. - * @return this instance. - * @throws IllegalArgumentException if {@bases} is {@code null} or - * {@code offset} and {@code length} do not entail a valid range in {@code bases} or - * that range in {@base} contain invalid bases (as assessed by: {@link Nucleotide#decode(byte)}). - * @throws IllegalStateException if no sequence was started or the writer is already closed. - * @throws IOException if such exception is throw when writing in any of the outputs. - */ - public FastaReferenceWriter appendBases(final byte[] bases, final int offset, final int length) - throws IOException - { - assertIsNotClosed(); - assertSequenceOpen(); - checkSequenceBases(bases, offset, length); - ParamUtils.isPositiveOrZero(offset, "the input offset cannot be negative"); - ParamUtils.isPositiveOrZero(length, "the input length must not be negative"); - final int to = offset + length; - Utils.validateArg(to <= bases.length, "the length + offset goes beyond the end of " + - "the input base array: '" + to + "' > '" + bases.length + "'"); - - int next = offset; - while (next < to) { - if (currentLineBasesCount == currentBasesPerLine) { - fastaStream.write(LINE_SEPARATOR); - currentLineBasesCount = 0; - } - final int nextLength = Math.min(to - next, currentBasesPerLine - currentLineBasesCount); - fastaStream.write(bases, next, nextLength); - currentLineBasesCount += nextLength; - next += nextLength; - } - currentBasesCount += length; - return this; - } - - /** - * Appends a new sequence to the output. - *

- * This is a convenient short handle for {@code startSequence(name).appendBases(bases)}. - *

- *

- * The new sequence remains open meaning that additional bases for that sequence can be added with additional calls to {@link #appendBases}. - *

- * @param name the name of the new sequence. - * @param bases the (first) bases of the sequence. - * @return a reference to this very same writer. - * @throws IOException if such an exception is thrown when actually writing into the output streams/channels. - * @throws IllegalArgumentException if either {@code name} or {@code bases} is {@code null} or contains an invalid value (e.g. unsupported bases or sequence names). - * @throws IllegalStateException if the writer is already closed, a previous sequence (if any was opened) has no base appended to it or a sequence - * with such name was already appended to this writer. - */ - public FastaReferenceWriter appendSequence(final String name, final byte[] bases) throws IOException { - return startSequence(name).appendBases(bases); - } - - /** - * Appends a new sequence to the output with or without a description. - *

- * This is a convenient short handle for {@code startSequence(name, description).appendBases(bases)}. - *

- *

- * A {@code null} or empty ("") description will be ignored (no description will be output). - *

- *

- * The new sequence remains open meaning that additional bases for that sequence can be added with additional calls to {@link #appendBases}. - *

- * @param name the name of the new sequence. - * @param bases the (first) bases of the sequence. - * @param description the description for the new sequence. - * @return a reference to this very same writer. - * @throws IOException if such an exception is thrown when actually writing into the output streams/channels. - * @throws IllegalArgumentException if either {@code name} or {@code bases} is {@code null} or contains an invalid value (e.g. unsupported bases or sequence names). Also when - * the {@code description} contains unsupported characters. - * @throws IllegalStateException if the writer is already closed, a previous sequence (if any was opened) has no base appended to it or a sequence - * with such name was already appended to this writer. - */ - public FastaReferenceWriter appendSequence(final String name, final String description, final byte[] bases) throws IOException { - return startSequence(name, description).appendBases(bases); - } - - /** - * Appends a new sequence to the output with or without a description and an alternative number of bases-per-line. - *

- * This is a convenient short handle for {@code startSequence(name, description, bpl).appendBases(bases)}. - *

- *

- * A {@code null} or empty ("") description will be ignored (no description will be output). - *

- *

- * The new sequence remains open meaning that additional bases for that sequence can be added with additional calls to {@link #appendBases}. - *

- * @param name the name of the new sequence. - * @param bases the (first) bases of the sequence. - * @param description the description for the sequence. - * @param basesPerLine alternative number of bases per line to be used for the sequence. - * @return a reference to this very same writer. - * @throws IOException if such an exception is thrown when actually writing into the output streams/channels. - * @throws IllegalArgumentException if either {@code name} or {@code bases} is {@code null} or contains an invalid value (e.g. unsupported bases or sequence names). Also when the - * {@code description} contains unsupported characters or {@code basesPerLine} is 0 or negative. - * @throws IllegalStateException if the writer is already closed, a previous sequence (if any was opened) has no base appended to it or a sequence - * with such name was already appended to this writer. - */ - public FastaReferenceWriter appendSequence(final String name, final String description, final int basesPerLine, final byte[] bases) throws IOException { - return startSequence(name, description, basesPerLine).appendBases(bases); - } - - private void assertSequenceOpen() { - if (currentSequenceName == null) { - throw new IllegalStateException("trying to add bases without starting a sequence"); - } - } - - private void assertIsNotClosed() { - if (closed) { - throw new IllegalStateException("already closed"); - } - } - - /** - * Closes this writer flushing all remaining writing operation input the output resources. - *

- * Further calls to {@link #appendBases} or {@link #startSequence} will result in an exception. - *

- * - * @throws IOException if such exception is thrown when closing output writers and output streams. - * @throws IllegalStateException if closing without writing any sequences or closing when writing a sequence is in progress - */ - public void close() throws IOException - { - if (!closed) { - try { - closeSequence(); - if (sequenceNamesAndSizes.isEmpty()) { - throw new IllegalStateException("no sequences where added to the reference"); - } - } finally { - closed = true; - fastaStream.close(); - indexWriter.close(); - dictWriter.close(); - } - } - } - - /** - * Convenient method to write a FASTA file with a single sequence. - * - * @param whereTo the path to. must not be null. - * @param makeIndex whether the index file should be written at its standard location. - * @param makeDict whether the dictionary file should be written at it standard location. - * @param name the sequence name, cannot contain white space, or control chracter or the header start character. - * @param description the sequence description, "" if no description. - * @param bases the sequence bases, cannot be {@code null}. - * @throws IOException if such exception is thrown when writing in the output resources. - */ - public static void writeSingleSequenceReference(final Path whereTo, final boolean makeIndex, - final boolean makeDict, final String name, - final String description, final byte[] bases) - throws IOException - { - try (final FastaReferenceWriter writer = new FastaReferenceWriter(whereTo, makeIndex, makeDict)) { - writer.startSequence(name, description); - writer.appendBases(bases); - } - } - - /** - * Convenient method to write a FASTA file with a single sequence. - * - * @param whereTo the path to. must not be null. - * @param basesPerLine number of bases per line. must be 1 or greater. - * @param makeIndex whether the index file should be written at its standard location. - * @param makeDict whether the dictionary file should be written at it standard location. - * @param name the sequence name, cannot contain white space, or control chracter or the header start character. - * @param description the sequence description, "" if no description. - * @param bases the sequence bases, cannot be {@code null}. - * @throws IOException if such exception is thrown when writing in the output resources. - */ - public static void writeSingleSequenceReference(final Path whereTo, final int basesPerLine, final boolean makeIndex, - final boolean makeDict, final String name, - final String description, final byte[] bases) - throws IOException - { - try (final FastaReferenceWriter writer = new FastaReferenceWriter(whereTo, basesPerLine, makeIndex, makeDict)) { - writer.startSequence(name, description); - writer.appendBases(bases); - } - } -} diff --git a/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java b/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java index 4dd8958a300..53981307ed3 100644 --- a/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java @@ -178,7 +178,7 @@ private void assertSingleShardedWritingWorks(String inputBam, String referenceFi // check that a bai file is created if (IOUtils.isBamFileName(outputPath) && writeBai) { - Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + BAMIndex.BAMIndexSuffix))); + Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + BAMIndex.BAI_INDEX_SUFFIX))); } // check that a splitting bai file is created if (IOUtils.isBamFileName(outputPath) && writeSbi) { diff --git a/src/test/java/org/broadinstitute/hellbender/testutils/testers/MarkDuplicatesSparkTester.java b/src/test/java/org/broadinstitute/hellbender/testutils/testers/MarkDuplicatesSparkTester.java index 3cc4848274a..36aad372f21 100644 --- a/src/test/java/org/broadinstitute/hellbender/testutils/testers/MarkDuplicatesSparkTester.java +++ b/src/test/java/org/broadinstitute/hellbender/testutils/testers/MarkDuplicatesSparkTester.java @@ -3,10 +3,7 @@ import htsjdk.samtools.*; import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy; import htsjdk.samtools.metrics.MetricsFile; -import htsjdk.samtools.util.CloseableIterator; -import htsjdk.samtools.util.CloserUtil; -import htsjdk.samtools.util.FormatUtil; -import htsjdk.samtools.util.TestUtil; +import htsjdk.samtools.util.*; import org.broadinstitute.hellbender.cmdline.CommandLineProgram; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection; @@ -158,7 +155,7 @@ public void test() { Assert.assertEquals(observedMetrics.ESTIMATED_LIBRARY_SIZE, expectedMetrics.ESTIMATED_LIBRARY_SIZE, "ESTIMATED_LIBRARY_SIZE does not match expected"); Assert.assertEquals(observedMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS, expectedMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS, "SECONDARY_OR_SUPPLEMENTARY_RDS does not match expected"); } finally { - TestUtil.recursiveDelete(getOutputDir()); + IOUtil.recursiveDelete(getOutputDir().toPath()); } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/IndexFeatureFileIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/IndexFeatureFileIntegrationTest.java index 09c6132250d..25a6f11fd74 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/IndexFeatureFileIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/IndexFeatureFileIntegrationTest.java @@ -209,7 +209,8 @@ public void testBCFIndex() { checkIndex(index, Arrays.asList("1")); } - @Test(expectedExceptions = UserException.CouldNotIndexFile.class) + // test disabled until https://github.com/samtools/htsjdk/issues/1323 is resolved + @Test(enabled = false) public void testUncompressedBCF2_2Index() { final File ORIG_FILE = getTestFile("test_variants_for_index.BCF22uncompressed.bcf"); final File outName = createTempFile("test_variants_for_index.BCF22uncompressed.bcf", ".idx"); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java index 71bfff76927..eb00a6d8838 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java @@ -114,7 +114,7 @@ public ArgumentsBuilder getInputAndOutputArgs(final File inputFile, final File s @Test public void testCreateWithBaiCreatesBai(){ final File splittingIndex = getTempIndexFile(); - final File baiIndex = IOUtils.replaceExtension(splittingIndex, BAMIndex.BAMIndexSuffix); + final File baiIndex = IOUtils.replaceExtension(splittingIndex, BAMIndex.BAI_INDEX_SUFFIX); Assert.assertFalse(baiIndex.exists()); final ArgumentsBuilder args = getInputAndOutputArgs(SORTED_BAM, splittingIndex) .add("--" + CreateHadoopBamSplittingIndex.CREATE_BAI_LONG_NAME); diff --git a/src/test/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriterUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriterUnitTest.java deleted file mode 100644 index ffe7f7c30cc..00000000000 --- a/src/test/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriterUnitTest.java +++ /dev/null @@ -1,518 +0,0 @@ -package org.broadinstitute.hellbender.utils.reference; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; -import htsjdk.samtools.reference.FastaSequenceIndex; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.reference.ReferenceSequence; -import htsjdk.samtools.reference.ReferenceSequenceFileFactory; -import htsjdk.samtools.util.SequenceUtil; -import org.broadinstitute.hellbender.GATKBaseTest; -import org.broadinstitute.hellbender.engine.ReadsDataSource; -import org.broadinstitute.hellbender.utils.RandomDNA; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.*; -import java.net.URISyntaxException; -import java.nio.file.Path; -import java.security.GeneralSecurityException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import java.util.stream.Stream; - -/** - * Unit tests for {@link FastaReferenceWriter}. - */ -public class FastaReferenceWriterUnitTest extends GATKBaseTest { - - public static void assertOutput(final Path path, final boolean mustHaveIndex, final boolean mustHaveDictionary, - final boolean withDescriptions, final SAMSequenceDictionary dictionary, final int defaultBpl, - final Map bases, final Map basesPerLine) - throws IOException { - assertFastaContent(path, withDescriptions, dictionary, defaultBpl, bases, basesPerLine); - if (mustHaveDictionary) { - assertFastaDictionaryContent(ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(path), dictionary); - } - if (mustHaveIndex) { - assertFastaIndexContent(path, ReferenceSequenceFileFactory.getFastaIndexFileName(path), dictionary, bases); - } - } - - public static void assertFastaContent(final Path path, final boolean withDescriptions, final SAMSequenceDictionary dictionary, final int defaultBpl, - final Map bases, final Map basesPerLine) - throws IOException { - try (final BufferedReader reader = new BufferedReader(new InputStreamReader(path.getFileSystem().provider().newInputStream(path)))) { - for (final SAMSequenceRecord sequence : dictionary.getSequences()) { - final String description = String.format("index=%d\tlength=%d", - dictionary.getSequenceIndex(sequence.getSequenceName()), sequence.getSequenceLength()); - final String expectedHeader = FastaReferenceWriter.HEADER_START_CHAR + sequence.getSequenceName() - + ((withDescriptions) ? FastaReferenceWriter.HEADER_NAME_AND_DESCRIPTION_SEPARATOR + description : ""); - Assert.assertTrue(reader.readLine().startsWith(expectedHeader)); - final byte[] expectedBases = bases.get(sequence.getSequenceName()); - final int bpl_ = basesPerLine.get(sequence.getSequenceName()); - final int bpl = bpl_ < 0 ? (defaultBpl < 0 ? FastaReferenceWriter.DEFAULT_BASES_PER_LINE : defaultBpl) : bpl_; - int offset = 0; - while (offset < expectedBases.length) { - final int expectedLength = Math.min(expectedBases.length - offset, bpl); - final byte[] expectedBaseLine = SequenceUtil.upperCase( - Arrays.copyOfRange(expectedBases, offset, offset + expectedLength)); - final byte[] actualBaseLine = SequenceUtil.upperCase(reader.readLine().getBytes()); - Assert.assertEquals(actualBaseLine, expectedBaseLine); - offset += expectedLength; - } - } - } - } - - public static void assertFastaIndexContent(final Path path, final Path indexPath, final SAMSequenceDictionary dictionary, - final Map bases) { - final FastaSequenceIndex index = new FastaSequenceIndex(indexPath); - final IndexedFastaSequenceFile indexedFasta = new IndexedFastaSequenceFile(path, index); - for (final SAMSequenceRecord sequence : dictionary.getSequences()) { - final String name = sequence.getSequenceName(); - final int length = sequence.getSequenceLength(); - final ReferenceSequence start = indexedFasta.getSubsequenceAt(name, 1, Math.min(length, 30)); - final ReferenceSequence end = indexedFasta.getSubsequenceAt(name, Math.max(1, length - 29), length); - final int middlePos = Math.max(1, Math.min(length, length / 2)); - final ReferenceSequence middle = indexedFasta.getSubsequenceAt(name, middlePos, Math.min(middlePos + 29, length)); - Assert.assertEquals(start.getBases(), Arrays.copyOfRange(bases.get(name), 0, start.length())); - Assert.assertEquals(end.getBases(), Arrays.copyOfRange(bases.get(name), Math.max(0, length - 30), length)); - Assert.assertEquals(middle.getBases(), Arrays.copyOfRange(bases.get(name), middlePos - 1, middlePos - 1 + middle.length())); - } - } - - public static void assertFastaDictionaryContent(final Path dictPath, final SAMSequenceDictionary dictionary) { - final ReadsDataSource readsDataSource = new ReadsDataSource(dictPath); - final SAMFileHeader actualHeader = readsDataSource.getHeader(); - final SAMSequenceDictionary actualDictionary = actualHeader.getSequenceDictionary(); - dictionary.assertSameDictionary(actualDictionary); - } - - @Test(expectedExceptions = IllegalStateException.class) - public void testEmptySequence() throws IOException { - final File testOutput = createTempFile("fwr-test", ".fasta"); - Assert.assertTrue(testOutput.delete()); - try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) { - writer.startSequence("seq1"); - writer.appendBases(new RandomDNA(113).nextBases(100)); - writer.startSequence("seq2"); - writer.startSequence("seq3"); - } finally { - Assert.assertTrue(testOutput.delete()); - } - } - - @Test(expectedExceptions = IllegalStateException.class) - public void testEmptyReference() throws IOException { - final File testOutput = createTempFile("fwr-test", ".fasta"); - Assert.assertTrue(testOutput.delete()); - try { - new FastaReferenceWriter(testOutput.toPath(), false, false).close(); - } finally { - Assert.assertTrue(testOutput.delete()); - } - } - - @Test(expectedExceptions = IllegalStateException.class) - public void testStartSequenceAfterClose() throws IOException { - final File testOutput = createTempFile("fwr-test", ".fasta"); - Assert.assertTrue(testOutput.delete()); - final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false); - writer.startSequence("seq1").appendBases(new byte[]{'A', 'C', 'G', 'T'}); - writer.close(); - try { - writer.startSequence("seq2"); - } finally { - Assert.assertTrue(testOutput.delete()); - } - } - - @Test(expectedExceptions = IllegalStateException.class) - public void testAddBasesAfterClose() throws IOException { - final File testOutput = createTempFile("fwr-test", ".fasta"); - Assert.assertTrue(testOutput.delete()); - final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false); - writer.startSequence("seq1").appendBases(new byte[]{'A', 'C', 'G', 'T'}); - writer.close(); - try { - writer.appendBases(new byte[]{'A', 'A', 'A'}); - } finally { - Assert.assertTrue(testOutput.delete()); - } - } - - @Test(dataProvider = "invalidBplData", expectedExceptions = IllegalArgumentException.class) - public void testBadDefaultBasesPerLine(final int invalidBpl) throws IOException { - final File testOutput = createTempFile("fwr-test", ".fasta"); - final File testIndexOutput = ReferenceSequenceFileFactory.getFastaIndexFileName(testOutput.toPath()).toFile(); - final File testDictOutput = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(testOutput); - Assert.assertTrue(testOutput.delete()); - try { - new FastaReferenceWriter(testOutput.toPath(), invalidBpl, true, true); - } finally { - // make sure that no output file was created: - Assert.assertFalse(testOutput.delete()); - Assert.assertFalse(testIndexOutput.delete()); - Assert.assertFalse(testDictOutput.delete()); - } - } - - @Test(dataProvider = "invalidBplData", expectedExceptions = IllegalArgumentException.class) - public void testBadSequenceBasesPerLine(final int invalidBpl) throws IOException { - final File testOutput = createTempFile("fwr-test", ".fasta"); - Assert.assertTrue(testOutput.delete()); - try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) { - writer.startSequence("seq1", invalidBpl); - } finally { - Assert.assertTrue(testOutput.delete()); - } - } - - @Test(expectedExceptions = IllegalStateException.class) - public void testEmptySequenceAtTheEnd() throws IOException { - final File testOutput = createTempFile("fwr-test", ".fasta"); - Assert.assertTrue(testOutput.delete()); - try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) { - writer.startSequence("seq1"); - writer.appendBases(new RandomDNA(113).nextBases(100)); - writer.startSequence("seq2"); - writer.appendBases(new RandomDNA(13).nextBases(1001)); - writer.startSequence("seq3"); - } finally { - Assert.assertTrue(testOutput.delete()); - } - } - - @Test(expectedExceptions = IllegalStateException.class) - public void testAppendBasesBeforeStartingSequence() throws IOException { - final File testOutput = createTempFile("fwr-test", ".fasta"); - Assert.assertTrue(testOutput.delete()); - try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) { - writer.appendBases(new RandomDNA(113).nextBases(100)); - } finally { - Assert.assertTrue(testOutput.delete()); - } - } - - @Test(expectedExceptions = IllegalStateException.class) - public void testAddingSameSequenceTwice() throws IOException { - final File testOutput = createTempFile("fwr-test", ".fasta"); - Assert.assertTrue(testOutput.delete()); - try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) { - writer.startSequence("seq1"); - writer.appendBases(new RandomDNA(113).nextBases(100)); - writer.startSequence("seq2"); - writer.appendBases(new RandomDNA(114).nextBases(300)); - writer.startSequence("seq1"); - } finally { - Assert.assertTrue(testOutput.delete()); - } - } - - @Test(expectedExceptions = IllegalStateException.class) - public void testAddingSameSequenceRightAfter() throws IOException { - final File testOutput = createTempFile("fwr-test", ".fasta"); - Assert.assertTrue(testOutput.delete()); - try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) { - writer.startSequence("seq1"); - writer.appendBases(new RandomDNA(113).nextBases(100)); - writer.startSequence("seq1"); - } finally { - Assert.assertTrue(testOutput.delete()); - } - } - - @Test(expectedExceptions = IllegalArgumentException.class, dataProvider = "invalidNameData") - public void testAddingInvalidSequenceName(final String invalidName) throws IOException { - final File testOutput = createTempFile("fwr-test", ".fasta"); - Assert.assertTrue(testOutput.delete()); - try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) { - writer.startSequence(invalidName); - } finally { - Assert.assertTrue(testOutput.delete()); - } - } - - @Test(expectedExceptions = IllegalArgumentException.class, dataProvider = "invalidDescriptionData") - public void testAddingInvalidDescription(final String invalidDescription) throws IOException { - final File testOutput = createTempFile("fwr-test", ".fasta"); - Assert.assertTrue(testOutput.delete()); - try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) { - writer.startSequence("seq1", invalidDescription); - } finally { - Assert.assertTrue(testOutput.delete()); - } - } - - @DataProvider(name = "invalidBplData") - public Object[][] invalidBplData() { - return IntStream.of(0, -1, -110) - .mapToObj(i -> new Object[]{i}).toArray(Object[][]::new); - } - - @DataProvider(name = "invalidNameData") - public Object[][] invalidNameData() { - return Stream.of("seq with spaces", "seq\twith\ttabs", "with blank", " ", "", "nnn\n", "rrr\r", null) - .map(s -> new Object[]{s}).toArray(Object[][]::new); - } - - @DataProvider(name = "invalidDescriptionData") - public Object[][] invalidDescriptionData() { - return Stream.of("\nwith control chars\nthat are not\0tabs\r", "with the null\0", "with nl\n") - .map(s -> new Object[]{s}).toArray(Object[][]::new); - } - - @Test(dataProvider = "testData") - public void testWriter(final SAMSequenceDictionary dictionary, final boolean withIndex, final boolean withDictionary, - final boolean withDescriptions, final int defaultBpl, - final int minBpl, final int maxBpl, final int seed) - throws IOException { - final Map bases = new LinkedHashMap<>(dictionary.getSequences().size()); - final Map bpl = new LinkedHashMap<>(dictionary.getSequences().size()); - final Random rdn = new Random(seed); - generateRandomBasesAndBpls(dictionary, minBpl, maxBpl, bases, bpl, rdn); - final File fastaFile = createTempFile("fwr-test", ".fa"); - Assert.assertTrue(fastaFile.delete()); - final File fastaIndexFile = new File(fastaFile.getParentFile(), fastaFile.getName() + ".fai"); - final File dictFile = new File(fastaFile.getParentFile(), fastaFile.getName().replaceAll("\\.fa", ".dict")); - fastaIndexFile.deleteOnExit(); - dictFile.deleteOnExit(); - - try (final FastaReferenceWriter writer = defaultBpl < 0 - ? new FastaReferenceWriter(fastaFile.toPath(), withIndex, withDictionary) - : new FastaReferenceWriter(fastaFile.toPath(), defaultBpl, withIndex, withDictionary)) { - writeReference(writer, withDescriptions, rdn, dictionary, bases, bpl); - } - assertOutput(fastaFile.toPath(), withIndex, withDictionary, withDescriptions, dictionary, defaultBpl, bases, bpl); - Assert.assertTrue(fastaFile.delete()); - Assert.assertEquals(fastaIndexFile.delete(), withIndex); - Assert.assertEquals(dictFile.delete(), withDictionary); - } - - @Test - public void testSingleSequenceStaticWithBpl() throws IOException, GeneralSecurityException, URISyntaxException { - final File testOutputFile = createTempFile("fwr-test", ".random0.fasta"); - final Map seqs = Collections.singletonMap("seqA", new RandomDNA(1341).nextBases(100)); - final Map bpls = Collections.singletonMap("seqA", 42); - final SAMSequenceDictionary dictionary = new SAMSequenceDictionary( - Collections.singletonList(new SAMSequenceRecord("seqA", 100)) - ); - FastaReferenceWriter.writeSingleSequenceReference(testOutputFile.toPath(), 42, - true, true, "seqA", null, seqs.get("seqA")); - assertOutput(testOutputFile.toPath(), true, true, false, dictionary, 42, seqs, bpls); - Assert.assertTrue(testOutputFile.delete()); - Assert.assertTrue(ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(testOutputFile).delete()); - Assert.assertTrue(ReferenceSequenceFileFactory.getFastaIndexFileName(testOutputFile.toPath()).toFile().delete()); - } - - @Test - public void testSingleSequenceStatic() throws IOException, GeneralSecurityException, URISyntaxException { - final File testOutputFile = createTempFile("fwr-test", ".random0.fasta"); - final Map seqs = Collections.singletonMap("seqA", new RandomDNA(1341).nextBases(100)); - final Map bpls = Collections.singletonMap("seqA", FastaReferenceWriter.DEFAULT_BASES_PER_LINE); - final SAMSequenceDictionary dictionary = new SAMSequenceDictionary( - Collections.singletonList(new SAMSequenceRecord("seqA", 100)) - ); - FastaReferenceWriter.writeSingleSequenceReference(testOutputFile.toPath(), - true, true, "seqA", null, seqs.get("seqA")); - assertOutput(testOutputFile.toPath(), true, true, false, dictionary, FastaReferenceWriter.DEFAULT_BASES_PER_LINE, seqs, bpls); - Assert.assertTrue(testOutputFile.delete()); - Assert.assertTrue(ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(testOutputFile).delete()); - Assert.assertTrue(ReferenceSequenceFileFactory.getFastaIndexFileName(testOutputFile.toPath()).toFile().delete()); - } - - @Test - public void testAlternativeIndexAndDictFileNames() throws IOException, GeneralSecurityException, URISyntaxException { - final File testOutputFile = createTempFile("fwr-test", ".random0.fasta"); - final File testIndexOutputFile = createTempFile("fwr-test", ".random1.fai"); - final File testDictOutputFile = createTempFile("fwr-test", ".random2.dict"); - final SAMSequenceDictionary testDictionary = new SAMSequenceDictionary( - Collections.singletonList(new SAMSequenceRecord("seq1", 100)) - ); - final Map seqs = Collections.singletonMap("seq1", new RandomDNA(1341).nextBases(100)); - final Map bpls = Collections.singletonMap("seq1", -1); - try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutputFile.toPath(), testIndexOutputFile.toPath(), testDictOutputFile.toPath())) { - writer.startSequence("seq1"); - writer.appendBases(seqs.get("seq1")); - } - assertFastaContent(testOutputFile.toPath(), false, testDictionary, -1, seqs, bpls); - assertFastaIndexContent(testOutputFile.toPath(), testIndexOutputFile.toPath(), testDictionary, seqs); - assertFastaDictionaryContent(testDictOutputFile.toPath(), testDictionary); - Assert.assertTrue(testOutputFile.delete()); - Assert.assertTrue(testIndexOutputFile.delete()); - Assert.assertTrue(testDictOutputFile.delete()); - } - - @Test - public void testDirectOutputStreams() throws IOException, GeneralSecurityException, URISyntaxException { - final File testOutputFile = createTempFile("fwr-test", ".random0.fasta"); - final File testIndexOutputFile = createTempFile("fwr-test", ".random1.fai"); - final File testDictOutputFile = createTempFile("fwr-test", ".random2.dict"); - final SAMSequenceDictionary testDictionary = new SAMSequenceDictionary( - Collections.singletonList(new SAMSequenceRecord("seq1", 100)) - ); - final Map seqs = Collections.singletonMap("seq1", new RandomDNA(1341).nextBases(100)); - final Map bpls = Collections.singletonMap("seq1", -1); - try (final OutputStream testOutputStream = new FileOutputStream(testOutputFile); - final OutputStream testIndexOutputStream = new FileOutputStream(testIndexOutputFile); - final OutputStream testDictOutputStream = new FileOutputStream(testDictOutputFile)) { - try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutputStream, 50, testIndexOutputStream, testDictOutputStream)) { - writer.startSequence("seq1"); - writer.appendBases(seqs.get("seq1")); - } - } - assertFastaContent(testOutputFile.toPath(), false, testDictionary, 50, seqs, bpls); - assertFastaIndexContent(testOutputFile.toPath(), testIndexOutputFile.toPath(), testDictionary, seqs); - assertFastaDictionaryContent(testDictOutputFile.toPath(), testDictionary); - Assert.assertTrue(testOutputFile.delete()); - Assert.assertTrue(testIndexOutputFile.delete()); - Assert.assertTrue(testDictOutputFile.delete()); - } - - private void generateRandomBasesAndBpls(SAMSequenceDictionary dictionary, int minBpl, int maxBpl, Map bases, Map bpl, Random rdn) { - final RandomDNA rdnDNA = new RandomDNA(rdn.nextLong()); - // We avoid to use the obvious first choice {@link RandomDNA#nextFasta} as these may actually use - // this writer to do its job eventually. - for (final SAMSequenceRecord sequence : dictionary.getSequences()) { - bases.put(sequence.getSequenceName(), rdnDNA.nextBases(sequence.getSequenceLength())); - if (rdn.nextDouble() < 0.333333) { // 1/3 of the time we will use the default. - bpl.put(sequence.getSequenceName(), -1); - } else { - bpl.put(sequence.getSequenceName(), rdn.nextInt(maxBpl - minBpl + 1) + minBpl); - } - } - } - - - - private static void writeReference(final FastaReferenceWriter writer, final boolean withDescriptions, - final Random rdn, final SAMSequenceDictionary dictionary, - final Map seqs, - final Map basesPerLine) - throws IOException { - for (final SAMSequenceRecord sequence : dictionary.getSequences()) { - final int bpl = basesPerLine.get(sequence.getSequenceName()); - final boolean onOneGo = rdn.nextDouble() < 0.25; // 25% of times we just write the whole sequence of one go. - final boolean useAppendSequence = onOneGo && rdn.nextBoolean(); - if (withDescriptions) { - final String description = String.format("index=%d\tlength=%d", - dictionary.getSequenceIndex(sequence.getSequenceName()), - sequence.getSequenceLength()); - if (bpl < 0) { - if (useAppendSequence) { - Assert.assertSame(writer.appendSequence(sequence.getSequenceName(), description, seqs.get(sequence.getSequenceName())), writer); - } else { - Assert.assertSame(writer.startSequence(sequence.getSequenceName(), description), writer); - } - } else { - if (useAppendSequence) { - Assert.assertSame(writer.appendSequence(sequence.getSequenceName(), description, bpl, seqs.get(sequence.getSequenceName())), writer); - } else { - Assert.assertSame(writer.startSequence(sequence.getSequenceName(), description, bpl), writer); - } - } - } else { - if (bpl < 0) { - if (useAppendSequence) { - Assert.assertSame(writer.appendSequence(sequence.getSequenceName(), seqs.get(sequence.getSequenceName())), writer); - } else { - Assert.assertSame(writer.startSequence(sequence.getSequenceName()), writer); - } - } else { - if (useAppendSequence) { - Assert.assertSame(writer.appendSequence(sequence.getSequenceName(), null, bpl, seqs.get(sequence.getSequenceName())), writer); - } else { - Assert.assertSame(writer.startSequence(sequence.getSequenceName(), bpl), writer); - } - } - } - if (useAppendSequence) { - // added already. - } else if (onOneGo) { - Assert.assertSame(writer.appendBases(seqs.get(sequence.getSequenceName())), writer); - } else { - int done = 0; - while (done < seqs.get(sequence.getSequenceName()).length) { - final boolean useBpl = bpl > 0 && rdn.nextDouble() < 0.10; // 10% of times we exactly add the same number of bases as bases-per-line, remaining bases permitting. - int left = sequence.getSequenceLength() - done; - final int length = useBpl ? Math.min(bpl, left) : rdn.nextInt(left) + 1; - Assert.assertSame(writer.appendBases(seqs.get(sequence.getSequenceName()), done, length), writer); - done += length; - left -= length; - if (useBpl && rdn.nextDouble() < 0.10) { // 10% of the time we align with bpl so that it will start a new line on the next write. - final int lengthToEndOfLine = Math.min(left, bpl - (done % bpl)); - Assert.assertSame(writer.appendBases(seqs.get(sequence.getSequenceName()), done, lengthToEndOfLine), writer); - done += lengthToEndOfLine; - } - if (rdn.nextDouble() < 0.10) { // 10% of the time we do a stupid zero length append. - Assert.assertSame(writer.appendBases(seqs.get(sequence.getSequenceName()), done, 0), writer); - } - } - } - } - } - - @DataProvider(name = "testData") - public Object[][] testData() { - // data-signature: (SAMSequenceDictionary dictionary, boolean withDescriptions, int defaultBpl, int minBpl, int maxBpl, int seed - // defaultBpl == -1 means to use the default {@link FastaReferenceWriter#DEFAULT_BASE_PER_LINE}. - // [minBpl , manBpl] range for possible bpl when the default for the file is not to be used. - final Random rdn = new Random(113); - final SAMSequenceDictionary typicalDictionary = new SAMSequenceDictionary( - Arrays.asList(new SAMSequenceRecord("chr1", 10_000), - new SAMSequenceRecord("chr2", 20_000), - new SAMSequenceRecord("chr3", 20_000), - new SAMSequenceRecord("chr4", 2_000), - new SAMSequenceRecord("chr5", 200), - new SAMSequenceRecord("chr6", 3_010), - new SAMSequenceRecord("X", 1_000) - )); - - final SAMSequenceDictionary manyBPLMatchingSequences = new SAMSequenceDictionary( - IntStream.range(0, 100) - .mapToObj(i -> new SAMSequenceRecord("" + (i + 1), FastaReferenceWriter.DEFAULT_BASES_PER_LINE * (rdn.nextInt(10) + 1))) - .collect(Collectors.toList()) - ); - - final SAMSequenceDictionary singleSequence = new SAMSequenceDictionary(Collections.singletonList(new SAMSequenceRecord("seq", 2_000))); - - final SAMSequenceDictionary oneBaseSequencesContaining = new SAMSequenceDictionary(Arrays.asList(new SAMSequenceRecord("chr1", 10_000), - new SAMSequenceRecord("chr2", 20_000), - new SAMSequenceRecord("chr2.small", 1), - new SAMSequenceRecord("X", 1_000), - new SAMSequenceRecord("MT", 1)) - ); - - final SAMSequenceDictionary[] testDictionaries = new SAMSequenceDictionary[]{typicalDictionary, manyBPLMatchingSequences, singleSequence, oneBaseSequencesContaining}; - final int[] testBpls = new int[]{-1, FastaReferenceWriter.DEFAULT_BASES_PER_LINE, 1, 100, 51, 63}; - final boolean[] testWithDescriptions = new boolean[]{true, false}; - final boolean[] testWithIndex = new boolean[]{true, false}; - final boolean[] testWithDictionary = new boolean[]{true, false}; - final int[] testSeeds = new int[]{31, 113, 73}; - final List result = new ArrayList<>(); - for (final SAMSequenceDictionary dictionary : testDictionaries) { - for (final boolean withIndex : testWithIndex) { - for (final boolean withDictionary : testWithDictionary) { - for (final boolean withDescriptions : testWithDescriptions) { - for (final int bpl : testBpls) { - for (final int seed : testSeeds) { - result.add(new Object[]{dictionary, withIndex, withDictionary, withDescriptions, bpl, 1, (bpl < 0 ? FastaReferenceWriter.DEFAULT_BASES_PER_LINE : bpl) * 2, seed}); - } - } - } - } - } - } - return result.toArray(new Object[result.size()][]); - } -}