- * The two main operations that one can invoke on a opened writer is {@link #startSequence} and {@link #appendBases}.
- * The former indicates that we are going to append a new sequence to the output and is invoked once per sequence.
- * The latter adds bases to the current sequence and can be called as many times as is needed.
- *
- *
- * The writer will make sure that the output adheres to the FASTA reference sequence file format restrictions:
- *
- *
Sequence names are valid (non-empty, without space/blank, control characters),
- *
sequence description are valid (without control characters),
- *
bases are valid nucleotides ore IUPAC redundancy codes and X [ACGTNX...] (lower or uppercase are accepted),
- *
sequence cannot have 0 length,
- *
and that each sequence can only appear once in the output
- *
- *
- */
-public final class FastaReferenceWriter implements AutoCloseable {
-
- /**
- * Default number of bases per line.
- */
- public static final int DEFAULT_BASES_PER_LINE = 60;
-
- /**
- * Sequence header start character.
- */
- public static final char HEADER_START_CHAR = '>';
-
- /**
- * Character used to separate the sequence name and the description if any.
- */
- public static final char HEADER_NAME_AND_DESCRIPTION_SEPARATOR = ' ';
-
- /**
- * Charset used for all outputs; fixed to UTF-8.
- */
- private static final Charset CHARSET = Charset.forName("UTF-8");
-
- /**
- * The line separator string.
- */
- private static final char LINE_SEPARATOR_CHR = '\n';
-
- /**
- * Character used to separate the fields in a index file line.
- */
- private static final char INDEX_FIELD_SEPARATOR_CHR = '\t';
-
- /**
- * Convenient cached {@code byte[]} representation of the line separator.
- */
- private static final byte[] LINE_SEPARATOR = String.valueOf(LINE_SEPARATOR_CHR).getBytes(CHARSET);
-
- /**
- * Output stream to the main FASTA output.
- *
- * We use it also to count the number of bytes so far outputted thus the offset included in
- * the index file entry.
- *
- */
- private final CountingOutputStream fastaStream;
-
- /**
- * Writer for the index file.
- */
- private final Writer indexWriter;
-
- /**
- * Output writer to the output dictionary.
- */
- private final Writer dictWriter;
-
- /**
- * Output codec for the dictionary.
- */
- private final SAMSequenceDictionaryCodec dictCodec;
-
- /**
- * Default number of bases per line to be applied unless one is
- */
- private final int defaultBasePerLine;
-
- /**
- * Records the sequences that have been already fully appended to this writer.
- *
- * The key is the sequence name.
- *
- *
- * The value is the sequence length in bases.
- *
- */
- private final Map sequenceNamesAndSizes;
-
- /**
- * Bases per line to be applied to the sequence that is been currently appended to the output.
- */
- private int currentBasesPerLine;
-
- /**
- * Holds the number of bases in the current output line.
- */
- private int currentLineBasesCount;
-
- /**
- * Holds the number of bases so far appended for the current sequence.
- */
- private long currentBasesCount;
-
- /**
- * Holds the FASTA output file offset for the current sequence.
- */
- private long currentSequenceOffset;
-
- /**
- * Holds the name of the sequence that is been appended currently.
- */
- private String currentSequenceName;
-
- /**
- * Flag indicating whether this writer has been already closed.
- */
- private boolean closed;
-
- /**
- * Creates a reference FASTA file writer.
- *
- * The default bases-per-line is set to {@link #DEFAULT_BASES_PER_LINE}.
- *
- *
- * Names for the fasta index and dictionary are constructed from the FASTA output file using common practices
- * as resolved by {@link ReferenceSequenceFileFactory#getFastaIndexFileName(Path)}
- * and {@link ReferenceSequenceFileFactory#getDefaultDictionaryForReferenceSequence(Path)}
- * respectively.
- *
- *
- * @param fastaFile the output fasta file path.
- * @param makeFaiOutput whether an index must be generated.
- * @param makeDictOutput whether a dictionary must be generated.
- * @throws IllegalArgumentException if {@code fastaFile} is {@code null}.
- * @throws IOException if such exception is thrown when accessing the output path resources.
- */
- public FastaReferenceWriter(final Path fastaFile, final boolean makeFaiOutput, final boolean makeDictOutput)
- throws IOException
- {
- this(fastaFile, DEFAULT_BASES_PER_LINE, makeFaiOutput, makeDictOutput);
- }
-
- /**
- * Creates a reference FASTA file writer.
- *
- * Names for the fasta index and dictionary are constructed from the FASTA output file using common practices
- * as resolved by {@link ReferenceSequenceFileFactory#getFastaIndexFileName(Path)}
- * and {@link ReferenceSequenceFileFactory#getDefaultDictionaryForReferenceSequence(Path)}
- * respectively.
- *
- *
- * @param fastaFile the output fasta file path.
- * @param basesPerLine default bases per line.
- * @param makeFaiOutput whether an index must be generated.
- * @param makeDictOutput whether a dictionary must be generated.
- * @throws IllegalArgumentException if {@code fastaFile} is {@code null} or {@code basesPerLine} is 0 or negative.
- * @throws IOException if such exception is thrown when accessing the output path resources.
- */
- public FastaReferenceWriter(final Path fastaFile, final int basesPerLine, final boolean makeFaiOutput,
- final boolean makeDictOutput)
- throws IOException
- {
- this(Utils.nonNull(fastaFile, "the output fasta-file cannot be null"),
- basesPerLine,
- defaultFaiFile(makeFaiOutput, fastaFile),
- defaultDictFile(makeDictOutput, fastaFile));
- }
-
- /**
- * Creates a reference FASTA file writer.
- *
- * The default bases-per-line is set to {@link #DEFAULT_BASES_PER_LINE}.
- *
- *
- * You can specify a specific path for the index and dictionary file. If either set to {@code null} such
- * a file won't be generated.
- *
- *
- * @param fastaFile the output fasta file path.
- * @param indexFile the path of the index file, if requested, {@code null} if none should be generated.
- * @param dictFile the path of the dictFile, if requested, {@code null} if nono should be generated.
- * @throws IllegalArgumentException if {@code fastaFile} is {@code null}.
- * @throws IOException if such exception is thrown when accessing the output path resources.
- */
- public FastaReferenceWriter(final Path fastaFile, final Path indexFile, final Path dictFile)
- throws IOException
- {
- this(fastaFile, DEFAULT_BASES_PER_LINE, indexFile, dictFile);
- }
-
- /**
- * Creates a reference FASTA file writer.
- *
- * You can specify a specific path for the index and dictionary file. If either set to {@code null} such
- * a file won't be generated.
- *
- *
- * @param fastaFile the output fasta file path.
- * @param indexFile the path of the index file, if requested, {@code null} if none should be generated.
- * @param dictFile the path of the dictFile, if requested, {@code null} if nono should be generated.
- * @throws IllegalArgumentException if {@code fastaFile} is {@code null} or {@code basesPerLine} is 0 or negative.
- * @throws IOException if such exception is thrown when accessing the output path resources.
- */
- public FastaReferenceWriter(final Path fastaFile, final int basesPerLine, final Path indexFile, final Path dictFile)
- throws IOException
- {
- // This code is a slight repeat of {@link #FastaReferenceWriter(OutputStream,int,OutputStream,OutputStream)
- // for the sake of avoiding creating output if basesPerLine is invalid.
- this.defaultBasePerLine = checkBasesPerLine(basesPerLine);
-
- this.fastaStream = new CountingOutputStream(new BufferedOutputStream(Files.newOutputStream(Utils.nonNull(fastaFile))));
- this.indexWriter = indexFile == null ? new NullWriter() : new OutputStreamWriter(Files.newOutputStream(indexFile), CHARSET);
- final BufferedWriter dictWriter = new BufferedWriter(dictFile == null ? new NullWriter() : new OutputStreamWriter(Files.newOutputStream(dictFile), CHARSET));
- this.dictWriter = dictWriter;
- this.dictCodec = new SAMSequenceDictionaryCodec(dictWriter);
- this.dictCodec.encodeHeaderLine(false);
- this.sequenceNamesAndSizes = new LinkedHashMap<>();
- }
-
- /**
- * Creates a reference FASTA file writer.
- *
- * You can specify a specific output stream to each file: the main fasta output, its index and its dictionary.
- *
- *
- * @param fastaOutput the output fasta file path.
- * @param indexOutput the output stream to the index file, if requested, {@code null} if none should be generated.
- * @param dictOutput the output stream to the dictFile, if requested, {@code null} if none should be generated.
- * @throws IllegalArgumentException if {@code fastaFile} is {@code null} or {@code basesPerLine} is 0 or negative.
- */
- public FastaReferenceWriter(final OutputStream fastaOutput,
- final int basesPerLine,
- final OutputStream indexOutput,
- final OutputStream dictOutput) {
- this.defaultBasePerLine = checkBasesPerLine(basesPerLine);
- this.fastaStream = new CountingOutputStream(Utils.nonNull(fastaOutput));
- this.indexWriter = indexOutput == null ? new NullWriter() : new OutputStreamWriter(indexOutput, CHARSET);
- final BufferedWriter dictWriter = new BufferedWriter(dictOutput == null ? new NullWriter() : new OutputStreamWriter(dictOutput, CHARSET));
- this.dictWriter = dictWriter;
- this.dictCodec = new SAMSequenceDictionaryCodec(dictWriter);
- this.dictCodec.encodeHeaderLine(false);
- this.sequenceNamesAndSizes = new LinkedHashMap<>();
- }
-
- private static Path defaultFaiFile(final boolean makeFaiFile, final Path fastaFile) {
- return makeFaiFile ? ReferenceSequenceFileFactory.getFastaIndexFileName(fastaFile) : null;
- }
-
- private static Path defaultDictFile(final boolean makeDictFile, final Path fastaFile) {
- return makeDictFile ? ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(fastaFile) : null;
- }
-
- // checks that a sequence name is valid.
- private static void checkSequenceName(final String name) {
- Utils.nonNull(name, "the sequence name cannot be null");
- Utils.validateArg(!name.isEmpty(), "the input sequence name cannot be null");
- for (int i = 0; i < name.length(); i++) {
- final char ch = name.charAt(i);
- if (Character.isWhitespace(ch)) {
- throw new IllegalArgumentException("the input name contains blank characters: '" + StringUtils.escape(name) + "'");
- } else if (Character.isISOControl(ch)) {
- throw new IllegalArgumentException("the input name contains control characters: '" + StringUtils.escape(name) + "'");
- }
- }
- }
-
- private static void checkSequenceBases(final byte[] bases, final int offset, final int length) {
- Utils.nonNull(bases, "the input bases array cannot be null");
- final int to = offset + length;
- for (int i = offset; i < to; i++) {
- final byte b = bases[i];
- if (!Nucleotide.decode(b).isValid()) {
- throw new IllegalArgumentException( "the input sequence contains invalid base calls like: "
- + StringUtils.escape(""+ (char) b));
- }
- }
- }
-
- private static String checkDescription(final String description) {
- if (description == null || description.isEmpty()) {
- return "";
- } else {
- for (int i = 0; i < description.length(); i++) {
- final char c = description.charAt(i);
- if (Character.isISOControl(c) && c != '\t') { // tab is the only valid control char in the description.
- throw new IllegalArgumentException("the input name contains non-tab control characters: '"
- + StringUtils.escape(description) + "'");
- }
- }
- return description;
- }
- }
-
- private static int checkBasesPerLine(final int value) {
- return ParamUtils.isPositive(value, "base per line must be 1 or greater");
- }
-
- /**
- * Starts the input of the bases of a new sequence.
- *
- * This operation automatically closes the previous sequence base input if any.
- *
- *
- * The sequence name cannot contain any blank characters (as determined by {@link Character#isWhitespace(char)}),
- * control characters (as determined by {@link Character#isISOControl(char)}) or the the FASTA header star character
- * {@value #HEADER_START_CHAR}. It cannot be the empty string either ("").
- *
- *
- * No description is included in the output.
- *
- *
- * The input bases-per-line is set to the default provided at construction or {@link #DEFAULT_BASES_PER_LINE}
- * if none was provided.
- *
- *
- * This method cannot be called after the writer has been closed.
- *
- *
- * It also will fail if no base was added to the previous sequence if any.
- *
- * @param sequenceName the name of the new sequence.
- * @return this instance.
- * @throws IllegalArgumentException if any argument does not comply with requirements listed above or if a sequence
- * with the same name has already been added to the writer.
- * @throws IllegalStateException if no base was added to the previous sequence or the writer is already closed.
- * @throws IOException if such exception is thrown when writing into the output resources.
- */
- public FastaReferenceWriter startSequence(final String sequenceName)
- throws IOException
- {
- return startSequence(sequenceName, "", defaultBasePerLine);
- }
-
- /**
- * Starts the input of the bases of a new sequence.
- *
- * This operation automatically closes the previous sequence base input if any.
- *
- *
- * The sequence name cannot contain any blank characters (as determined by {@link Character#isWhitespace(char)}),
- * control characters (as determined by {@link Character#isISOControl(char)}) or the the FASTA header star character
- * {@value #HEADER_START_CHAR}. It cannot be the empty string either ("").
- *
- *
- * The input bases-per-line must be 1 or greater.
- *
- *
- * This method cannot be called after the writer has been closed.
- *
- *
- * It also will fail if no base was added to the previous sequence if any.
- *
- * @param sequenceName the name of the new sequence.
- * @param basesPerLine number of bases per line for this sequence.
- * @return this instance.
- * @throws IllegalArgumentException if any argument does not comply with requirements listed above or if a sequence
- * with the same name has already been added to the writer.
- * @throws IllegalStateException if no base was added to the previous sequence or the writer is already closed.
- * @throws IOException if such exception is thrown when writing into the output resources.
- */
- public FastaReferenceWriter startSequence(final String sequenceName, final int basesPerLine)
- throws IOException
- {
- return startSequence(sequenceName, "", checkBasesPerLine(basesPerLine));
- }
-
- /**
- * Starts the input of the bases of a new sequence.
- *
- * This operation automatically closes the previous sequence base input if any.
- *
- *
- * The sequence name cannot contain any blank characters (as determined by {@link Character#isWhitespace(char)}),
- * control characters (as determined by {@link Character#isISOControl(char)}) or the the FASTA header star character
- * {@value #HEADER_START_CHAR}. It cannot be the empty string either ("").
- *
- *
- * The description cannot contain {@link Character#isISOControl(char)}. If set to {@code null} or the empty
- * string ("") no description will be outputted.
- *
- *
- * The input bases-per-line is set to the default provided at construction or {@link #DEFAULT_BASES_PER_LINE}
- * if none was provided.
- *
- *
- * This method cannot be called after the writer has been closed.
- *
- *
- * It also will fail if no base was added to the previous sequence if any.
- *
- * @param sequenceName the name of the new sequence.
- * @param description optional description for that sequence.
- * @return this instance.
- * @throws IllegalArgumentException if any argument does not comply with requirements listed above or if a sequence
- * with the same name has already been added to the writer.
- * @throws IllegalStateException if no base was added to the previous sequence or the writer is already closed.
- * @throws IOException if such exception is thrown when writing into the output resources.
- */
- public FastaReferenceWriter startSequence(final String sequenceName, final String description)
- throws IOException
- {
- return startSequence(sequenceName, description, defaultBasePerLine);
- }
-
- /**
- * Starts the input of the bases of a new sequence.
- *
- * This operation automatically closes the previous sequence base input if any.
- *
- *
- * The sequence name cannot contain any blank characters (as determined by {@link Character#isWhitespace(char)}),
- * control characters (as determined by {@link Character#isISOControl(char)}) or the the FASTA header star character
- * {@value #HEADER_START_CHAR}. It cannot be the empty string either ("").
- *
- *
- * The description cannot contain {@link Character#isISOControl(char)}. If set to {@code null} or the empty
- * string ("") no description will be outputted.
- *
- *
- * The input bases-per-line must be 1 or greater.
- *
- *
- * This method cannot be called after the writer has been closed.
- *
- *
- * It also will fail if no base was added to the previous sequence if any.
- *
- * @param sequenceName the name of the new sequence.
- * @param description optional description for that sequence.
- * @param basesPerLine number of bases per line for this sequence.
- * @return this instance.
- * @throws IllegalArgumentException if any argument does not comply with requirements listed above.
- * @throws IllegalStateException if no base was added to the previous sequence or the writer is already closed of
- * the sequence has been already added.
- * @throws IOException if such exception is thrown when writing into the output resources.
- */
- public FastaReferenceWriter startSequence(final String sequenceName, final String description, final int basesPerLine)
- throws IOException
- {
- assertIsNotClosed();
- checkSequenceName(sequenceName);
- final String nonNullDescription = checkDescription(description);
- checkBasesPerLine(basesPerLine);
- closeSequence();
- if (sequenceNamesAndSizes.containsKey(sequenceName)) {
- throw new IllegalStateException("the input sequence name '" + sequenceName + "' has already been added");
- }
- currentSequenceName = sequenceName;
- currentBasesPerLine = basesPerLine;
- final StringBuilder builder = new StringBuilder(sequenceName.length() + nonNullDescription.length() + 10);
- builder.append(HEADER_START_CHAR).append(sequenceName);
- if (!nonNullDescription.isEmpty()) {
- builder.append(HEADER_NAME_AND_DESCRIPTION_SEPARATOR).append(nonNullDescription);
- }
- fastaStream.write(builder.toString().getBytes(CHARSET));
- fastaStream.write(LINE_SEPARATOR);
- currentSequenceOffset = fastaStream.getCount();
- return this;
- }
-
- private void closeSequence()
- throws IOException
- {
- if (currentSequenceName != null) {
- if (currentBasesCount == 0) {
- throw new IllegalStateException("startSequence was called but no base was added");
- }
- sequenceNamesAndSizes.put(currentSequenceName, currentBasesCount);
- writeIndexEntry();
- writeDictEntry();
- fastaStream.write(LINE_SEPARATOR);
- currentBasesCount = 0;
- currentLineBasesCount = 0;
- currentSequenceName = null;
- }
- }
-
- private void writeIndexEntry()
- throws IOException
- {
- indexWriter.append(currentSequenceName).append(INDEX_FIELD_SEPARATOR_CHR)
- .append(String.valueOf(currentBasesCount)).append(INDEX_FIELD_SEPARATOR_CHR)
- .append(String.valueOf(currentSequenceOffset)).append(INDEX_FIELD_SEPARATOR_CHR)
- .append(String.valueOf(currentBasesPerLine)).append(INDEX_FIELD_SEPARATOR_CHR)
- .append(String.valueOf(currentBasesPerLine + LINE_SEPARATOR.length)).append(LINE_SEPARATOR_CHR);
- }
-
- private void writeDictEntry() {
- dictCodec.encodeSequenceRecord(new SAMSequenceRecord(currentSequenceName, (int) currentBasesCount));
- }
-
- /**
- * Adds bases to current sequence from a {@code byte} array.
- *
- * @param bases array containing the bases to be added.
- * @return this instance.
- * @throws IllegalArgumentException if {@bases} is {@code null} or
- * the input array contains invalid bases (as assessed by: {@link Nucleotide#decode(byte)}).
- * @throws IllegalStateException if no sequence was started or the writer is already closed.
- * @throws IOException if such exception is throw when writing in any of the outputs.
- */
- public FastaReferenceWriter appendBases(final byte[] bases)
- throws IOException
- {
- return appendBases(bases, 0, bases.length);
- }
-
- /**
- * Adds bases to current sequence from a range in a {@code byte} array.
- *
- * @param bases array containing the bases to be added.
- * @param offset the position of the first base to add.
- * @param length how many bases to be added starting from position {@code offset}.
- * @return this instance.
- * @throws IllegalArgumentException if {@bases} is {@code null} or
- * {@code offset} and {@code length} do not entail a valid range in {@code bases} or
- * that range in {@base} contain invalid bases (as assessed by: {@link Nucleotide#decode(byte)}).
- * @throws IllegalStateException if no sequence was started or the writer is already closed.
- * @throws IOException if such exception is throw when writing in any of the outputs.
- */
- public FastaReferenceWriter appendBases(final byte[] bases, final int offset, final int length)
- throws IOException
- {
- assertIsNotClosed();
- assertSequenceOpen();
- checkSequenceBases(bases, offset, length);
- ParamUtils.isPositiveOrZero(offset, "the input offset cannot be negative");
- ParamUtils.isPositiveOrZero(length, "the input length must not be negative");
- final int to = offset + length;
- Utils.validateArg(to <= bases.length, "the length + offset goes beyond the end of " +
- "the input base array: '" + to + "' > '" + bases.length + "'");
-
- int next = offset;
- while (next < to) {
- if (currentLineBasesCount == currentBasesPerLine) {
- fastaStream.write(LINE_SEPARATOR);
- currentLineBasesCount = 0;
- }
- final int nextLength = Math.min(to - next, currentBasesPerLine - currentLineBasesCount);
- fastaStream.write(bases, next, nextLength);
- currentLineBasesCount += nextLength;
- next += nextLength;
- }
- currentBasesCount += length;
- return this;
- }
-
- /**
- * Appends a new sequence to the output.
- *
- * This is a convenient short handle for {@code startSequence(name).appendBases(bases)}.
- *
- *
- * The new sequence remains open meaning that additional bases for that sequence can be added with additional calls to {@link #appendBases}.
- *
- * @param name the name of the new sequence.
- * @param bases the (first) bases of the sequence.
- * @return a reference to this very same writer.
- * @throws IOException if such an exception is thrown when actually writing into the output streams/channels.
- * @throws IllegalArgumentException if either {@code name} or {@code bases} is {@code null} or contains an invalid value (e.g. unsupported bases or sequence names).
- * @throws IllegalStateException if the writer is already closed, a previous sequence (if any was opened) has no base appended to it or a sequence
- * with such name was already appended to this writer.
- */
- public FastaReferenceWriter appendSequence(final String name, final byte[] bases) throws IOException {
- return startSequence(name).appendBases(bases);
- }
-
- /**
- * Appends a new sequence to the output with or without a description.
- *
- * This is a convenient short handle for {@code startSequence(name, description).appendBases(bases)}.
- *
- *
- * A {@code null} or empty ("") description will be ignored (no description will be output).
- *
- *
- * The new sequence remains open meaning that additional bases for that sequence can be added with additional calls to {@link #appendBases}.
- *
- * @param name the name of the new sequence.
- * @param bases the (first) bases of the sequence.
- * @param description the description for the new sequence.
- * @return a reference to this very same writer.
- * @throws IOException if such an exception is thrown when actually writing into the output streams/channels.
- * @throws IllegalArgumentException if either {@code name} or {@code bases} is {@code null} or contains an invalid value (e.g. unsupported bases or sequence names). Also when
- * the {@code description} contains unsupported characters.
- * @throws IllegalStateException if the writer is already closed, a previous sequence (if any was opened) has no base appended to it or a sequence
- * with such name was already appended to this writer.
- */
- public FastaReferenceWriter appendSequence(final String name, final String description, final byte[] bases) throws IOException {
- return startSequence(name, description).appendBases(bases);
- }
-
- /**
- * Appends a new sequence to the output with or without a description and an alternative number of bases-per-line.
- *
- * This is a convenient short handle for {@code startSequence(name, description, bpl).appendBases(bases)}.
- *
- *
- * A {@code null} or empty ("") description will be ignored (no description will be output).
- *
- *
- * The new sequence remains open meaning that additional bases for that sequence can be added with additional calls to {@link #appendBases}.
- *
- * @param name the name of the new sequence.
- * @param bases the (first) bases of the sequence.
- * @param description the description for the sequence.
- * @param basesPerLine alternative number of bases per line to be used for the sequence.
- * @return a reference to this very same writer.
- * @throws IOException if such an exception is thrown when actually writing into the output streams/channels.
- * @throws IllegalArgumentException if either {@code name} or {@code bases} is {@code null} or contains an invalid value (e.g. unsupported bases or sequence names). Also when the
- * {@code description} contains unsupported characters or {@code basesPerLine} is 0 or negative.
- * @throws IllegalStateException if the writer is already closed, a previous sequence (if any was opened) has no base appended to it or a sequence
- * with such name was already appended to this writer.
- */
- public FastaReferenceWriter appendSequence(final String name, final String description, final int basesPerLine, final byte[] bases) throws IOException {
- return startSequence(name, description, basesPerLine).appendBases(bases);
- }
-
- private void assertSequenceOpen() {
- if (currentSequenceName == null) {
- throw new IllegalStateException("trying to add bases without starting a sequence");
- }
- }
-
- private void assertIsNotClosed() {
- if (closed) {
- throw new IllegalStateException("already closed");
- }
- }
-
- /**
- * Closes this writer flushing all remaining writing operation input the output resources.
- *
- * Further calls to {@link #appendBases} or {@link #startSequence} will result in an exception.
- *
- *
- * @throws IOException if such exception is thrown when closing output writers and output streams.
- * @throws IllegalStateException if closing without writing any sequences or closing when writing a sequence is in progress
- */
- public void close() throws IOException
- {
- if (!closed) {
- try {
- closeSequence();
- if (sequenceNamesAndSizes.isEmpty()) {
- throw new IllegalStateException("no sequences where added to the reference");
- }
- } finally {
- closed = true;
- fastaStream.close();
- indexWriter.close();
- dictWriter.close();
- }
- }
- }
-
- /**
- * Convenient method to write a FASTA file with a single sequence.
- *
- * @param whereTo the path to. must not be null.
- * @param makeIndex whether the index file should be written at its standard location.
- * @param makeDict whether the dictionary file should be written at it standard location.
- * @param name the sequence name, cannot contain white space, or control chracter or the header start character.
- * @param description the sequence description, "" if no description.
- * @param bases the sequence bases, cannot be {@code null}.
- * @throws IOException if such exception is thrown when writing in the output resources.
- */
- public static void writeSingleSequenceReference(final Path whereTo, final boolean makeIndex,
- final boolean makeDict, final String name,
- final String description, final byte[] bases)
- throws IOException
- {
- try (final FastaReferenceWriter writer = new FastaReferenceWriter(whereTo, makeIndex, makeDict)) {
- writer.startSequence(name, description);
- writer.appendBases(bases);
- }
- }
-
- /**
- * Convenient method to write a FASTA file with a single sequence.
- *
- * @param whereTo the path to. must not be null.
- * @param basesPerLine number of bases per line. must be 1 or greater.
- * @param makeIndex whether the index file should be written at its standard location.
- * @param makeDict whether the dictionary file should be written at it standard location.
- * @param name the sequence name, cannot contain white space, or control chracter or the header start character.
- * @param description the sequence description, "" if no description.
- * @param bases the sequence bases, cannot be {@code null}.
- * @throws IOException if such exception is thrown when writing in the output resources.
- */
- public static void writeSingleSequenceReference(final Path whereTo, final int basesPerLine, final boolean makeIndex,
- final boolean makeDict, final String name,
- final String description, final byte[] bases)
- throws IOException
- {
- try (final FastaReferenceWriter writer = new FastaReferenceWriter(whereTo, basesPerLine, makeIndex, makeDict)) {
- writer.startSequence(name, description);
- writer.appendBases(bases);
- }
- }
-}
diff --git a/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java b/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java
index 4dd8958a300..53981307ed3 100644
--- a/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSinkUnitTest.java
@@ -178,7 +178,7 @@ private void assertSingleShardedWritingWorks(String inputBam, String referenceFi
// check that a bai file is created
if (IOUtils.isBamFileName(outputPath) && writeBai) {
- Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + BAMIndex.BAMIndexSuffix)));
+ Assert.assertTrue(Files.exists(IOUtils.getPath(outputPath + BAMIndex.BAI_INDEX_SUFFIX)));
}
// check that a splitting bai file is created
if (IOUtils.isBamFileName(outputPath) && writeSbi) {
diff --git a/src/test/java/org/broadinstitute/hellbender/testutils/testers/MarkDuplicatesSparkTester.java b/src/test/java/org/broadinstitute/hellbender/testutils/testers/MarkDuplicatesSparkTester.java
index 3cc4848274a..36aad372f21 100644
--- a/src/test/java/org/broadinstitute/hellbender/testutils/testers/MarkDuplicatesSparkTester.java
+++ b/src/test/java/org/broadinstitute/hellbender/testutils/testers/MarkDuplicatesSparkTester.java
@@ -3,10 +3,7 @@
import htsjdk.samtools.*;
import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy;
import htsjdk.samtools.metrics.MetricsFile;
-import htsjdk.samtools.util.CloseableIterator;
-import htsjdk.samtools.util.CloserUtil;
-import htsjdk.samtools.util.FormatUtil;
-import htsjdk.samtools.util.TestUtil;
+import htsjdk.samtools.util.*;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
@@ -158,7 +155,7 @@ public void test() {
Assert.assertEquals(observedMetrics.ESTIMATED_LIBRARY_SIZE, expectedMetrics.ESTIMATED_LIBRARY_SIZE, "ESTIMATED_LIBRARY_SIZE does not match expected");
Assert.assertEquals(observedMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS, expectedMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS, "SECONDARY_OR_SUPPLEMENTARY_RDS does not match expected");
} finally {
- TestUtil.recursiveDelete(getOutputDir());
+ IOUtil.recursiveDelete(getOutputDir().toPath());
}
}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/IndexFeatureFileIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/IndexFeatureFileIntegrationTest.java
index 09c6132250d..25a6f11fd74 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/IndexFeatureFileIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/IndexFeatureFileIntegrationTest.java
@@ -209,7 +209,8 @@ public void testBCFIndex() {
checkIndex(index, Arrays.asList("1"));
}
- @Test(expectedExceptions = UserException.CouldNotIndexFile.class)
+ // test disabled until https://github.com/samtools/htsjdk/issues/1323 is resolved
+ @Test(enabled = false)
public void testUncompressedBCF2_2Index() {
final File ORIG_FILE = getTestFile("test_variants_for_index.BCF22uncompressed.bcf");
final File outName = createTempFile("test_variants_for_index.BCF22uncompressed.bcf", ".idx");
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java
index 71bfff76927..eb00a6d8838 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndexIntegrationTest.java
@@ -114,7 +114,7 @@ public ArgumentsBuilder getInputAndOutputArgs(final File inputFile, final File s
@Test
public void testCreateWithBaiCreatesBai(){
final File splittingIndex = getTempIndexFile();
- final File baiIndex = IOUtils.replaceExtension(splittingIndex, BAMIndex.BAMIndexSuffix);
+ final File baiIndex = IOUtils.replaceExtension(splittingIndex, BAMIndex.BAI_INDEX_SUFFIX);
Assert.assertFalse(baiIndex.exists());
final ArgumentsBuilder args = getInputAndOutputArgs(SORTED_BAM, splittingIndex)
.add("--" + CreateHadoopBamSplittingIndex.CREATE_BAI_LONG_NAME);
diff --git a/src/test/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriterUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriterUnitTest.java
deleted file mode 100644
index ffe7f7c30cc..00000000000
--- a/src/test/java/org/broadinstitute/hellbender/utils/reference/FastaReferenceWriterUnitTest.java
+++ /dev/null
@@ -1,518 +0,0 @@
-package org.broadinstitute.hellbender.utils.reference;
-
-import htsjdk.samtools.SAMFileHeader;
-import htsjdk.samtools.SAMSequenceDictionary;
-import htsjdk.samtools.SAMSequenceRecord;
-import htsjdk.samtools.reference.FastaSequenceIndex;
-import htsjdk.samtools.reference.IndexedFastaSequenceFile;
-import htsjdk.samtools.reference.ReferenceSequence;
-import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
-import htsjdk.samtools.util.SequenceUtil;
-import org.broadinstitute.hellbender.GATKBaseTest;
-import org.broadinstitute.hellbender.engine.ReadsDataSource;
-import org.broadinstitute.hellbender.utils.RandomDNA;
-import org.testng.Assert;
-import org.testng.annotations.DataProvider;
-import org.testng.annotations.Test;
-
-import java.io.*;
-import java.net.URISyntaxException;
-import java.nio.file.Path;
-import java.security.GeneralSecurityException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.stream.Collectors;
-import java.util.stream.IntStream;
-import java.util.stream.Stream;
-
-/**
- * Unit tests for {@link FastaReferenceWriter}.
- */
-public class FastaReferenceWriterUnitTest extends GATKBaseTest {
-
- public static void assertOutput(final Path path, final boolean mustHaveIndex, final boolean mustHaveDictionary,
- final boolean withDescriptions, final SAMSequenceDictionary dictionary, final int defaultBpl,
- final Map bases, final Map basesPerLine)
- throws IOException {
- assertFastaContent(path, withDescriptions, dictionary, defaultBpl, bases, basesPerLine);
- if (mustHaveDictionary) {
- assertFastaDictionaryContent(ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(path), dictionary);
- }
- if (mustHaveIndex) {
- assertFastaIndexContent(path, ReferenceSequenceFileFactory.getFastaIndexFileName(path), dictionary, bases);
- }
- }
-
- public static void assertFastaContent(final Path path, final boolean withDescriptions, final SAMSequenceDictionary dictionary, final int defaultBpl,
- final Map bases, final Map basesPerLine)
- throws IOException {
- try (final BufferedReader reader = new BufferedReader(new InputStreamReader(path.getFileSystem().provider().newInputStream(path)))) {
- for (final SAMSequenceRecord sequence : dictionary.getSequences()) {
- final String description = String.format("index=%d\tlength=%d",
- dictionary.getSequenceIndex(sequence.getSequenceName()), sequence.getSequenceLength());
- final String expectedHeader = FastaReferenceWriter.HEADER_START_CHAR + sequence.getSequenceName()
- + ((withDescriptions) ? FastaReferenceWriter.HEADER_NAME_AND_DESCRIPTION_SEPARATOR + description : "");
- Assert.assertTrue(reader.readLine().startsWith(expectedHeader));
- final byte[] expectedBases = bases.get(sequence.getSequenceName());
- final int bpl_ = basesPerLine.get(sequence.getSequenceName());
- final int bpl = bpl_ < 0 ? (defaultBpl < 0 ? FastaReferenceWriter.DEFAULT_BASES_PER_LINE : defaultBpl) : bpl_;
- int offset = 0;
- while (offset < expectedBases.length) {
- final int expectedLength = Math.min(expectedBases.length - offset, bpl);
- final byte[] expectedBaseLine = SequenceUtil.upperCase(
- Arrays.copyOfRange(expectedBases, offset, offset + expectedLength));
- final byte[] actualBaseLine = SequenceUtil.upperCase(reader.readLine().getBytes());
- Assert.assertEquals(actualBaseLine, expectedBaseLine);
- offset += expectedLength;
- }
- }
- }
- }
-
- public static void assertFastaIndexContent(final Path path, final Path indexPath, final SAMSequenceDictionary dictionary,
- final Map bases) {
- final FastaSequenceIndex index = new FastaSequenceIndex(indexPath);
- final IndexedFastaSequenceFile indexedFasta = new IndexedFastaSequenceFile(path, index);
- for (final SAMSequenceRecord sequence : dictionary.getSequences()) {
- final String name = sequence.getSequenceName();
- final int length = sequence.getSequenceLength();
- final ReferenceSequence start = indexedFasta.getSubsequenceAt(name, 1, Math.min(length, 30));
- final ReferenceSequence end = indexedFasta.getSubsequenceAt(name, Math.max(1, length - 29), length);
- final int middlePos = Math.max(1, Math.min(length, length / 2));
- final ReferenceSequence middle = indexedFasta.getSubsequenceAt(name, middlePos, Math.min(middlePos + 29, length));
- Assert.assertEquals(start.getBases(), Arrays.copyOfRange(bases.get(name), 0, start.length()));
- Assert.assertEquals(end.getBases(), Arrays.copyOfRange(bases.get(name), Math.max(0, length - 30), length));
- Assert.assertEquals(middle.getBases(), Arrays.copyOfRange(bases.get(name), middlePos - 1, middlePos - 1 + middle.length()));
- }
- }
-
- public static void assertFastaDictionaryContent(final Path dictPath, final SAMSequenceDictionary dictionary) {
- final ReadsDataSource readsDataSource = new ReadsDataSource(dictPath);
- final SAMFileHeader actualHeader = readsDataSource.getHeader();
- final SAMSequenceDictionary actualDictionary = actualHeader.getSequenceDictionary();
- dictionary.assertSameDictionary(actualDictionary);
- }
-
- @Test(expectedExceptions = IllegalStateException.class)
- public void testEmptySequence() throws IOException {
- final File testOutput = createTempFile("fwr-test", ".fasta");
- Assert.assertTrue(testOutput.delete());
- try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) {
- writer.startSequence("seq1");
- writer.appendBases(new RandomDNA(113).nextBases(100));
- writer.startSequence("seq2");
- writer.startSequence("seq3");
- } finally {
- Assert.assertTrue(testOutput.delete());
- }
- }
-
- @Test(expectedExceptions = IllegalStateException.class)
- public void testEmptyReference() throws IOException {
- final File testOutput = createTempFile("fwr-test", ".fasta");
- Assert.assertTrue(testOutput.delete());
- try {
- new FastaReferenceWriter(testOutput.toPath(), false, false).close();
- } finally {
- Assert.assertTrue(testOutput.delete());
- }
- }
-
- @Test(expectedExceptions = IllegalStateException.class)
- public void testStartSequenceAfterClose() throws IOException {
- final File testOutput = createTempFile("fwr-test", ".fasta");
- Assert.assertTrue(testOutput.delete());
- final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false);
- writer.startSequence("seq1").appendBases(new byte[]{'A', 'C', 'G', 'T'});
- writer.close();
- try {
- writer.startSequence("seq2");
- } finally {
- Assert.assertTrue(testOutput.delete());
- }
- }
-
- @Test(expectedExceptions = IllegalStateException.class)
- public void testAddBasesAfterClose() throws IOException {
- final File testOutput = createTempFile("fwr-test", ".fasta");
- Assert.assertTrue(testOutput.delete());
- final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false);
- writer.startSequence("seq1").appendBases(new byte[]{'A', 'C', 'G', 'T'});
- writer.close();
- try {
- writer.appendBases(new byte[]{'A', 'A', 'A'});
- } finally {
- Assert.assertTrue(testOutput.delete());
- }
- }
-
- @Test(dataProvider = "invalidBplData", expectedExceptions = IllegalArgumentException.class)
- public void testBadDefaultBasesPerLine(final int invalidBpl) throws IOException {
- final File testOutput = createTempFile("fwr-test", ".fasta");
- final File testIndexOutput = ReferenceSequenceFileFactory.getFastaIndexFileName(testOutput.toPath()).toFile();
- final File testDictOutput = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(testOutput);
- Assert.assertTrue(testOutput.delete());
- try {
- new FastaReferenceWriter(testOutput.toPath(), invalidBpl, true, true);
- } finally {
- // make sure that no output file was created:
- Assert.assertFalse(testOutput.delete());
- Assert.assertFalse(testIndexOutput.delete());
- Assert.assertFalse(testDictOutput.delete());
- }
- }
-
- @Test(dataProvider = "invalidBplData", expectedExceptions = IllegalArgumentException.class)
- public void testBadSequenceBasesPerLine(final int invalidBpl) throws IOException {
- final File testOutput = createTempFile("fwr-test", ".fasta");
- Assert.assertTrue(testOutput.delete());
- try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) {
- writer.startSequence("seq1", invalidBpl);
- } finally {
- Assert.assertTrue(testOutput.delete());
- }
- }
-
- @Test(expectedExceptions = IllegalStateException.class)
- public void testEmptySequenceAtTheEnd() throws IOException {
- final File testOutput = createTempFile("fwr-test", ".fasta");
- Assert.assertTrue(testOutput.delete());
- try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) {
- writer.startSequence("seq1");
- writer.appendBases(new RandomDNA(113).nextBases(100));
- writer.startSequence("seq2");
- writer.appendBases(new RandomDNA(13).nextBases(1001));
- writer.startSequence("seq3");
- } finally {
- Assert.assertTrue(testOutput.delete());
- }
- }
-
- @Test(expectedExceptions = IllegalStateException.class)
- public void testAppendBasesBeforeStartingSequence() throws IOException {
- final File testOutput = createTempFile("fwr-test", ".fasta");
- Assert.assertTrue(testOutput.delete());
- try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) {
- writer.appendBases(new RandomDNA(113).nextBases(100));
- } finally {
- Assert.assertTrue(testOutput.delete());
- }
- }
-
- @Test(expectedExceptions = IllegalStateException.class)
- public void testAddingSameSequenceTwice() throws IOException {
- final File testOutput = createTempFile("fwr-test", ".fasta");
- Assert.assertTrue(testOutput.delete());
- try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) {
- writer.startSequence("seq1");
- writer.appendBases(new RandomDNA(113).nextBases(100));
- writer.startSequence("seq2");
- writer.appendBases(new RandomDNA(114).nextBases(300));
- writer.startSequence("seq1");
- } finally {
- Assert.assertTrue(testOutput.delete());
- }
- }
-
- @Test(expectedExceptions = IllegalStateException.class)
- public void testAddingSameSequenceRightAfter() throws IOException {
- final File testOutput = createTempFile("fwr-test", ".fasta");
- Assert.assertTrue(testOutput.delete());
- try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) {
- writer.startSequence("seq1");
- writer.appendBases(new RandomDNA(113).nextBases(100));
- writer.startSequence("seq1");
- } finally {
- Assert.assertTrue(testOutput.delete());
- }
- }
-
- @Test(expectedExceptions = IllegalArgumentException.class, dataProvider = "invalidNameData")
- public void testAddingInvalidSequenceName(final String invalidName) throws IOException {
- final File testOutput = createTempFile("fwr-test", ".fasta");
- Assert.assertTrue(testOutput.delete());
- try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) {
- writer.startSequence(invalidName);
- } finally {
- Assert.assertTrue(testOutput.delete());
- }
- }
-
- @Test(expectedExceptions = IllegalArgumentException.class, dataProvider = "invalidDescriptionData")
- public void testAddingInvalidDescription(final String invalidDescription) throws IOException {
- final File testOutput = createTempFile("fwr-test", ".fasta");
- Assert.assertTrue(testOutput.delete());
- try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutput.toPath(), false, false)) {
- writer.startSequence("seq1", invalidDescription);
- } finally {
- Assert.assertTrue(testOutput.delete());
- }
- }
-
- @DataProvider(name = "invalidBplData")
- public Object[][] invalidBplData() {
- return IntStream.of(0, -1, -110)
- .mapToObj(i -> new Object[]{i}).toArray(Object[][]::new);
- }
-
- @DataProvider(name = "invalidNameData")
- public Object[][] invalidNameData() {
- return Stream.of("seq with spaces", "seq\twith\ttabs", "with blank", " ", "", "nnn\n", "rrr\r", null)
- .map(s -> new Object[]{s}).toArray(Object[][]::new);
- }
-
- @DataProvider(name = "invalidDescriptionData")
- public Object[][] invalidDescriptionData() {
- return Stream.of("\nwith control chars\nthat are not\0tabs\r", "with the null\0", "with nl\n")
- .map(s -> new Object[]{s}).toArray(Object[][]::new);
- }
-
- @Test(dataProvider = "testData")
- public void testWriter(final SAMSequenceDictionary dictionary, final boolean withIndex, final boolean withDictionary,
- final boolean withDescriptions, final int defaultBpl,
- final int minBpl, final int maxBpl, final int seed)
- throws IOException {
- final Map bases = new LinkedHashMap<>(dictionary.getSequences().size());
- final Map bpl = new LinkedHashMap<>(dictionary.getSequences().size());
- final Random rdn = new Random(seed);
- generateRandomBasesAndBpls(dictionary, minBpl, maxBpl, bases, bpl, rdn);
- final File fastaFile = createTempFile("fwr-test", ".fa");
- Assert.assertTrue(fastaFile.delete());
- final File fastaIndexFile = new File(fastaFile.getParentFile(), fastaFile.getName() + ".fai");
- final File dictFile = new File(fastaFile.getParentFile(), fastaFile.getName().replaceAll("\\.fa", ".dict"));
- fastaIndexFile.deleteOnExit();
- dictFile.deleteOnExit();
-
- try (final FastaReferenceWriter writer = defaultBpl < 0
- ? new FastaReferenceWriter(fastaFile.toPath(), withIndex, withDictionary)
- : new FastaReferenceWriter(fastaFile.toPath(), defaultBpl, withIndex, withDictionary)) {
- writeReference(writer, withDescriptions, rdn, dictionary, bases, bpl);
- }
- assertOutput(fastaFile.toPath(), withIndex, withDictionary, withDescriptions, dictionary, defaultBpl, bases, bpl);
- Assert.assertTrue(fastaFile.delete());
- Assert.assertEquals(fastaIndexFile.delete(), withIndex);
- Assert.assertEquals(dictFile.delete(), withDictionary);
- }
-
- @Test
- public void testSingleSequenceStaticWithBpl() throws IOException, GeneralSecurityException, URISyntaxException {
- final File testOutputFile = createTempFile("fwr-test", ".random0.fasta");
- final Map seqs = Collections.singletonMap("seqA", new RandomDNA(1341).nextBases(100));
- final Map bpls = Collections.singletonMap("seqA", 42);
- final SAMSequenceDictionary dictionary = new SAMSequenceDictionary(
- Collections.singletonList(new SAMSequenceRecord("seqA", 100))
- );
- FastaReferenceWriter.writeSingleSequenceReference(testOutputFile.toPath(), 42,
- true, true, "seqA", null, seqs.get("seqA"));
- assertOutput(testOutputFile.toPath(), true, true, false, dictionary, 42, seqs, bpls);
- Assert.assertTrue(testOutputFile.delete());
- Assert.assertTrue(ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(testOutputFile).delete());
- Assert.assertTrue(ReferenceSequenceFileFactory.getFastaIndexFileName(testOutputFile.toPath()).toFile().delete());
- }
-
- @Test
- public void testSingleSequenceStatic() throws IOException, GeneralSecurityException, URISyntaxException {
- final File testOutputFile = createTempFile("fwr-test", ".random0.fasta");
- final Map seqs = Collections.singletonMap("seqA", new RandomDNA(1341).nextBases(100));
- final Map bpls = Collections.singletonMap("seqA", FastaReferenceWriter.DEFAULT_BASES_PER_LINE);
- final SAMSequenceDictionary dictionary = new SAMSequenceDictionary(
- Collections.singletonList(new SAMSequenceRecord("seqA", 100))
- );
- FastaReferenceWriter.writeSingleSequenceReference(testOutputFile.toPath(),
- true, true, "seqA", null, seqs.get("seqA"));
- assertOutput(testOutputFile.toPath(), true, true, false, dictionary, FastaReferenceWriter.DEFAULT_BASES_PER_LINE, seqs, bpls);
- Assert.assertTrue(testOutputFile.delete());
- Assert.assertTrue(ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(testOutputFile).delete());
- Assert.assertTrue(ReferenceSequenceFileFactory.getFastaIndexFileName(testOutputFile.toPath()).toFile().delete());
- }
-
- @Test
- public void testAlternativeIndexAndDictFileNames() throws IOException, GeneralSecurityException, URISyntaxException {
- final File testOutputFile = createTempFile("fwr-test", ".random0.fasta");
- final File testIndexOutputFile = createTempFile("fwr-test", ".random1.fai");
- final File testDictOutputFile = createTempFile("fwr-test", ".random2.dict");
- final SAMSequenceDictionary testDictionary = new SAMSequenceDictionary(
- Collections.singletonList(new SAMSequenceRecord("seq1", 100))
- );
- final Map seqs = Collections.singletonMap("seq1", new RandomDNA(1341).nextBases(100));
- final Map bpls = Collections.singletonMap("seq1", -1);
- try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutputFile.toPath(), testIndexOutputFile.toPath(), testDictOutputFile.toPath())) {
- writer.startSequence("seq1");
- writer.appendBases(seqs.get("seq1"));
- }
- assertFastaContent(testOutputFile.toPath(), false, testDictionary, -1, seqs, bpls);
- assertFastaIndexContent(testOutputFile.toPath(), testIndexOutputFile.toPath(), testDictionary, seqs);
- assertFastaDictionaryContent(testDictOutputFile.toPath(), testDictionary);
- Assert.assertTrue(testOutputFile.delete());
- Assert.assertTrue(testIndexOutputFile.delete());
- Assert.assertTrue(testDictOutputFile.delete());
- }
-
- @Test
- public void testDirectOutputStreams() throws IOException, GeneralSecurityException, URISyntaxException {
- final File testOutputFile = createTempFile("fwr-test", ".random0.fasta");
- final File testIndexOutputFile = createTempFile("fwr-test", ".random1.fai");
- final File testDictOutputFile = createTempFile("fwr-test", ".random2.dict");
- final SAMSequenceDictionary testDictionary = new SAMSequenceDictionary(
- Collections.singletonList(new SAMSequenceRecord("seq1", 100))
- );
- final Map seqs = Collections.singletonMap("seq1", new RandomDNA(1341).nextBases(100));
- final Map bpls = Collections.singletonMap("seq1", -1);
- try (final OutputStream testOutputStream = new FileOutputStream(testOutputFile);
- final OutputStream testIndexOutputStream = new FileOutputStream(testIndexOutputFile);
- final OutputStream testDictOutputStream = new FileOutputStream(testDictOutputFile)) {
- try (final FastaReferenceWriter writer = new FastaReferenceWriter(testOutputStream, 50, testIndexOutputStream, testDictOutputStream)) {
- writer.startSequence("seq1");
- writer.appendBases(seqs.get("seq1"));
- }
- }
- assertFastaContent(testOutputFile.toPath(), false, testDictionary, 50, seqs, bpls);
- assertFastaIndexContent(testOutputFile.toPath(), testIndexOutputFile.toPath(), testDictionary, seqs);
- assertFastaDictionaryContent(testDictOutputFile.toPath(), testDictionary);
- Assert.assertTrue(testOutputFile.delete());
- Assert.assertTrue(testIndexOutputFile.delete());
- Assert.assertTrue(testDictOutputFile.delete());
- }
-
- private void generateRandomBasesAndBpls(SAMSequenceDictionary dictionary, int minBpl, int maxBpl, Map bases, Map bpl, Random rdn) {
- final RandomDNA rdnDNA = new RandomDNA(rdn.nextLong());
- // We avoid to use the obvious first choice {@link RandomDNA#nextFasta} as these may actually use
- // this writer to do its job eventually.
- for (final SAMSequenceRecord sequence : dictionary.getSequences()) {
- bases.put(sequence.getSequenceName(), rdnDNA.nextBases(sequence.getSequenceLength()));
- if (rdn.nextDouble() < 0.333333) { // 1/3 of the time we will use the default.
- bpl.put(sequence.getSequenceName(), -1);
- } else {
- bpl.put(sequence.getSequenceName(), rdn.nextInt(maxBpl - minBpl + 1) + minBpl);
- }
- }
- }
-
-
-
- private static void writeReference(final FastaReferenceWriter writer, final boolean withDescriptions,
- final Random rdn, final SAMSequenceDictionary dictionary,
- final Map seqs,
- final Map basesPerLine)
- throws IOException {
- for (final SAMSequenceRecord sequence : dictionary.getSequences()) {
- final int bpl = basesPerLine.get(sequence.getSequenceName());
- final boolean onOneGo = rdn.nextDouble() < 0.25; // 25% of times we just write the whole sequence of one go.
- final boolean useAppendSequence = onOneGo && rdn.nextBoolean();
- if (withDescriptions) {
- final String description = String.format("index=%d\tlength=%d",
- dictionary.getSequenceIndex(sequence.getSequenceName()),
- sequence.getSequenceLength());
- if (bpl < 0) {
- if (useAppendSequence) {
- Assert.assertSame(writer.appendSequence(sequence.getSequenceName(), description, seqs.get(sequence.getSequenceName())), writer);
- } else {
- Assert.assertSame(writer.startSequence(sequence.getSequenceName(), description), writer);
- }
- } else {
- if (useAppendSequence) {
- Assert.assertSame(writer.appendSequence(sequence.getSequenceName(), description, bpl, seqs.get(sequence.getSequenceName())), writer);
- } else {
- Assert.assertSame(writer.startSequence(sequence.getSequenceName(), description, bpl), writer);
- }
- }
- } else {
- if (bpl < 0) {
- if (useAppendSequence) {
- Assert.assertSame(writer.appendSequence(sequence.getSequenceName(), seqs.get(sequence.getSequenceName())), writer);
- } else {
- Assert.assertSame(writer.startSequence(sequence.getSequenceName()), writer);
- }
- } else {
- if (useAppendSequence) {
- Assert.assertSame(writer.appendSequence(sequence.getSequenceName(), null, bpl, seqs.get(sequence.getSequenceName())), writer);
- } else {
- Assert.assertSame(writer.startSequence(sequence.getSequenceName(), bpl), writer);
- }
- }
- }
- if (useAppendSequence) {
- // added already.
- } else if (onOneGo) {
- Assert.assertSame(writer.appendBases(seqs.get(sequence.getSequenceName())), writer);
- } else {
- int done = 0;
- while (done < seqs.get(sequence.getSequenceName()).length) {
- final boolean useBpl = bpl > 0 && rdn.nextDouble() < 0.10; // 10% of times we exactly add the same number of bases as bases-per-line, remaining bases permitting.
- int left = sequence.getSequenceLength() - done;
- final int length = useBpl ? Math.min(bpl, left) : rdn.nextInt(left) + 1;
- Assert.assertSame(writer.appendBases(seqs.get(sequence.getSequenceName()), done, length), writer);
- done += length;
- left -= length;
- if (useBpl && rdn.nextDouble() < 0.10) { // 10% of the time we align with bpl so that it will start a new line on the next write.
- final int lengthToEndOfLine = Math.min(left, bpl - (done % bpl));
- Assert.assertSame(writer.appendBases(seqs.get(sequence.getSequenceName()), done, lengthToEndOfLine), writer);
- done += lengthToEndOfLine;
- }
- if (rdn.nextDouble() < 0.10) { // 10% of the time we do a stupid zero length append.
- Assert.assertSame(writer.appendBases(seqs.get(sequence.getSequenceName()), done, 0), writer);
- }
- }
- }
- }
- }
-
- @DataProvider(name = "testData")
- public Object[][] testData() {
- // data-signature: (SAMSequenceDictionary dictionary, boolean withDescriptions, int defaultBpl, int minBpl, int maxBpl, int seed
- // defaultBpl == -1 means to use the default {@link FastaReferenceWriter#DEFAULT_BASE_PER_LINE}.
- // [minBpl , manBpl] range for possible bpl when the default for the file is not to be used.
- final Random rdn = new Random(113);
- final SAMSequenceDictionary typicalDictionary = new SAMSequenceDictionary(
- Arrays.asList(new SAMSequenceRecord("chr1", 10_000),
- new SAMSequenceRecord("chr2", 20_000),
- new SAMSequenceRecord("chr3", 20_000),
- new SAMSequenceRecord("chr4", 2_000),
- new SAMSequenceRecord("chr5", 200),
- new SAMSequenceRecord("chr6", 3_010),
- new SAMSequenceRecord("X", 1_000)
- ));
-
- final SAMSequenceDictionary manyBPLMatchingSequences = new SAMSequenceDictionary(
- IntStream.range(0, 100)
- .mapToObj(i -> new SAMSequenceRecord("" + (i + 1), FastaReferenceWriter.DEFAULT_BASES_PER_LINE * (rdn.nextInt(10) + 1)))
- .collect(Collectors.toList())
- );
-
- final SAMSequenceDictionary singleSequence = new SAMSequenceDictionary(Collections.singletonList(new SAMSequenceRecord("seq", 2_000)));
-
- final SAMSequenceDictionary oneBaseSequencesContaining = new SAMSequenceDictionary(Arrays.asList(new SAMSequenceRecord("chr1", 10_000),
- new SAMSequenceRecord("chr2", 20_000),
- new SAMSequenceRecord("chr2.small", 1),
- new SAMSequenceRecord("X", 1_000),
- new SAMSequenceRecord("MT", 1))
- );
-
- final SAMSequenceDictionary[] testDictionaries = new SAMSequenceDictionary[]{typicalDictionary, manyBPLMatchingSequences, singleSequence, oneBaseSequencesContaining};
- final int[] testBpls = new int[]{-1, FastaReferenceWriter.DEFAULT_BASES_PER_LINE, 1, 100, 51, 63};
- final boolean[] testWithDescriptions = new boolean[]{true, false};
- final boolean[] testWithIndex = new boolean[]{true, false};
- final boolean[] testWithDictionary = new boolean[]{true, false};
- final int[] testSeeds = new int[]{31, 113, 73};
- final List