Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update HTSJDK to 4.1.1 and Picard to 3.2.0 #8900

Merged
merged 1 commit into from
Jun 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ repositories {
mavenLocal()
}

final htsjdkVersion = System.getProperty('htsjdk.version','4.1.0')
final picardVersion = System.getProperty('picard.version','3.1.1')
final htsjdkVersion = System.getProperty('htsjdk.version','4.1.1')
final picardVersion = System.getProperty('picard.version','3.2.0')
final barclayVersion = System.getProperty('barclay.version','5.0.0')
final sparkVersion = System.getProperty('spark.version', '3.5.0')
final hadoopVersion = System.getProperty('hadoop.version', '3.3.6')
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
package org.broadinstitute.hellbender.engine;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.cram.build.CRAMReferenceRegion;
import htsjdk.samtools.cram.ref.ReferenceContext;
import htsjdk.samtools.cram.ref.ReferenceSource;
import htsjdk.samtools.cram.structure.AlignmentContext;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class CRAMSupportUnitTest extends GATKBaseTest {

private static final int REFERENCE_SEQUENCE_ZERO = 0;
private static final int REFERENCE_CONTIG_LENGTH = 10000;

private static final SAMFileHeader SAM_FILE_HEADER = createSAMFileHeader();

private static SAMFileHeader createSAMFileHeader() {
final List<SAMSequenceRecord> sequenceRecords = new ArrayList<>();
sequenceRecords.add(new SAMSequenceRecord("0", REFERENCE_CONTIG_LENGTH));
sequenceRecords.add(new SAMSequenceRecord("1", REFERENCE_CONTIG_LENGTH));
final SAMFileHeader header = new SAMFileHeader(new SAMSequenceDictionary(sequenceRecords));
header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
return header;
}

/*
* This is a unit test ported from CRAMReferenceRegionTest in HTSJDK that tests for the conditions that
* trigger the CRAM base corruption bug reported in https://github.com/broadinstitute/gatk/issues/8768 and
* fixed in HTSJDK 4.1.1 / GATK 4.6.0.0.
*
* This test fails using HTSJDK version 4.1.0, which predates the fix in https://github.com/samtools/htsjdk/pull/1708
* for the bug reported in https://github.com/broadinstitute/gatk/issues/8768
*
* Simulates the state transitions that occur when writing a CRAM file; specifically, use transitions that mirror
* the ones that occur when writing a CRAM with the conditions that affect (and that fail without the fix to)
* https://github.com/broadinstitute/gatk/issues/8768, i.e., a container with one or more containers with reads
* aligned to position 1 of a given contig, followed by one or more containers with reads aligned past position 1
* of the same contig.
*/
@Test
public void testCRAMBaseCorruptionBugIssue8768() {
// start with an entire reference sequence
final CRAMReferenceRegion cramReferenceRegion = getAlternatingReferenceRegion();
cramReferenceRegion.fetchReferenceBases(REFERENCE_SEQUENCE_ZERO);
final long fullRegionFragmentLength = cramReferenceRegion.getRegionLength();
Assert.assertEquals(fullRegionFragmentLength, REFERENCE_CONTIG_LENGTH);

// transition to a shorter reference fragment using fetchReferenceBasesByRegion, then back to the full region
final int SHORT_FRAGMENT_LENGTH = 5;
Assert.assertTrue(SHORT_FRAGMENT_LENGTH < fullRegionFragmentLength);
cramReferenceRegion.fetchReferenceBasesByRegion(REFERENCE_SEQUENCE_ZERO, 0, SHORT_FRAGMENT_LENGTH);
Assert.assertEquals(cramReferenceRegion.getRegionLength(), SHORT_FRAGMENT_LENGTH);

// now transition back to the full sequence; this is where the bug previously would have occurred
cramReferenceRegion.fetchReferenceBases(REFERENCE_SEQUENCE_ZERO);
// this assert would fail without the fix
Assert.assertEquals(cramReferenceRegion.getRegionLength(), fullRegionFragmentLength);

// transition to a shorter region fragment length using fetchReferenceBasesByRegion(AlignmentContext), then back to the full region
Assert.assertTrue(SHORT_FRAGMENT_LENGTH < fullRegionFragmentLength);
cramReferenceRegion.fetchReferenceBasesByRegion(
new AlignmentContext(
new ReferenceContext(REFERENCE_SEQUENCE_ZERO),
1,
SHORT_FRAGMENT_LENGTH));
Assert.assertEquals(cramReferenceRegion.getRegionLength(), SHORT_FRAGMENT_LENGTH);

// now transition back to the full sequence
cramReferenceRegion.fetchReferenceBases(REFERENCE_SEQUENCE_ZERO);
Assert.assertEquals(cramReferenceRegion.getRegionLength(), fullRegionFragmentLength);
}

private static CRAMReferenceRegion getAlternatingReferenceRegion() {
return new CRAMReferenceRegion(
new ReferenceSource(getReferenceFileWithAlternatingBases(REFERENCE_CONTIG_LENGTH)),
SAM_FILE_HEADER.getSequenceDictionary());
}

private static InMemoryReferenceSequenceFile getReferenceFileWithAlternatingBases(final int length) {
final InMemoryReferenceSequenceFile referenceFile = new InMemoryReferenceSequenceFile();

// one contig with repeated ACGT...
final byte[] seq0Bases = getRepeatingBaseSequence(REFERENCE_CONTIG_LENGTH, false);
referenceFile.add("0", seq0Bases);

// one contig with repeated TGCA...
final byte[] seq1Bases = getRepeatingBaseSequence(REFERENCE_CONTIG_LENGTH, true);
referenceFile.add("1", seq1Bases);

return referenceFile;
}

// fill an array with the repeated base sequence "ACGTACGTACGT...", or reversed
private static byte[] getRepeatingBaseSequence(final int length, final boolean reversed) {
byte[] bases = new byte[length];
for (int i = 0; (i + 4) < bases.length; i += 4) {
bases[i] = (byte) (reversed ? 'T' : 'A');
bases[i+1] = (byte) (reversed ? 'G' : 'C');
bases[i+2] = (byte) (reversed ? 'C' : 'G');
bases[i+3] = (byte) (reversed ? 'A' : 'T');
}
return bases;
}
private static class InMemoryReferenceSequenceFile implements
ReferenceSequenceFile {
Map<String, ReferenceSequence> map = new HashMap<String, ReferenceSequence>();
List<String> index;
int current = 0;

public void add(final String name, final byte[] bases) {
final ReferenceSequence sequence = new ReferenceSequence(name,
map.size(), bases);
map.put(sequence.getName(), sequence);
}

@Override
public void reset() {
current = 0;
}

@Override
public ReferenceSequence nextSequence() {
if (current >= index.size()) return null;
return map.get(index.get(current++));
}

@Override
public boolean isIndexed() {
return true;
}

@Override
public ReferenceSequence getSubsequenceAt(final String contig, final long start,
final long stop) {
final ReferenceSequence sequence = getSequence(contig);
if (sequence == null) return null;
final byte[] bases = new byte[(int) (stop - start) + 1];
System.arraycopy(sequence.getBases(), (int) start - 1, bases, 0,
bases.length);
return new ReferenceSequence(contig, sequence.getContigIndex(),
bases);
}

@Override
public SAMSequenceDictionary getSequenceDictionary() {
return null;
}

@Override
public ReferenceSequence getSequence(final String contig) {
return map.get(contig);
}

@Override
public void close() throws IOException {
map.clear();
index.clear();
current = 0;
}
}
}
Loading