Skip to content

Commit

Permalink
adding new ReadQueryNameComparator (#4731)
Browse files Browse the repository at this point in the history
* added a new queryname comparator for GATKRead it operates on GATKRead instead of SAMRecord
* this matches the results of SAMRecordQueryNameComparator exactly
  • Loading branch information
lbergelson authored May 3, 2018
1 parent e331de3 commit ac34d0b
Show file tree
Hide file tree
Showing 2 changed files with 195 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package org.broadinstitute.hellbender.utils.read;

import htsjdk.samtools.SAMRecordQueryNameComparator;
import htsjdk.samtools.SAMTag;

import java.io.Serializable;
import java.util.Comparator;

/**
* compare {@link GATKRead} by queryname
* duplicates the exact ordering of {@link SAMRecordQueryNameComparator}
*/
public class ReadQueryNameComparator implements Comparator<GATKRead>, Serializable {
private static final long serialVersionUID = 1L;

@Override
public int compare(final GATKRead read1, final GATKRead read2) {
int cmp = compareReadNames(read1, read2);
if (cmp != 0) {
return cmp;
}

final boolean r1Paired = read1.isPaired();
final boolean r2Paired = read2.isPaired();

if (r1Paired || r2Paired) {
if (!r1Paired) return 1;
else if (!r2Paired) return -1;
else if (read1.isFirstOfPair() && read2.isSecondOfPair()) return -1;
else if (read1.isSecondOfPair() && read2.isFirstOfPair()) return 1;
}

if (read1.isReverseStrand() != read2.isReverseStrand()) {
return (read1.isReverseStrand()? 1: -1);
}
if (read1.isSecondaryAlignment() != read2.isSecondaryAlignment()) {
return read2.isSecondaryAlignment()? -1: 1;
}
if (read1.isSupplementaryAlignment() != read2.isSupplementaryAlignment()) {
return read2.isSupplementaryAlignment() ? -1 : 1;
}
final Integer hitIndex1 = read1.getAttributeAsInteger(SAMTag.HI.name());
final Integer hitIndex2 = read2.getAttributeAsInteger(SAMTag.HI.name());
if (hitIndex1 != null) {
if (hitIndex2 == null) return 1;
else {
cmp = hitIndex1.compareTo(hitIndex2);
if (cmp != 0) return cmp;
}
} else if (hitIndex2 != null) return -1;
return 0;
}

/**
* compare read names lexicographically without any additional tie breakers
*/
public int compareReadNames(final GATKRead read1, final GATKRead read2) {
return read1.getName().compareTo(read2.getName());
}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
package org.broadinstitute.hellbender.utils.read;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecordQueryNameComparator;
import htsjdk.samtools.SAMTag;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.engine.ReadsDataSource;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.IOException;
import java.util.*;


public class ReadQueryNameComparatorUnitTest extends GATKBaseTest {

public static final SAMFileHeader HEADER =ArtificialReadUtils.createArtificialSamHeader();
public static final String NAME = "NAME";

/**
* Tests that the ordering produced by {@link ReadQueryNameComparator} matches queryname ordering
* as produced by htsjdk's {@link SAMRecordQueryNameComparator} for a representative selection of reads. Ignores
* differences in tie-breaking done for reads with the same position -- just asserts that the reads are
* queryname-sorted according to htsjdk, including unmapped reads with and without an assigned position.
*/
@Test
public void testComparatorOrderingMatchesHtsjdkFileOrdering() throws IOException {
final String inputBam = publicTestDir + "org/broadinstitute/hellbender/utils/read/comparator_test_with_unmapped.bam";
final List<GATKRead> reads = new ArrayList<>();
SAMFileHeader header;

try ( final ReadsDataSource readsSource = new ReadsDataSource(IOUtils.getPath(inputBam)) ) {
header = readsSource.getHeader();

for ( GATKRead read : readsSource ) {
reads.add(read);
}
}

// Randomize ordering and then re-sort
Collections.shuffle(reads);
reads.sort(new ReadQueryNameComparator());

final SAMRecordQueryNameComparator samComparator = new SAMRecordQueryNameComparator();
GATKRead previousRead = null;
for ( final GATKRead currentRead : reads ) {
if ( previousRead != null ) {
Assert.assertTrue(samComparator.compare(previousRead.convertToSAMRecord(header), currentRead.convertToSAMRecord(header)) <= 0,
"Reads are out of order: " + previousRead + " and " + currentRead);
}
previousRead = currentRead;
}
}

@DataProvider
public Object[][] getNames(){
return new Object[][]{
{"A", "B", -1},
{"A","A", 0},
{"AA", "A", 1},
{"1","10", -1},
{"2", "10", 1}
};
}



@Test(dataProvider = "getNames")
public void testCompareNames(String firstName, String secondName, int expected) throws Exception {
ReadQueryNameComparator comp = new ReadQueryNameComparator();
GATKRead first = getRead(firstName);
GATKRead second = getRead(secondName);
Assert.assertEquals(comp.compareReadNames(first, second ), expected);
Assert.assertEquals(comp.compareReadNames(second, first), -expected);
Assert.assertEquals(comp.compareReadNames(first, first), 0);
Assert.assertEquals(comp.compareReadNames(second, second), 0);
}

private static GATKRead getRead(String firstName) {
final GATKRead read = ArtificialReadUtils.createArtificialRead(HEADER, firstName, 1, 100, 10);
return read;
}

@DataProvider
public Iterator<Object[]> getReads(){
final GATKRead differentName = getRead(NAME+NAME);

final GATKRead unpaired = getRead(NAME);
unpaired.setIsPaired(false);

final GATKRead paired = getRead(NAME);
paired.setIsPaired(true);

final GATKRead firstOfPair = getRead(NAME);
firstOfPair.setIsFirstOfPair();

final GATKRead secondOfPair = getRead(NAME);
secondOfPair.setIsSecondOfPair();

final GATKRead reverseStrand = getRead(NAME);
reverseStrand.setIsReverseStrand(true);

final GATKRead supplementary = getRead(NAME);
supplementary.setIsSupplementaryAlignment(true);

final GATKRead secondary = getRead(NAME);
secondary.setIsSecondaryAlignment(true);

final GATKRead tagHI1 = getRead(NAME);
tagHI1.setAttribute(SAMTag.HI.name(), 1);

final GATKRead tagHI2 = getRead(NAME);
tagHI2.setAttribute(SAMTag.HI.name(), 2);

List<GATKRead> reads = Arrays.asList(differentName, unpaired, paired, firstOfPair, secondOfPair, reverseStrand, supplementary, secondary, tagHI1, tagHI2);
List<Object[]> tests = new ArrayList<>();
for(GATKRead left: reads){
for(GATKRead right: reads){
tests.add(new Object[]{left, right});
}
};
return tests.iterator();
}

@Test(dataProvider = "getReads")
public void testTieBreakers(GATKRead left, GATKRead right){
ReadQueryNameComparator readComparator = new ReadQueryNameComparator();
SAMRecordQueryNameComparator samComparator = new SAMRecordQueryNameComparator();
Assert.assertEquals(readComparator.compare(left, right), samComparator.compare(left.convertToSAMRecord(HEADER), right.convertToSAMRecord(HEADER)));
}

}

0 comments on commit ac34d0b

Please sign in to comment.