Skip to content

Commit

Permalink
Adding soft clipped ratio read filter - SoftClippedReadFilter (#5995)
Browse files Browse the repository at this point in the history
* Created `SoftClippedReadFilter`
* Added in capability to filter based on ratio of soft clipped bases.
* Added in filter for leading/trailing soft clips only.
  • Loading branch information
jonn-smith authored Jun 14, 2019
1 parent 730164d commit 744987b
Show file tree
Hide file tree
Showing 5 changed files with 297 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,9 @@ private ReadFilterArgumentDefinitions(){}
public static final String SAMPLE_NAME = "sample";

public static final String KEEP_INTERVAL_NAME = "keep-intervals";

public static final String SOFT_CLIPPED_RATIO_THRESHOLD = "soft-clipped-ratio-threshold";
public static final String SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD = "soft-clipped-leading-trailing-ratio";

public static final String INVERT_SOFT_CLIP_RATIO_FILTER = "invert-soft-clip-ratio-filter";
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
package org.broadinstitute.hellbender.engine.filters;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions;
Expand All @@ -20,34 +23,40 @@
* <p>Note: Consecutive soft-clipped blocks are treated as a single block. For example, 1S2S10M1S2S is treated as 3S10M3S</p>
*/
@DocumentedFeature(groupName= HelpConstants.DOC_CAT_READFILTERS, groupSummary=HelpConstants.DOC_CAT_READFILTERS_SUMMARY, summary = "Filter out reads that are over-soft-clipped")
public final class OverclippedReadFilter extends ReadFilter{

public final class OverclippedReadFilter extends ReadFilter {
static final long serialVersionUID = 1L;
private final Logger logger = LogManager.getLogger(this.getClass());

@Argument(fullName = ReadFilterArgumentDefinitions.FILTER_TOO_SHORT_NAME,
doc = "Minimum number of aligned bases",
optional = true)
public int minimumSequenceLength = 30;
private int minAlignedBases = 30;

@Argument(fullName = ReadFilterArgumentDefinitions.DONT_REQUIRE_SOFT_CLIPS_BOTH_ENDS_NAME,
doc = "Allow a read to be filtered out based on having only 1 soft-clipped block. By default, both ends must " +
"have a soft-clipped block, setting this flag requires only 1 soft-clipped block",
optional = true)
public boolean doNotRequireSoftClipsOnBothEnds;
private boolean doNotRequireSoftClipsOnBothEnds;

// Command line parser requires a no-arg constructor
public OverclippedReadFilter() {}

public OverclippedReadFilter(final int minimumSequenceLength, final boolean doNotRequireSoftClipsOnBothEnds) {
this.minimumSequenceLength = minimumSequenceLength;
@VisibleForTesting
OverclippedReadFilter(final int minAlignedBases) {
this.minAlignedBases = minAlignedBases;
}

@VisibleForTesting
OverclippedReadFilter(final int minAlignedBases, final boolean doNotRequireSoftClipsOnBothEnds) {
this.minAlignedBases = minAlignedBases;
this.doNotRequireSoftClipsOnBothEnds = doNotRequireSoftClipsOnBothEnds;
}

@Override
public boolean test(final GATKRead read) {
int alignedLength = 0;
int softClipBlocks = 0;
int minSoftClipBlocks = doNotRequireSoftClipsOnBothEnds ? 1 : 2;
final int minSoftClipBlocks = doNotRequireSoftClipsOnBothEnds ? 1 : 2;
CigarOperator prevOperator = null;

for ( final CigarElement element : read.getCigarElements() ) {
Expand All @@ -63,6 +72,6 @@ public boolean test(final GATKRead read) {
prevOperator = element.getOperator();
}

return(alignedLength >= minimumSequenceLength || softClipBlocks < minSoftClipBlocks);
return (alignedLength >= minAlignedBases || softClipBlocks < minSoftClipBlocks);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
package org.broadinstitute.hellbender.engine.filters;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.read.GATKRead;

/**
* Filter out reads where the ratio of soft-clipped bases to total bases exceeds some given value.
*
* Can choose to filter by number of soft-clipped bases, or by the leading/trailing soft-clipped bases only.
*/
@DocumentedFeature(
groupName=HelpConstants.DOC_CAT_READFILTERS,
groupSummary=HelpConstants.DOC_CAT_READFILTERS_SUMMARY,
summary = "Filter out reads that are over-soft-clipped")
public final class SoftClippedReadFilter extends ReadFilter {
static final long serialVersionUID = 1L;
private final Logger logger = LogManager.getLogger(this.getClass());

@VisibleForTesting
@Argument(fullName = ReadFilterArgumentDefinitions.INVERT_SOFT_CLIP_RATIO_FILTER,
doc = "Inverts the results from this filter, causing all variants that would pass to fail and visa-versa.",
optional = true
)
boolean doInvertFilter = false;

@VisibleForTesting
@Argument(fullName = ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD,
doc = "Threshold ratio of soft clipped bases (anywhere in the cigar string) to total bases in read for read to be filtered.",
optional = true,
mutex = { ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD }
)
Double minimumSoftClippedRatio = null;

@VisibleForTesting
@Argument(fullName = ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD,
doc = "Threshold ratio of soft clipped bases (leading / trailing the cigar string) to total bases in read for read to be filtered.",
optional = true,
mutex = {ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD}
)
Double minimumLeadingTrailingSoftClippedRatio = null;

// Command line parser requires a no-arg constructor
public SoftClippedReadFilter() {}

private boolean testMinSoftClippedRatio(final GATKRead read) {
int totalLength = 0;
int numSoftClippedBases = 0;
for ( final CigarElement element : read.getCigarElements() ) {
if (element.getOperator() == CigarOperator.S) {
numSoftClippedBases += element.getLength();
}
totalLength += element.getLength();
}

final double softClipRatio = ((double)numSoftClippedBases / (double)totalLength);

return softClipRatio > minimumSoftClippedRatio;
}

private boolean testMinLeadingTrailingSoftClippedRatio(final GATKRead read) {

if ( read.getCigarElements().size() < 1 ) {
return false;
}

// Get the index of the last cigar element:
final int lastCigarOpIndex = read.getCigarElements().size() - 1;

// Calculate the number of bases here:
// Note: we don't want to double-count if the read has only 1 cigar operator.
final int numLeadingTrailingSoftClippedBases =
(read.getCigarElement(0).getOperator() == CigarOperator.S ? read.getCigarElement(0).getLength() : 0)
+
((lastCigarOpIndex != 0) && (read.getCigarElement(lastCigarOpIndex).getOperator() == CigarOperator.S)
? read.getCigarElement(lastCigarOpIndex).getLength()
: 0);

// Determine length:
final int totalLength = read.getCigarElements().stream()
.mapToInt( CigarElement::getLength )
.sum();

// Calculate the ratio:
final double softClipRatio = ((double)numLeadingTrailingSoftClippedBases / (double)totalLength);

return softClipRatio > minimumLeadingTrailingSoftClippedRatio;
}

@Override
public boolean test(final GATKRead read) {

final boolean result;

// NOTE: Since we have mutex'd the args for the clipping ratios, we only need to see if they
// have been specified. If they have, that's the filter logic we're using.
// If we specified the clipping ratio, we use the min sequence length test:
if ( minimumSoftClippedRatio != null ) {
result = testMinSoftClippedRatio(read);
}
// If we specified the leading/trailing clipping ratio, we use the min sequence length test:
else if ( minimumLeadingTrailingSoftClippedRatio != null ) {
result = testMinLeadingTrailingSoftClippedRatio(read);
}
else {
throw new UserException(
"Must provide one of the following arguments: " +
ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD + "," +
ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD
);
}

// Check for if we want to invert our results:
if ( doInvertFilter ) {
return !result;
}
return result;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,22 @@ public class OverclippedReadFilterUnitTest extends GATKBaseTest {

private final SAMFileHeader header= ArtificialReadUtils.createArtificialSamHeaderWithGroups(CHR_COUNT, CHR_START, CHR_SIZE, GROUP_COUNT);

private GATKRead buildSAMRead(final String cigarString) {
final Cigar cigar = TextCigarCodec.decode(cigarString);
return ArtificialReadUtils.createArtificialRead(header, cigar);
}

@Test(dataProvider= "OverclippedDataProvider")
public void testOverclippedFilter(final String cigarString, boolean doNotRequireSoftClipsOnBothEnds, final boolean expectedResult) {
public void testOverclippedFilter(final String cigarString,
final boolean doNotRequireSoftClipsOnBothEnds,
final boolean expectedResult) {

final OverclippedReadFilter filter = new OverclippedReadFilter(30, false);
filter.doNotRequireSoftClipsOnBothEnds = doNotRequireSoftClipsOnBothEnds;
final OverclippedReadFilter filter = new OverclippedReadFilter(30, doNotRequireSoftClipsOnBothEnds);
final GATKRead read = buildSAMRead(cigarString);
Assert.assertEquals(filter.test(read), expectedResult, cigarString);
}

private GATKRead buildSAMRead(final String cigarString) {
final Cigar cigar = TextCigarCodec.decode(cigarString);
return ArtificialReadUtils.createArtificialRead(header, cigar);
}

@DataProvider(name= "OverclippedDataProvider")
@DataProvider(name = "OverclippedDataProvider")
public Iterator<Object[]> overclippedDataProvider() {
final List<Object[]> result = new LinkedList<>();

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
package org.broadinstitute.hellbender.engine.filters;

import htsjdk.samtools.Cigar;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.TextCigarCodec;
import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/**
* Unit tests for the {@link SoftClippedReadFilter} class.
* Created by jonn on 6/13/19.
*/
public class SoftClippedReadFilterUnitTest {

private static final int CHR_COUNT = 1;
private static final int CHR_START = 1;
private static final int CHR_SIZE = 1000;
private static final int GROUP_COUNT = 5;

private final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeaderWithGroups(CHR_COUNT, CHR_START, CHR_SIZE, GROUP_COUNT);

private GATKRead buildSAMRead(final String cigarString) {
final Cigar cigar = TextCigarCodec.decode(cigarString);
return ArtificialReadUtils.createArtificialRead(header, cigar);
}

@Test(dataProvider= "SoftClipRatioDataProvider")
public void testOverclippedSoftClipRatioFilter(final String cigarString,
final double clipRatio,
final boolean expectedResult) {

final SoftClippedReadFilter filter = new SoftClippedReadFilter();
filter.minimumSoftClippedRatio = clipRatio;

final GATKRead read = buildSAMRead(cigarString);
Assert.assertEquals(filter.test(read), expectedResult, cigarString);

filter.doInvertFilter = true;
Assert.assertEquals(filter.test(read), !expectedResult, "Inverted case: " + cigarString);
}

@Test(dataProvider= "SoftClippedLeadingTrailingRatioDataProvider")
public void testSoftClippedLeadingTrailingRatioFilter(final String cigarString,
final double clipRatio,
final boolean expectedResult) {

final SoftClippedReadFilter filter = new SoftClippedReadFilter();
filter.minimumLeadingTrailingSoftClippedRatio = clipRatio;

final GATKRead read = buildSAMRead(cigarString);
Assert.assertEquals(filter.test(read), expectedResult, cigarString);

filter.doInvertFilter = true;
Assert.assertEquals(filter.test(read), !expectedResult, "Inverted case: " + cigarString);
}

@DataProvider(name = "SoftClipRatioDataProvider")
public Iterator<Object[]> softClipRatioDataProvider() {
final List<Object[]> testData = new LinkedList<>();

// ---------------------------------------
// Null / trivial cases:
testData.add(new Object[] { "", 0.1, false });
testData.add(new Object[] { "10H", 0.1, false });

// ---------------------------------------
// Soft clip ratio test:

testData.add(new Object[] { "1S1M1S17M", 0.2, false }); // 2/20 = .100
testData.add(new Object[] { "1S1M2S17M", 0.2, false }); // 3/21 = .143
testData.add(new Object[] { "1S1M3S17M", 0.2, false }); // 4/22 = .182
testData.add(new Object[] { "1S1M4S17M", 0.2, true }); // 5/23 = .217
testData.add(new Object[] { "1S1M5S17M", 0.2, true }); // 6/24 = .250
testData.add(new Object[] { "1S1M6S17M", 0.2, true }); // 7/25 = .280

// ---------------------------------------
// Soft clip placement:

testData.add(new Object[] { "101S100M", 0.5, true });
testData.add(new Object[] { "100M101S", 0.5, true });
testData.add(new Object[] { "25H20S10M20S10M20S10M20S10M20S10M20S25H", 0.5, true });

return testData.iterator();
}

@DataProvider(name = "SoftClippedLeadingTrailingRatioDataProvider")
public Iterator<Object[]> softClippedLeadingTrailingRatioDataProvider() {
final List<Object[]> testData = new LinkedList<>();

// ---------------------------------------
// Null / trivial cases:
testData.add(new Object[] { "", 0.1, false });
testData.add(new Object[] { "10H", 0.1, false });

// ---------------------------------------
// Soft clip ratio test:

// Non-leading/-trailing
testData.add(new Object[] { "1S1M1S17M", 0.2, false }); // 2/20 = .100
testData.add(new Object[] { "1S1M2S17M", 0.2, false }); // 3/21 = .143
testData.add(new Object[] { "1S1M3S17M", 0.2, false }); // 4/22 = .182
testData.add(new Object[] { "1S1M4S17M", 0.2, false }); // 5/23 = .217
testData.add(new Object[] { "1S1M5S17M", 0.2, false }); // 6/24 = .250
testData.add(new Object[] { "1S1M6S17M", 0.2, false }); // 7/25 = .280

// Leading:
testData.add(new Object[] { "2S1S1S16M", 0.2, false }); // 2/20 = .100
testData.add(new Object[] { "3S1S1S16M", 0.2, false }); // 3/21 = .143
testData.add(new Object[] { "4S1S1S16M", 0.2, false }); // 4/22 = .182
testData.add(new Object[] { "5S1S1S16M", 0.2, true }); // 5/23 = .217
testData.add(new Object[] { "6S1S1S16M", 0.2, true }); // 6/24 = .250
testData.add(new Object[] { "7S1S1S16M", 0.2, true }); // 7/25 = .280

// Trailing:
testData.add(new Object[] { "1M1S16M2S", 0.2, false }); // 2/20 = .100
testData.add(new Object[] { "1M1S16M3S", 0.2, false }); // 3/21 = .143
testData.add(new Object[] { "1M1S16M4S", 0.2, false }); // 4/22 = .182
testData.add(new Object[] { "1M1S16M5S", 0.2, true }); // 5/23 = .217
testData.add(new Object[] { "1M1S16M6S", 0.2, true }); // 6/24 = .250
testData.add(new Object[] { "1M1S16M7S", 0.2, true }); // 7/25 = .280

// ---------------------------------------
// Soft clip placement:

testData.add(new Object[] { "101S100M", 0.5, true });
testData.add(new Object[] { "100M101S", 0.5, true });
testData.add(new Object[] { "25H20S10M20S10M20S10M20S10M20S10M20S25H", 0.5, false });

return testData.iterator();
}

}

0 comments on commit 744987b

Please sign in to comment.