Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding soft clipped ratio to OverclippedReadFilter #5995

Merged
merged 5 commits into from
Jun 14, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,9 @@ private ReadFilterArgumentDefinitions(){}
public static final String SAMPLE_NAME = "sample";

public static final String KEEP_INTERVAL_NAME = "keep-intervals";

public static final String SOFT_CLIPPED_RATIO_THRESHOLD = "soft-clipped-ratio-threshold";
public static final String SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD = "soft-clipped-leading-trailing-ratio";

public static final String INVERT_FILTER = "invert-filter";
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would make this "invert-soft-clip-ratio" or something like that to make it a little clearer that its associated with THIS filter not just any filter on the command line

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. Really, this should be a feature of every reader. I'll add a ticket for it.

}
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
package org.broadinstitute.hellbender.engine.filters;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions;
Expand All @@ -20,34 +23,41 @@
* <p>Note: Consecutive soft-clipped blocks are treated as a single block. For example, 1S2S10M1S2S is treated as 3S10M3S</p>
*/
@DocumentedFeature(groupName= HelpConstants.DOC_CAT_READFILTERS, groupSummary=HelpConstants.DOC_CAT_READFILTERS_SUMMARY, summary = "Filter out reads that are over-soft-clipped")
public final class OverclippedReadFilter extends ReadFilter{

public final class OverclippedReadFilter extends ReadFilter {
static final long serialVersionUID = 1L;
private final Logger logger = LogManager.getLogger(this.getClass());

@Argument(fullName = ReadFilterArgumentDefinitions.FILTER_TOO_SHORT_NAME,
doc = "Minimum number of aligned bases",
optional = true)
public int minimumSequenceLength = 30;
private int minAlignedBases = 30;

@Argument(fullName = ReadFilterArgumentDefinitions.DONT_REQUIRE_SOFT_CLIPS_BOTH_ENDS_NAME,
doc = "Allow a read to be filtered out based on having only 1 soft-clipped block. By default, both ends must " +
"have a soft-clipped block, setting this flag requires only 1 soft-clipped block",
optional = true)
public boolean doNotRequireSoftClipsOnBothEnds;
optional = true,
mutex = {"soft-clipped-ratio-threshold", "leading-trailing-soft-clipped-ratio"})
private boolean doNotRequireSoftClipsOnBothEnds;

// Command line parser requires a no-arg constructor
public OverclippedReadFilter() {}

public OverclippedReadFilter(final int minimumSequenceLength, final boolean doNotRequireSoftClipsOnBothEnds) {
this.minimumSequenceLength = minimumSequenceLength;
@VisibleForTesting
OverclippedReadFilter(final int minAlignedBases) {
this.minAlignedBases = minAlignedBases;
}

@VisibleForTesting
OverclippedReadFilter(final int minAlignedBases, final boolean doNotRequireSoftClipsOnBothEnds) {
this.minAlignedBases = minAlignedBases;
this.doNotRequireSoftClipsOnBothEnds = doNotRequireSoftClipsOnBothEnds;
}

@Override
public boolean test(final GATKRead read) {
int alignedLength = 0;
int softClipBlocks = 0;
int minSoftClipBlocks = doNotRequireSoftClipsOnBothEnds ? 1 : 2;
final int minSoftClipBlocks = doNotRequireSoftClipsOnBothEnds ? 1 : 2;
CigarOperator prevOperator = null;

for ( final CigarElement element : read.getCigarElements() ) {
Expand All @@ -63,6 +73,6 @@ public boolean test(final GATKRead read) {
prevOperator = element.getOperator();
}

return(alignedLength >= minimumSequenceLength || softClipBlocks < minSoftClipBlocks);
return (alignedLength >= minAlignedBases || softClipBlocks < minSoftClipBlocks);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
package org.broadinstitute.hellbender.engine.filters;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.read.GATKRead;

/**
* Filter out reads where the ratio of soft-clipped bases to total bases exceeds some given value.
*
* Can choose to filter by number of soft-clipped bases, or by the leading/trailing soft-clipped bases only.
*/
@DocumentedFeature(
groupName=HelpConstants.DOC_CAT_READFILTERS,
groupSummary=HelpConstants.DOC_CAT_READFILTERS_SUMMARY,
summary = "Filter out reads that are over-soft-clipped")
public final class SoftClippedReadFilter extends ReadFilter {
static final long serialVersionUID = 1L;
private final Logger logger = LogManager.getLogger(this.getClass());

@VisibleForTesting
@Argument(fullName = ReadFilterArgumentDefinitions.INVERT_FILTER,
doc = "Inverts the results from this filter, causing all variants that would pass to fail and visa-versa.",
optional = true
)
boolean doInvertFilter = false;

@VisibleForTesting
@Argument(fullName = ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD,
doc = "Threshold ratio of soft clipped bases (anywhere in the cigar string) to total bases in read for read to be filtered.",
optional = true,
mutex = { ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD }
)
Double minimumSoftClippedRatio = null;

@VisibleForTesting
@Argument(fullName = ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD,
doc = "Threshold ratio of soft clipped bases (leading / trailing the cigar string) to total bases in read for read to be filtered.",
optional = true,
mutex = {ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD}
)
Double minimumLeadingTrailingSoftClippedRatio = null;

// Command line parser requires a no-arg constructor
public SoftClippedReadFilter() {}

private boolean testMinSoftClippedRatio(final GATKRead read) {
int totalLength = 0;
int numSoftClippedBases = 0;
for ( final CigarElement element : read.getCigarElements() ) {
if (element.getOperator() == CigarOperator.S) {
numSoftClippedBases += element.getLength();
}
totalLength += element.getLength();
}

final double softClipRatio = ((double)numSoftClippedBases / (double)totalLength);

return softClipRatio > minimumSoftClippedRatio;
}

private boolean testMinLeadingTrailingSoftClippedRatio(final GATKRead read) {

if ( read.getCigarElements().size() < 1 ) {
return false;
}

// Get the index of the last cigar element:
final int lastCigarOpIndex = read.getCigarElements().size() - 1;

// Calculate the number of bases here:
// Note: we don't want to double-count if the read has only 1 cigar operator.
final int numLeadingTrailingSoftClippedBases =
(read.getCigarElement(0).getOperator() == CigarOperator.S ? read.getCigarElement(0).getLength() : 0)
+
((lastCigarOpIndex != 0) && (read.getCigarElement(lastCigarOpIndex).getOperator() == CigarOperator.S)
? read.getCigarElement(lastCigarOpIndex).getLength()
: 0);

// Determine length:
final int totalLength = read.getCigarElements().stream()
.mapToInt( CigarElement::getLength )
.sum();

// Calculate the ratio:
final double softClipRatio = ((double)numLeadingTrailingSoftClippedBases / (double)totalLength);

return softClipRatio > minimumLeadingTrailingSoftClippedRatio;
}

@Override
public boolean test(final GATKRead read) {

final boolean result;

// NOTE: Since we have mutex'd the args for the clipping ratios, we only need to see if they
// have been specified. If they have, that's the filter logic we're using.
// If we specified the clipping ratio, we use the min sequence length test:
if ( minimumSoftClippedRatio != null ) {
result = testMinSoftClippedRatio(read);
}
// If we specified the leading/trailing clipping ratio, we use the min sequence length test:
else if ( minimumLeadingTrailingSoftClippedRatio != null ) {
result = testMinLeadingTrailingSoftClippedRatio(read);
}
else {
throw new UserException(
"Must provide one of the following arguments: " +
ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD + "," +
ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD
);
}

// Check for if we want to invert our results:
if ( doInvertFilter ) {
return !result;
}
return result;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,22 @@ public class OverclippedReadFilterUnitTest extends GATKBaseTest {

private final SAMFileHeader header= ArtificialReadUtils.createArtificialSamHeaderWithGroups(CHR_COUNT, CHR_START, CHR_SIZE, GROUP_COUNT);

private GATKRead buildSAMRead(final String cigarString) {
final Cigar cigar = TextCigarCodec.decode(cigarString);
return ArtificialReadUtils.createArtificialRead(header, cigar);
}

@Test(dataProvider= "OverclippedDataProvider")
public void testOverclippedFilter(final String cigarString, boolean doNotRequireSoftClipsOnBothEnds, final boolean expectedResult) {
public void testOverclippedFilter(final String cigarString,
final boolean doNotRequireSoftClipsOnBothEnds,
final boolean expectedResult) {

final OverclippedReadFilter filter = new OverclippedReadFilter(30, false);
filter.doNotRequireSoftClipsOnBothEnds = doNotRequireSoftClipsOnBothEnds;
final OverclippedReadFilter filter = new OverclippedReadFilter(30, doNotRequireSoftClipsOnBothEnds);
final GATKRead read = buildSAMRead(cigarString);
Assert.assertEquals(filter.test(read), expectedResult, cigarString);
}

private GATKRead buildSAMRead(final String cigarString) {
final Cigar cigar = TextCigarCodec.decode(cigarString);
return ArtificialReadUtils.createArtificialRead(header, cigar);
}

@DataProvider(name= "OverclippedDataProvider")
@DataProvider(name = "OverclippedDataProvider")
public Iterator<Object[]> overclippedDataProvider() {
final List<Object[]> result = new LinkedList<>();

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
package org.broadinstitute.hellbender.engine.filters;

import htsjdk.samtools.Cigar;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.TextCigarCodec;
import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/**
* Unit tests for the {@link SoftClippedReadFilter} class.
* Created by jonn on 6/13/19.
*/
public class SoftClippedReadFilterUnitTest {

private static final int CHR_COUNT = 1;
private static final int CHR_START = 1;
private static final int CHR_SIZE = 1000;
private static final int GROUP_COUNT = 5;

private final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeaderWithGroups(CHR_COUNT, CHR_START, CHR_SIZE, GROUP_COUNT);

private GATKRead buildSAMRead(final String cigarString) {
final Cigar cigar = TextCigarCodec.decode(cigarString);
return ArtificialReadUtils.createArtificialRead(header, cigar);
}

@Test(dataProvider= "SoftClipRatioDataProvider")
public void testOverclippedSoftClipRatioFilter(final String cigarString,
final double clipRatio,
final boolean expectedResult) {

final SoftClippedReadFilter filter = new SoftClippedReadFilter();
filter.minimumSoftClippedRatio = clipRatio;

final GATKRead read = buildSAMRead(cigarString);
Assert.assertEquals(filter.test(read), expectedResult, cigarString);

filter.doInvertFilter = true;
Assert.assertEquals(filter.test(read), !expectedResult, "Inverted case: " + cigarString);
}

@Test(dataProvider= "SoftClippedLeadingTrailingRatioDataProvider")
public void testSoftClippedLeadingTrailingRatioFilter(final String cigarString,
final double clipRatio,
final boolean expectedResult) {

final SoftClippedReadFilter filter = new SoftClippedReadFilter();
filter.minimumLeadingTrailingSoftClippedRatio = clipRatio;

final GATKRead read = buildSAMRead(cigarString);
Assert.assertEquals(filter.test(read), expectedResult, cigarString);

filter.doInvertFilter = true;
Assert.assertEquals(filter.test(read), !expectedResult, "Inverted case: " + cigarString);
}

@DataProvider(name = "SoftClipRatioDataProvider")
public Iterator<Object[]> softClipRatioDataProvider() {
final List<Object[]> testData = new LinkedList<>();

// ---------------------------------------
// Null / trivial cases:
testData.add(new Object[] { "", 0.1, false });
testData.add(new Object[] { "10H", 0.1, false });

// ---------------------------------------
// Soft clip ratio test:

testData.add(new Object[] { "1S1M1S17M", 0.2, false }); // 2/20 = .100
testData.add(new Object[] { "1S1M2S17M", 0.2, false }); // 3/21 = .143
testData.add(new Object[] { "1S1M3S17M", 0.2, false }); // 4/22 = .182
testData.add(new Object[] { "1S1M4S17M", 0.2, true }); // 5/23 = .217
testData.add(new Object[] { "1S1M5S17M", 0.2, true }); // 6/24 = .250
testData.add(new Object[] { "1S1M6S17M", 0.2, true }); // 7/25 = .280

// ---------------------------------------
// Soft clip placement:

testData.add(new Object[] { "101S100M", 0.5, true });
testData.add(new Object[] { "100M101S", 0.5, true });
testData.add(new Object[] { "25H20S10M20S10M20S10M20S10M20S10M20S25H", 0.5, true });

return testData.iterator();
}

@DataProvider(name = "SoftClippedLeadingTrailingRatioDataProvider")
public Iterator<Object[]> softClippedLeadingTrailingRatioDataProvider() {
final List<Object[]> testData = new LinkedList<>();

// ---------------------------------------
// Null / trivial cases:
testData.add(new Object[] { "", 0.1, false });
testData.add(new Object[] { "10H", 0.1, false });

// ---------------------------------------
// Soft clip ratio test:

// Non-leading/-trailing
testData.add(new Object[] { "1S1M1S17M", 0.2, false }); // 2/20 = .100
testData.add(new Object[] { "1S1M2S17M", 0.2, false }); // 3/21 = .143
testData.add(new Object[] { "1S1M3S17M", 0.2, false }); // 4/22 = .182
testData.add(new Object[] { "1S1M4S17M", 0.2, false }); // 5/23 = .217
testData.add(new Object[] { "1S1M5S17M", 0.2, false }); // 6/24 = .250
testData.add(new Object[] { "1S1M6S17M", 0.2, false }); // 7/25 = .280

// Leading:
testData.add(new Object[] { "2S1S1S16M", 0.2, false }); // 2/20 = .100
testData.add(new Object[] { "3S1S1S16M", 0.2, false }); // 3/21 = .143
testData.add(new Object[] { "4S1S1S16M", 0.2, false }); // 4/22 = .182
testData.add(new Object[] { "5S1S1S16M", 0.2, true }); // 5/23 = .217
testData.add(new Object[] { "6S1S1S16M", 0.2, true }); // 6/24 = .250
testData.add(new Object[] { "7S1S1S16M", 0.2, true }); // 7/25 = .280

// Trailing:
testData.add(new Object[] { "1M1S16M2S", 0.2, false }); // 2/20 = .100
testData.add(new Object[] { "1M1S16M3S", 0.2, false }); // 3/21 = .143
testData.add(new Object[] { "1M1S16M4S", 0.2, false }); // 4/22 = .182
testData.add(new Object[] { "1M1S16M5S", 0.2, true }); // 5/23 = .217
testData.add(new Object[] { "1M1S16M6S", 0.2, true }); // 6/24 = .250
testData.add(new Object[] { "1M1S16M7S", 0.2, true }); // 7/25 = .280

// ---------------------------------------
// Soft clip placement:

testData.add(new Object[] { "101S100M", 0.5, true });
testData.add(new Object[] { "100M101S", 0.5, true });
testData.add(new Object[] { "25H20S10M20S10M20S10M20S10M20S10M20S25H", 0.5, false });

return testData.iterator();
}

}