-
Notifications
You must be signed in to change notification settings - Fork 596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding soft clipped ratio to OverclippedReadFilter #5995
Changes from 2 commits
09d66d5
8a5a370
9e3f0c2
a157452
019f990
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,10 @@ | ||
package org.broadinstitute.hellbender.engine.filters; | ||
|
||
import com.google.common.annotations.VisibleForTesting; | ||
import htsjdk.samtools.CigarElement; | ||
import htsjdk.samtools.CigarOperator; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.help.DocumentedFeature; | ||
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions; | ||
|
@@ -20,34 +23,58 @@ | |
* <p>Note: Consecutive soft-clipped blocks are treated as a single block. For example, 1S2S10M1S2S is treated as 3S10M3S</p> | ||
*/ | ||
@DocumentedFeature(groupName= HelpConstants.DOC_CAT_READFILTERS, groupSummary=HelpConstants.DOC_CAT_READFILTERS_SUMMARY, summary = "Filter out reads that are over-soft-clipped") | ||
public final class OverclippedReadFilter extends ReadFilter{ | ||
|
||
public final class OverclippedReadFilter extends ReadFilter { | ||
static final long serialVersionUID = 1L; | ||
private final Logger logger = LogManager.getLogger(this.getClass()); | ||
|
||
private static final double DEFAULT_MIN_SOFT_CLIPPED_RATIO = Double.MIN_VALUE; | ||
|
||
@Argument(fullName = ReadFilterArgumentDefinitions.FILTER_TOO_SHORT_NAME, | ||
doc = "Minimum number of aligned bases", | ||
optional = true) | ||
public int minimumSequenceLength = 30; | ||
private int minAlignedBases = 30; | ||
|
||
@VisibleForTesting | ||
@Argument(fullName = "soft-clipped-ratio-threshold", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. move these names to constants in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed. |
||
doc = "Minimum ratio of soft clipped bases (anywhere in the cigar string) to total bases in read for read to be included.", | ||
optional = true, | ||
mutex = {ReadFilterArgumentDefinitions.FILTER_TOO_SHORT_NAME, "leading-trailing-soft-clipped-ratio"} | ||
) | ||
double minimumSoftClippedRatio = DEFAULT_MIN_SOFT_CLIPPED_RATIO; | ||
|
||
@VisibleForTesting | ||
@Argument(fullName = "leading-trailing-soft-clipped-ratio", | ||
doc = "Minimum ratio of soft clipped bases (leading / trailing the cigar string) to total bases in read for read to be included.", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there any reason to necessarily make these ("leading-trailing-soft-clipped-ratio", "soft-clipped-ratio-threshold") mutually exclusive? It seems like it may be possible that one cares about different values for one or the other. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For my use case there is never a time when you want to do a combined filter on both. I don't think it's useful to have them both on at the same time. We can fix it if someone ever wants to make this happen together. |
||
optional = true, | ||
mutex = {ReadFilterArgumentDefinitions.FILTER_TOO_SHORT_NAME, "soft-clipped-ratio-threshold"} | ||
) | ||
double minimumLeadingTrailingSoftClippedRatio = DEFAULT_MIN_SOFT_CLIPPED_RATIO; | ||
|
||
@Argument(fullName = ReadFilterArgumentDefinitions.DONT_REQUIRE_SOFT_CLIPS_BOTH_ENDS_NAME, | ||
doc = "Allow a read to be filtered out based on having only 1 soft-clipped block. By default, both ends must " + | ||
"have a soft-clipped block, setting this flag requires only 1 soft-clipped block", | ||
optional = true) | ||
public boolean doNotRequireSoftClipsOnBothEnds; | ||
optional = true, | ||
mutex = {"soft-clipped-ratio-threshold", "leading-trailing-soft-clipped-ratio"}) | ||
private boolean doNotRequireSoftClipsOnBothEnds; | ||
|
||
// Command line parser requires a no-arg constructor | ||
public OverclippedReadFilter() {} | ||
|
||
public OverclippedReadFilter(final int minimumSequenceLength, final boolean doNotRequireSoftClipsOnBothEnds) { | ||
this.minimumSequenceLength = minimumSequenceLength; | ||
@VisibleForTesting | ||
OverclippedReadFilter(final int minAlignedBases) { | ||
this.minAlignedBases = minAlignedBases; | ||
} | ||
|
||
@VisibleForTesting | ||
OverclippedReadFilter(final int minAlignedBases, final boolean doNotRequireSoftClipsOnBothEnds) { | ||
this.minAlignedBases = minAlignedBases; | ||
this.doNotRequireSoftClipsOnBothEnds = doNotRequireSoftClipsOnBothEnds; | ||
} | ||
|
||
@Override | ||
public boolean test(final GATKRead read) { | ||
private boolean testMinSequenceLength(final GATKRead read) { | ||
int alignedLength = 0; | ||
int softClipBlocks = 0; | ||
int minSoftClipBlocks = doNotRequireSoftClipsOnBothEnds ? 1 : 2; | ||
final int minSoftClipBlocks = doNotRequireSoftClipsOnBothEnds ? 1 : 2; | ||
CigarOperator prevOperator = null; | ||
|
||
for ( final CigarElement element : read.getCigarElements() ) { | ||
|
@@ -63,6 +90,92 @@ public boolean test(final GATKRead read) { | |
prevOperator = element.getOperator(); | ||
} | ||
|
||
return(alignedLength >= minimumSequenceLength || softClipBlocks < minSoftClipBlocks); | ||
return (alignedLength >= minAlignedBases || softClipBlocks < minSoftClipBlocks); | ||
} | ||
|
||
private boolean testMinSoftClippedRatio(final GATKRead read) { | ||
int totalLength = 0; | ||
int alignedLength = 0; | ||
int numSoftClippedBases = 0; | ||
for ( final CigarElement element : read.getCigarElements() ) { | ||
if (element.getOperator() == CigarOperator.S) { | ||
numSoftClippedBases += element.getLength(); | ||
} else if ( element.getOperator().consumesReadBases() ) { // M, I, X, and EQ (S was already accounted for above) | ||
alignedLength += element.getLength(); | ||
} | ||
|
||
totalLength += element.getLength(); | ||
} | ||
|
||
final double softClipRatio = ((double)numSoftClippedBases / (double)totalLength); | ||
|
||
logger.debug( " Num Soft Clipped Bases: " + numSoftClippedBases ); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, use suppliers or hide them behind if statements to prevent unnecessary method calls. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Debug statements are getting removed. |
||
logger.debug( " Read Length: " + read.getLength() ); | ||
logger.debug( " Read Filter status: (" + softClipRatio + " > " + minimumSoftClippedRatio + ")" ); | ||
logger.debug( " Read Filter status: " + (softClipRatio > minimumSoftClippedRatio) ); | ||
logger.debug( " Aligned Length: " + alignedLength ); | ||
|
||
return (alignedLength >= minAlignedBases) && (softClipRatio > minimumSoftClippedRatio); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should either be || or you should drop the first clause There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OBE |
||
} | ||
|
||
private boolean testMinLeadingTrailingSoftClippedRatio(final GATKRead read) { | ||
|
||
if ( read.getCigarElements().size() < 1 ) { | ||
return false; | ||
} | ||
|
||
// Get the index of the last cigar element: | ||
final int lastCigarOpIndex = read.getCigarElements().size() - 1; | ||
|
||
// Calculate the number of bases here: | ||
// Note: we don't want to double-count if the read has only 1 cigar operator. | ||
final int numLeadingTrailingSoftClippedBases = | ||
(read.getCigarElement(0).getOperator() == CigarOperator.S ? read.getCigarElement(0).getLength() : 0) | ||
+ | ||
((lastCigarOpIndex != 0) && (read.getCigarElement(lastCigarOpIndex).getOperator() == CigarOperator.S) | ||
? read.getCigarElement(lastCigarOpIndex).getLength() | ||
: 0); | ||
|
||
// Determine lengths: | ||
int totalLength = 0; | ||
int alignedLength = 0; | ||
for ( final CigarElement element : read.getCigarElements() ) { | ||
if ( (element.getOperator() != CigarOperator.S) && element.getOperator().consumesReadBases() ) { // M, I, X, and EQ (S was already accounted for above) | ||
alignedLength += element.getLength(); | ||
} | ||
totalLength += element.getLength(); | ||
} | ||
|
||
// Calculate the ratio: | ||
final double softClipRatio = ((double)numLeadingTrailingSoftClippedBases / (double)totalLength); | ||
|
||
logger.debug( " Num Leading / Trailing Soft Clipped Bases: " + numLeadingTrailingSoftClippedBases ); | ||
logger.debug( " Read Length: " + read.getLength() ); | ||
logger.debug( " Read Filter status: (" + softClipRatio + " > " + minimumLeadingTrailingSoftClippedRatio + ")" ); | ||
logger.debug( " Read Filter status: " + (softClipRatio > minimumLeadingTrailingSoftClippedRatio) ); | ||
logger.debug( " Aligned Length: " + alignedLength ); | ||
|
||
return (alignedLength >= minAlignedBases) && (softClipRatio > minimumLeadingTrailingSoftClippedRatio); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If these arguments are really mutually exclusive I wouldn't have both options (minAlignedBases, and minimumLeadingTrailingSoftClippedRatio) apply in every case. Either let them both exist at once (as they are all 3 separate tests). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OBE |
||
} | ||
|
||
@Override | ||
public boolean test(final GATKRead read) { | ||
logger.debug( "About to test read: " + read.toString()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This debug statement is going to be expensive, as it has to call read.toString() for every read which means every read gets parsed completely even if debug is off. Either use the message supplier paradigm or encase this in an if statement. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm going to remove all debug logging statements anyway - they were in for... well.. debugging. |
||
|
||
// NOTE: Since we have mutex'd the args for the clipping ratios, we only need to see if they | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a line about this behavior to the javadoc for this filter as well. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed. |
||
// have been specified. If they have, that's the filter logic we're using. | ||
|
||
// If we specified the clipping ratio, we use the min sequence length test: | ||
if ( minimumSoftClippedRatio != DEFAULT_MIN_SOFT_CLIPPED_RATIO ) { | ||
return testMinSoftClippedRatio(read); | ||
} | ||
// If we specified the leading/trailing clipping ratio, we use the min sequence length test: | ||
if ( minimumLeadingTrailingSoftClippedRatio != DEFAULT_MIN_SOFT_CLIPPED_RATIO ) { | ||
return testMinLeadingTrailingSoftClippedRatio(read); | ||
} | ||
// Default condition is to test against sequence length: | ||
else { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd say remove this else statement so it's a bit clearer. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OBE - new class, so no need for this else. |
||
return testMinSequenceLength(read); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,22 +24,131 @@ public class OverclippedReadFilterUnitTest extends GATKBaseTest { | |
|
||
private final SAMFileHeader header= ArtificialReadUtils.createArtificialSamHeaderWithGroups(CHR_COUNT, CHR_START, CHR_SIZE, GROUP_COUNT); | ||
|
||
private GATKRead buildSAMRead(final String cigarString) { | ||
final Cigar cigar = TextCigarCodec.decode(cigarString); | ||
return ArtificialReadUtils.createArtificialRead(header, cigar); | ||
} | ||
|
||
@Test(dataProvider= "OverclippedDataProvider") | ||
public void testOverclippedFilter(final String cigarString, boolean doNotRequireSoftClipsOnBothEnds, final boolean expectedResult) { | ||
public void testOverclippedFilter(final String cigarString, | ||
final boolean doNotRequireSoftClipsOnBothEnds, | ||
final boolean expectedResult) { | ||
|
||
final OverclippedReadFilter filter = new OverclippedReadFilter(30, false); | ||
filter.doNotRequireSoftClipsOnBothEnds = doNotRequireSoftClipsOnBothEnds; | ||
final OverclippedReadFilter filter = new OverclippedReadFilter(30, doNotRequireSoftClipsOnBothEnds); | ||
final GATKRead read = buildSAMRead(cigarString); | ||
Assert.assertEquals(filter.test(read), expectedResult, cigarString); | ||
} | ||
|
||
private GATKRead buildSAMRead(final String cigarString) { | ||
final Cigar cigar = TextCigarCodec.decode(cigarString); | ||
return ArtificialReadUtils.createArtificialRead(header, cigar); | ||
@Test(dataProvider= "OverclippedSoftClipRatioDataProvider") | ||
public void testOverclippedSoftClipRatioFilter(final String cigarString, | ||
final double clipRatio, | ||
final boolean expectedResult) { | ||
|
||
final OverclippedReadFilter filter = new OverclippedReadFilter(10); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good tests. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks |
||
filter.minimumSoftClippedRatio = clipRatio; | ||
|
||
final GATKRead read = buildSAMRead(cigarString); | ||
Assert.assertEquals(filter.test(read), expectedResult, cigarString); | ||
} | ||
|
||
@Test(dataProvider= "OverclippedLeadingTrailingSoftClipRatioDataProvider") | ||
public void testOverclippedLeadingTrailingSoftClipRatioFilter(final String cigarString, | ||
final double clipRatio, | ||
final boolean expectedResult) { | ||
|
||
final OverclippedReadFilter filter = new OverclippedReadFilter(10); | ||
filter.minimumLeadingTrailingSoftClippedRatio = clipRatio; | ||
|
||
final GATKRead read = buildSAMRead(cigarString); | ||
Assert.assertEquals(filter.test(read), expectedResult, cigarString); | ||
} | ||
|
||
@DataProvider(name = "OverclippedSoftClipRatioDataProvider") | ||
public Iterator<Object[]> overclippedSoftClipRatioDataProvider() { | ||
final List<Object[]> testData = new LinkedList<>(); | ||
|
||
// --------------------------------------- | ||
// Null / trivial cases: | ||
testData.add(new Object[] { "", 0.1, false }); | ||
testData.add(new Object[] { "10H", 0.1, false }); | ||
|
||
// --------------------------------------- | ||
// Min bases test: | ||
testData.add(new Object[] { "20S5M", 0.2, false }); // 20/25 = .8 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I dont think these tests actually demonstrate the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. They were showing that strings below 30 bases will return false even though by the other metric they would pass. The other tests have a smaller minAlignedBases and all pass because the threshold has been hit. These tests have since been removed anyway because of the new class structure. |
||
testData.add(new Object[] { "20S20M", 0.2, true }); // 20/40 = .5 | ||
|
||
// --------------------------------------- | ||
// Soft clip ratio test: | ||
|
||
testData.add(new Object[] { "1S1M1S17M", 0.2, false }); // 2/20 = .100 | ||
testData.add(new Object[] { "1S1M2S17M", 0.2, false }); // 3/21 = .143 | ||
testData.add(new Object[] { "1S1M3S17M", 0.2, false }); // 4/22 = .182 | ||
testData.add(new Object[] { "1S1M4S17M", 0.2, true }); // 5/23 = .217 | ||
testData.add(new Object[] { "1S1M5S17M", 0.2, true }); // 6/24 = .250 | ||
testData.add(new Object[] { "1S1M6S17M", 0.2, true }); // 7/25 = .280 | ||
|
||
// --------------------------------------- | ||
// Soft clip placement: | ||
|
||
testData.add(new Object[] { "101S100M", 0.5, true }); | ||
testData.add(new Object[] { "100M101S", 0.5, true }); | ||
testData.add(new Object[] { "25H20S10M20S10M20S10M20S10M20S10M20S25H", 0.5, true }); | ||
|
||
return testData.iterator(); | ||
} | ||
|
||
@DataProvider(name = "OverclippedLeadingTrailingSoftClipRatioDataProvider") | ||
public Iterator<Object[]> overclippedLeadingTrailingSoftClipRatioDataProvider() { | ||
final List<Object[]> testData = new LinkedList<>(); | ||
|
||
// --------------------------------------- | ||
// Null / trivial cases: | ||
testData.add(new Object[] { "", 0.1, false }); | ||
testData.add(new Object[] { "10H", 0.1, false }); | ||
|
||
// --------------------------------------- | ||
// Min bases test: | ||
testData.add(new Object[] { "20S5M", 0.2, false }); // 20/25 = .8 | ||
testData.add(new Object[] { "20S20M", 0.2, true }); // 20/40 = .5 | ||
|
||
// --------------------------------------- | ||
// Soft clip ratio test: | ||
|
||
// Non-leading/-trailing | ||
testData.add(new Object[] { "1S1M1S17M", 0.2, false }); // 2/20 = .100 | ||
testData.add(new Object[] { "1S1M2S17M", 0.2, false }); // 3/21 = .143 | ||
testData.add(new Object[] { "1S1M3S17M", 0.2, false }); // 4/22 = .182 | ||
testData.add(new Object[] { "1S1M4S17M", 0.2, false }); // 5/23 = .217 | ||
testData.add(new Object[] { "1S1M5S17M", 0.2, false }); // 6/24 = .250 | ||
testData.add(new Object[] { "1S1M6S17M", 0.2, false }); // 7/25 = .280 | ||
|
||
// Leading: | ||
testData.add(new Object[] { "2S1S1S16M", 0.2, false }); // 2/20 = .100 | ||
testData.add(new Object[] { "3S1S1S16M", 0.2, false }); // 3/21 = .143 | ||
testData.add(new Object[] { "4S1S1S16M", 0.2, false }); // 4/22 = .182 | ||
testData.add(new Object[] { "5S1S1S16M", 0.2, true }); // 5/23 = .217 | ||
testData.add(new Object[] { "6S1S1S16M", 0.2, true }); // 6/24 = .250 | ||
testData.add(new Object[] { "7S1S1S16M", 0.2, true }); // 7/25 = .280 | ||
|
||
// Trailing: | ||
testData.add(new Object[] { "1M1S16M2S", 0.2, false }); // 2/20 = .100 | ||
testData.add(new Object[] { "1M1S16M3S", 0.2, false }); // 3/21 = .143 | ||
testData.add(new Object[] { "1M1S16M4S", 0.2, false }); // 4/22 = .182 | ||
testData.add(new Object[] { "1M1S16M5S", 0.2, true }); // 5/23 = .217 | ||
testData.add(new Object[] { "1M1S16M6S", 0.2, true }); // 6/24 = .250 | ||
testData.add(new Object[] { "1M1S16M7S", 0.2, true }); // 7/25 = .280 | ||
|
||
// --------------------------------------- | ||
// Soft clip placement: | ||
|
||
testData.add(new Object[] { "101S100M", 0.5, true }); | ||
testData.add(new Object[] { "100M101S", 0.5, true }); | ||
testData.add(new Object[] { "25H20S10M20S10M20S10M20S10M20S10M20S25H", 0.5, false }); | ||
|
||
return testData.iterator(); | ||
} | ||
|
||
@DataProvider(name= "OverclippedDataProvider") | ||
@DataProvider(name = "OverclippedDataProvider") | ||
public Iterator<Object[]> overclippedDataProvider() { | ||
final List<Object[]> result = new LinkedList<>(); | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These default values get filled in on the documentation page here: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_engine_filters_OverclippedReadFilter.php meaning that you will get some godawful representation of Double.MIN_VALUE filled in on this page. I would suggest either choosing a more manageable default value like 1.0, or setting this to a
Double
object and making the default valuenull
so it shows up as such and is not confusing.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Double
/null
it is!