-
Notifications
You must be signed in to change notification settings - Fork 596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding soft clipped ratio to OverclippedReadFilter #5995
Merged
Merged
Changes from 3 commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
09d66d5
Added in capability to filter based on ratio of soft clipped bases.
jonn-smith 8a5a370
Added in filter for leading/trailing soft clips only.
jonn-smith 9e3f0c2
Refactored filter into new class: SoftClippedReadFilter
jonn-smith a157452
Addressing round 2 comments.
jonn-smith 019f990
Fixing silly problem.
jonn-smith File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
127 changes: 127 additions & 0 deletions
127
src/main/java/org/broadinstitute/hellbender/engine/filters/SoftClippedReadFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
package org.broadinstitute.hellbender.engine.filters; | ||
|
||
import com.google.common.annotations.VisibleForTesting; | ||
import htsjdk.samtools.CigarElement; | ||
import htsjdk.samtools.CigarOperator; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.help.DocumentedFeature; | ||
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.utils.help.HelpConstants; | ||
import org.broadinstitute.hellbender.utils.read.GATKRead; | ||
|
||
/** | ||
* Filter out reads where the ratio of soft-clipped bases to total bases exceeds some given value. | ||
* | ||
* Can choose to filter by number of soft-clipped bases, or by the leading/trailing soft-clipped bases only. | ||
*/ | ||
@DocumentedFeature( | ||
groupName=HelpConstants.DOC_CAT_READFILTERS, | ||
groupSummary=HelpConstants.DOC_CAT_READFILTERS_SUMMARY, | ||
summary = "Filter out reads that are over-soft-clipped") | ||
public final class SoftClippedReadFilter extends ReadFilter { | ||
static final long serialVersionUID = 1L; | ||
private final Logger logger = LogManager.getLogger(this.getClass()); | ||
|
||
@VisibleForTesting | ||
@Argument(fullName = ReadFilterArgumentDefinitions.INVERT_FILTER, | ||
doc = "Inverts the results from this filter, causing all variants that would pass to fail and visa-versa.", | ||
optional = true | ||
) | ||
boolean doInvertFilter = false; | ||
|
||
@VisibleForTesting | ||
@Argument(fullName = ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD, | ||
doc = "Threshold ratio of soft clipped bases (anywhere in the cigar string) to total bases in read for read to be filtered.", | ||
optional = true, | ||
mutex = { ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD } | ||
) | ||
Double minimumSoftClippedRatio = null; | ||
|
||
@VisibleForTesting | ||
@Argument(fullName = ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD, | ||
doc = "Threshold ratio of soft clipped bases (leading / trailing the cigar string) to total bases in read for read to be filtered.", | ||
optional = true, | ||
mutex = {ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD} | ||
) | ||
Double minimumLeadingTrailingSoftClippedRatio = null; | ||
|
||
// Command line parser requires a no-arg constructor | ||
public SoftClippedReadFilter() {} | ||
|
||
private boolean testMinSoftClippedRatio(final GATKRead read) { | ||
int totalLength = 0; | ||
int numSoftClippedBases = 0; | ||
for ( final CigarElement element : read.getCigarElements() ) { | ||
if (element.getOperator() == CigarOperator.S) { | ||
numSoftClippedBases += element.getLength(); | ||
} | ||
totalLength += element.getLength(); | ||
} | ||
|
||
final double softClipRatio = ((double)numSoftClippedBases / (double)totalLength); | ||
|
||
return softClipRatio > minimumSoftClippedRatio; | ||
} | ||
|
||
private boolean testMinLeadingTrailingSoftClippedRatio(final GATKRead read) { | ||
|
||
if ( read.getCigarElements().size() < 1 ) { | ||
return false; | ||
} | ||
|
||
// Get the index of the last cigar element: | ||
final int lastCigarOpIndex = read.getCigarElements().size() - 1; | ||
|
||
// Calculate the number of bases here: | ||
// Note: we don't want to double-count if the read has only 1 cigar operator. | ||
final int numLeadingTrailingSoftClippedBases = | ||
(read.getCigarElement(0).getOperator() == CigarOperator.S ? read.getCigarElement(0).getLength() : 0) | ||
+ | ||
((lastCigarOpIndex != 0) && (read.getCigarElement(lastCigarOpIndex).getOperator() == CigarOperator.S) | ||
? read.getCigarElement(lastCigarOpIndex).getLength() | ||
: 0); | ||
|
||
// Determine length: | ||
final int totalLength = read.getCigarElements().stream() | ||
.mapToInt( CigarElement::getLength ) | ||
.sum(); | ||
|
||
// Calculate the ratio: | ||
final double softClipRatio = ((double)numLeadingTrailingSoftClippedBases / (double)totalLength); | ||
|
||
return softClipRatio > minimumLeadingTrailingSoftClippedRatio; | ||
} | ||
|
||
@Override | ||
public boolean test(final GATKRead read) { | ||
|
||
final boolean result; | ||
|
||
// NOTE: Since we have mutex'd the args for the clipping ratios, we only need to see if they | ||
// have been specified. If they have, that's the filter logic we're using. | ||
// If we specified the clipping ratio, we use the min sequence length test: | ||
if ( minimumSoftClippedRatio != null ) { | ||
result = testMinSoftClippedRatio(read); | ||
} | ||
// If we specified the leading/trailing clipping ratio, we use the min sequence length test: | ||
else if ( minimumLeadingTrailingSoftClippedRatio != null ) { | ||
result = testMinLeadingTrailingSoftClippedRatio(read); | ||
} | ||
else { | ||
throw new UserException( | ||
"Must provide one of the following arguments: " + | ||
ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD + "," + | ||
ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD | ||
); | ||
} | ||
|
||
// Check for if we want to invert our results: | ||
if ( doInvertFilter ) { | ||
return !result; | ||
} | ||
return result; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
139 changes: 139 additions & 0 deletions
139
...test/java/org/broadinstitute/hellbender/engine/filters/SoftClippedReadFilterUnitTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
package org.broadinstitute.hellbender.engine.filters; | ||
|
||
import htsjdk.samtools.Cigar; | ||
import htsjdk.samtools.SAMFileHeader; | ||
import htsjdk.samtools.TextCigarCodec; | ||
import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; | ||
import org.broadinstitute.hellbender.utils.read.GATKRead; | ||
import org.testng.Assert; | ||
import org.testng.annotations.DataProvider; | ||
import org.testng.annotations.Test; | ||
|
||
import java.util.Iterator; | ||
import java.util.LinkedList; | ||
import java.util.List; | ||
|
||
/** | ||
* Unit tests for the {@link SoftClippedReadFilter} class. | ||
* Created by jonn on 6/13/19. | ||
*/ | ||
public class SoftClippedReadFilterUnitTest { | ||
|
||
private static final int CHR_COUNT = 1; | ||
private static final int CHR_START = 1; | ||
private static final int CHR_SIZE = 1000; | ||
private static final int GROUP_COUNT = 5; | ||
|
||
private final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeaderWithGroups(CHR_COUNT, CHR_START, CHR_SIZE, GROUP_COUNT); | ||
|
||
private GATKRead buildSAMRead(final String cigarString) { | ||
final Cigar cigar = TextCigarCodec.decode(cigarString); | ||
return ArtificialReadUtils.createArtificialRead(header, cigar); | ||
} | ||
|
||
@Test(dataProvider= "SoftClipRatioDataProvider") | ||
public void testOverclippedSoftClipRatioFilter(final String cigarString, | ||
final double clipRatio, | ||
final boolean expectedResult) { | ||
|
||
final SoftClippedReadFilter filter = new SoftClippedReadFilter(); | ||
filter.minimumSoftClippedRatio = clipRatio; | ||
|
||
final GATKRead read = buildSAMRead(cigarString); | ||
Assert.assertEquals(filter.test(read), expectedResult, cigarString); | ||
|
||
filter.doInvertFilter = true; | ||
Assert.assertEquals(filter.test(read), !expectedResult, "Inverted case: " + cigarString); | ||
} | ||
|
||
@Test(dataProvider= "SoftClippedLeadingTrailingRatioDataProvider") | ||
public void testSoftClippedLeadingTrailingRatioFilter(final String cigarString, | ||
final double clipRatio, | ||
final boolean expectedResult) { | ||
|
||
final SoftClippedReadFilter filter = new SoftClippedReadFilter(); | ||
filter.minimumLeadingTrailingSoftClippedRatio = clipRatio; | ||
|
||
final GATKRead read = buildSAMRead(cigarString); | ||
Assert.assertEquals(filter.test(read), expectedResult, cigarString); | ||
|
||
filter.doInvertFilter = true; | ||
Assert.assertEquals(filter.test(read), !expectedResult, "Inverted case: " + cigarString); | ||
} | ||
|
||
@DataProvider(name = "SoftClipRatioDataProvider") | ||
public Iterator<Object[]> softClipRatioDataProvider() { | ||
final List<Object[]> testData = new LinkedList<>(); | ||
|
||
// --------------------------------------- | ||
// Null / trivial cases: | ||
testData.add(new Object[] { "", 0.1, false }); | ||
testData.add(new Object[] { "10H", 0.1, false }); | ||
|
||
// --------------------------------------- | ||
// Soft clip ratio test: | ||
|
||
testData.add(new Object[] { "1S1M1S17M", 0.2, false }); // 2/20 = .100 | ||
testData.add(new Object[] { "1S1M2S17M", 0.2, false }); // 3/21 = .143 | ||
testData.add(new Object[] { "1S1M3S17M", 0.2, false }); // 4/22 = .182 | ||
testData.add(new Object[] { "1S1M4S17M", 0.2, true }); // 5/23 = .217 | ||
testData.add(new Object[] { "1S1M5S17M", 0.2, true }); // 6/24 = .250 | ||
testData.add(new Object[] { "1S1M6S17M", 0.2, true }); // 7/25 = .280 | ||
|
||
// --------------------------------------- | ||
// Soft clip placement: | ||
|
||
testData.add(new Object[] { "101S100M", 0.5, true }); | ||
testData.add(new Object[] { "100M101S", 0.5, true }); | ||
testData.add(new Object[] { "25H20S10M20S10M20S10M20S10M20S10M20S25H", 0.5, true }); | ||
|
||
return testData.iterator(); | ||
} | ||
|
||
@DataProvider(name = "SoftClippedLeadingTrailingRatioDataProvider") | ||
public Iterator<Object[]> softClippedLeadingTrailingRatioDataProvider() { | ||
final List<Object[]> testData = new LinkedList<>(); | ||
|
||
// --------------------------------------- | ||
// Null / trivial cases: | ||
testData.add(new Object[] { "", 0.1, false }); | ||
testData.add(new Object[] { "10H", 0.1, false }); | ||
|
||
// --------------------------------------- | ||
// Soft clip ratio test: | ||
|
||
// Non-leading/-trailing | ||
testData.add(new Object[] { "1S1M1S17M", 0.2, false }); // 2/20 = .100 | ||
testData.add(new Object[] { "1S1M2S17M", 0.2, false }); // 3/21 = .143 | ||
testData.add(new Object[] { "1S1M3S17M", 0.2, false }); // 4/22 = .182 | ||
testData.add(new Object[] { "1S1M4S17M", 0.2, false }); // 5/23 = .217 | ||
testData.add(new Object[] { "1S1M5S17M", 0.2, false }); // 6/24 = .250 | ||
testData.add(new Object[] { "1S1M6S17M", 0.2, false }); // 7/25 = .280 | ||
|
||
// Leading: | ||
testData.add(new Object[] { "2S1S1S16M", 0.2, false }); // 2/20 = .100 | ||
testData.add(new Object[] { "3S1S1S16M", 0.2, false }); // 3/21 = .143 | ||
testData.add(new Object[] { "4S1S1S16M", 0.2, false }); // 4/22 = .182 | ||
testData.add(new Object[] { "5S1S1S16M", 0.2, true }); // 5/23 = .217 | ||
testData.add(new Object[] { "6S1S1S16M", 0.2, true }); // 6/24 = .250 | ||
testData.add(new Object[] { "7S1S1S16M", 0.2, true }); // 7/25 = .280 | ||
|
||
// Trailing: | ||
testData.add(new Object[] { "1M1S16M2S", 0.2, false }); // 2/20 = .100 | ||
testData.add(new Object[] { "1M1S16M3S", 0.2, false }); // 3/21 = .143 | ||
testData.add(new Object[] { "1M1S16M4S", 0.2, false }); // 4/22 = .182 | ||
testData.add(new Object[] { "1M1S16M5S", 0.2, true }); // 5/23 = .217 | ||
testData.add(new Object[] { "1M1S16M6S", 0.2, true }); // 6/24 = .250 | ||
testData.add(new Object[] { "1M1S16M7S", 0.2, true }); // 7/25 = .280 | ||
|
||
// --------------------------------------- | ||
// Soft clip placement: | ||
|
||
testData.add(new Object[] { "101S100M", 0.5, true }); | ||
testData.add(new Object[] { "100M101S", 0.5, true }); | ||
testData.add(new Object[] { "25H20S10M20S10M20S10M20S10M20S10M20S25H", 0.5, false }); | ||
|
||
return testData.iterator(); | ||
} | ||
|
||
} |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would make this "invert-soft-clip-ratio" or something like that to make it a little clearer that its associated with THIS filter not just any filter on the command line
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure. Really, this should be a feature of every reader. I'll add a ticket for it.