-
Notifications
You must be signed in to change notification settings - Fork 597
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding soft clipped ratio read filter - SoftClippedReadFilter (#5995)
* Created `SoftClippedReadFilter` * Added in capability to filter based on ratio of soft clipped bases. * Added in filter for leading/trailing soft clips only.
- Loading branch information
1 parent
730164d
commit 744987b
Showing
5 changed files
with
297 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
127 changes: 127 additions & 0 deletions
127
src/main/java/org/broadinstitute/hellbender/engine/filters/SoftClippedReadFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
package org.broadinstitute.hellbender.engine.filters; | ||
|
||
import com.google.common.annotations.VisibleForTesting; | ||
import htsjdk.samtools.CigarElement; | ||
import htsjdk.samtools.CigarOperator; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.help.DocumentedFeature; | ||
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.utils.help.HelpConstants; | ||
import org.broadinstitute.hellbender.utils.read.GATKRead; | ||
|
||
/** | ||
* Filter out reads where the ratio of soft-clipped bases to total bases exceeds some given value. | ||
* | ||
* Can choose to filter by number of soft-clipped bases, or by the leading/trailing soft-clipped bases only. | ||
*/ | ||
@DocumentedFeature( | ||
groupName=HelpConstants.DOC_CAT_READFILTERS, | ||
groupSummary=HelpConstants.DOC_CAT_READFILTERS_SUMMARY, | ||
summary = "Filter out reads that are over-soft-clipped") | ||
public final class SoftClippedReadFilter extends ReadFilter { | ||
static final long serialVersionUID = 1L; | ||
private final Logger logger = LogManager.getLogger(this.getClass()); | ||
|
||
@VisibleForTesting | ||
@Argument(fullName = ReadFilterArgumentDefinitions.INVERT_SOFT_CLIP_RATIO_FILTER, | ||
doc = "Inverts the results from this filter, causing all variants that would pass to fail and visa-versa.", | ||
optional = true | ||
) | ||
boolean doInvertFilter = false; | ||
|
||
@VisibleForTesting | ||
@Argument(fullName = ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD, | ||
doc = "Threshold ratio of soft clipped bases (anywhere in the cigar string) to total bases in read for read to be filtered.", | ||
optional = true, | ||
mutex = { ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD } | ||
) | ||
Double minimumSoftClippedRatio = null; | ||
|
||
@VisibleForTesting | ||
@Argument(fullName = ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD, | ||
doc = "Threshold ratio of soft clipped bases (leading / trailing the cigar string) to total bases in read for read to be filtered.", | ||
optional = true, | ||
mutex = {ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD} | ||
) | ||
Double minimumLeadingTrailingSoftClippedRatio = null; | ||
|
||
// Command line parser requires a no-arg constructor | ||
public SoftClippedReadFilter() {} | ||
|
||
private boolean testMinSoftClippedRatio(final GATKRead read) { | ||
int totalLength = 0; | ||
int numSoftClippedBases = 0; | ||
for ( final CigarElement element : read.getCigarElements() ) { | ||
if (element.getOperator() == CigarOperator.S) { | ||
numSoftClippedBases += element.getLength(); | ||
} | ||
totalLength += element.getLength(); | ||
} | ||
|
||
final double softClipRatio = ((double)numSoftClippedBases / (double)totalLength); | ||
|
||
return softClipRatio > minimumSoftClippedRatio; | ||
} | ||
|
||
private boolean testMinLeadingTrailingSoftClippedRatio(final GATKRead read) { | ||
|
||
if ( read.getCigarElements().size() < 1 ) { | ||
return false; | ||
} | ||
|
||
// Get the index of the last cigar element: | ||
final int lastCigarOpIndex = read.getCigarElements().size() - 1; | ||
|
||
// Calculate the number of bases here: | ||
// Note: we don't want to double-count if the read has only 1 cigar operator. | ||
final int numLeadingTrailingSoftClippedBases = | ||
(read.getCigarElement(0).getOperator() == CigarOperator.S ? read.getCigarElement(0).getLength() : 0) | ||
+ | ||
((lastCigarOpIndex != 0) && (read.getCigarElement(lastCigarOpIndex).getOperator() == CigarOperator.S) | ||
? read.getCigarElement(lastCigarOpIndex).getLength() | ||
: 0); | ||
|
||
// Determine length: | ||
final int totalLength = read.getCigarElements().stream() | ||
.mapToInt( CigarElement::getLength ) | ||
.sum(); | ||
|
||
// Calculate the ratio: | ||
final double softClipRatio = ((double)numLeadingTrailingSoftClippedBases / (double)totalLength); | ||
|
||
return softClipRatio > minimumLeadingTrailingSoftClippedRatio; | ||
} | ||
|
||
@Override | ||
public boolean test(final GATKRead read) { | ||
|
||
final boolean result; | ||
|
||
// NOTE: Since we have mutex'd the args for the clipping ratios, we only need to see if they | ||
// have been specified. If they have, that's the filter logic we're using. | ||
// If we specified the clipping ratio, we use the min sequence length test: | ||
if ( minimumSoftClippedRatio != null ) { | ||
result = testMinSoftClippedRatio(read); | ||
} | ||
// If we specified the leading/trailing clipping ratio, we use the min sequence length test: | ||
else if ( minimumLeadingTrailingSoftClippedRatio != null ) { | ||
result = testMinLeadingTrailingSoftClippedRatio(read); | ||
} | ||
else { | ||
throw new UserException( | ||
"Must provide one of the following arguments: " + | ||
ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD + "," + | ||
ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD | ||
); | ||
} | ||
|
||
// Check for if we want to invert our results: | ||
if ( doInvertFilter ) { | ||
return !result; | ||
} | ||
return result; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
139 changes: 139 additions & 0 deletions
139
...test/java/org/broadinstitute/hellbender/engine/filters/SoftClippedReadFilterUnitTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
package org.broadinstitute.hellbender.engine.filters; | ||
|
||
import htsjdk.samtools.Cigar; | ||
import htsjdk.samtools.SAMFileHeader; | ||
import htsjdk.samtools.TextCigarCodec; | ||
import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; | ||
import org.broadinstitute.hellbender.utils.read.GATKRead; | ||
import org.testng.Assert; | ||
import org.testng.annotations.DataProvider; | ||
import org.testng.annotations.Test; | ||
|
||
import java.util.Iterator; | ||
import java.util.LinkedList; | ||
import java.util.List; | ||
|
||
/** | ||
* Unit tests for the {@link SoftClippedReadFilter} class. | ||
* Created by jonn on 6/13/19. | ||
*/ | ||
public class SoftClippedReadFilterUnitTest { | ||
|
||
private static final int CHR_COUNT = 1; | ||
private static final int CHR_START = 1; | ||
private static final int CHR_SIZE = 1000; | ||
private static final int GROUP_COUNT = 5; | ||
|
||
private final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeaderWithGroups(CHR_COUNT, CHR_START, CHR_SIZE, GROUP_COUNT); | ||
|
||
private GATKRead buildSAMRead(final String cigarString) { | ||
final Cigar cigar = TextCigarCodec.decode(cigarString); | ||
return ArtificialReadUtils.createArtificialRead(header, cigar); | ||
} | ||
|
||
@Test(dataProvider= "SoftClipRatioDataProvider") | ||
public void testOverclippedSoftClipRatioFilter(final String cigarString, | ||
final double clipRatio, | ||
final boolean expectedResult) { | ||
|
||
final SoftClippedReadFilter filter = new SoftClippedReadFilter(); | ||
filter.minimumSoftClippedRatio = clipRatio; | ||
|
||
final GATKRead read = buildSAMRead(cigarString); | ||
Assert.assertEquals(filter.test(read), expectedResult, cigarString); | ||
|
||
filter.doInvertFilter = true; | ||
Assert.assertEquals(filter.test(read), !expectedResult, "Inverted case: " + cigarString); | ||
} | ||
|
||
@Test(dataProvider= "SoftClippedLeadingTrailingRatioDataProvider") | ||
public void testSoftClippedLeadingTrailingRatioFilter(final String cigarString, | ||
final double clipRatio, | ||
final boolean expectedResult) { | ||
|
||
final SoftClippedReadFilter filter = new SoftClippedReadFilter(); | ||
filter.minimumLeadingTrailingSoftClippedRatio = clipRatio; | ||
|
||
final GATKRead read = buildSAMRead(cigarString); | ||
Assert.assertEquals(filter.test(read), expectedResult, cigarString); | ||
|
||
filter.doInvertFilter = true; | ||
Assert.assertEquals(filter.test(read), !expectedResult, "Inverted case: " + cigarString); | ||
} | ||
|
||
@DataProvider(name = "SoftClipRatioDataProvider") | ||
public Iterator<Object[]> softClipRatioDataProvider() { | ||
final List<Object[]> testData = new LinkedList<>(); | ||
|
||
// --------------------------------------- | ||
// Null / trivial cases: | ||
testData.add(new Object[] { "", 0.1, false }); | ||
testData.add(new Object[] { "10H", 0.1, false }); | ||
|
||
// --------------------------------------- | ||
// Soft clip ratio test: | ||
|
||
testData.add(new Object[] { "1S1M1S17M", 0.2, false }); // 2/20 = .100 | ||
testData.add(new Object[] { "1S1M2S17M", 0.2, false }); // 3/21 = .143 | ||
testData.add(new Object[] { "1S1M3S17M", 0.2, false }); // 4/22 = .182 | ||
testData.add(new Object[] { "1S1M4S17M", 0.2, true }); // 5/23 = .217 | ||
testData.add(new Object[] { "1S1M5S17M", 0.2, true }); // 6/24 = .250 | ||
testData.add(new Object[] { "1S1M6S17M", 0.2, true }); // 7/25 = .280 | ||
|
||
// --------------------------------------- | ||
// Soft clip placement: | ||
|
||
testData.add(new Object[] { "101S100M", 0.5, true }); | ||
testData.add(new Object[] { "100M101S", 0.5, true }); | ||
testData.add(new Object[] { "25H20S10M20S10M20S10M20S10M20S10M20S25H", 0.5, true }); | ||
|
||
return testData.iterator(); | ||
} | ||
|
||
@DataProvider(name = "SoftClippedLeadingTrailingRatioDataProvider") | ||
public Iterator<Object[]> softClippedLeadingTrailingRatioDataProvider() { | ||
final List<Object[]> testData = new LinkedList<>(); | ||
|
||
// --------------------------------------- | ||
// Null / trivial cases: | ||
testData.add(new Object[] { "", 0.1, false }); | ||
testData.add(new Object[] { "10H", 0.1, false }); | ||
|
||
// --------------------------------------- | ||
// Soft clip ratio test: | ||
|
||
// Non-leading/-trailing | ||
testData.add(new Object[] { "1S1M1S17M", 0.2, false }); // 2/20 = .100 | ||
testData.add(new Object[] { "1S1M2S17M", 0.2, false }); // 3/21 = .143 | ||
testData.add(new Object[] { "1S1M3S17M", 0.2, false }); // 4/22 = .182 | ||
testData.add(new Object[] { "1S1M4S17M", 0.2, false }); // 5/23 = .217 | ||
testData.add(new Object[] { "1S1M5S17M", 0.2, false }); // 6/24 = .250 | ||
testData.add(new Object[] { "1S1M6S17M", 0.2, false }); // 7/25 = .280 | ||
|
||
// Leading: | ||
testData.add(new Object[] { "2S1S1S16M", 0.2, false }); // 2/20 = .100 | ||
testData.add(new Object[] { "3S1S1S16M", 0.2, false }); // 3/21 = .143 | ||
testData.add(new Object[] { "4S1S1S16M", 0.2, false }); // 4/22 = .182 | ||
testData.add(new Object[] { "5S1S1S16M", 0.2, true }); // 5/23 = .217 | ||
testData.add(new Object[] { "6S1S1S16M", 0.2, true }); // 6/24 = .250 | ||
testData.add(new Object[] { "7S1S1S16M", 0.2, true }); // 7/25 = .280 | ||
|
||
// Trailing: | ||
testData.add(new Object[] { "1M1S16M2S", 0.2, false }); // 2/20 = .100 | ||
testData.add(new Object[] { "1M1S16M3S", 0.2, false }); // 3/21 = .143 | ||
testData.add(new Object[] { "1M1S16M4S", 0.2, false }); // 4/22 = .182 | ||
testData.add(new Object[] { "1M1S16M5S", 0.2, true }); // 5/23 = .217 | ||
testData.add(new Object[] { "1M1S16M6S", 0.2, true }); // 6/24 = .250 | ||
testData.add(new Object[] { "1M1S16M7S", 0.2, true }); // 7/25 = .280 | ||
|
||
// --------------------------------------- | ||
// Soft clip placement: | ||
|
||
testData.add(new Object[] { "101S100M", 0.5, true }); | ||
testData.add(new Object[] { "100M101S", 0.5, true }); | ||
testData.add(new Object[] { "25H20S10M20S10M20S10M20S10M20S10M20S25H", 0.5, false }); | ||
|
||
return testData.iterator(); | ||
} | ||
|
||
} |