diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java b/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java index b56c53aab36..6cc55e9a165 100644 --- a/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java @@ -48,4 +48,9 @@ private ReadFilterArgumentDefinitions(){} public static final String SAMPLE_NAME = "sample"; public static final String KEEP_INTERVAL_NAME = "keep-intervals"; + + public static final String SOFT_CLIPPED_RATIO_THRESHOLD = "soft-clipped-ratio-threshold"; + public static final String SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD = "soft-clipped-leading-trailing-ratio"; + + public static final String INVERT_SOFT_CLIP_RATIO_FILTER = "invert-soft-clip-ratio-filter"; } diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/OverclippedReadFilter.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/OverclippedReadFilter.java index 1b14f39632a..26a3f2750d9 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/filters/OverclippedReadFilter.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/OverclippedReadFilter.java @@ -1,7 +1,10 @@ package org.broadinstitute.hellbender.engine.filters; +import com.google.common.annotations.VisibleForTesting; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions; @@ -20,26 +23,32 @@ *

Note: Consecutive soft-clipped blocks are treated as a single block. For example, 1S2S10M1S2S is treated as 3S10M3S

*/ @DocumentedFeature(groupName= HelpConstants.DOC_CAT_READFILTERS, groupSummary=HelpConstants.DOC_CAT_READFILTERS_SUMMARY, summary = "Filter out reads that are over-soft-clipped") -public final class OverclippedReadFilter extends ReadFilter{ - +public final class OverclippedReadFilter extends ReadFilter { static final long serialVersionUID = 1L; + private final Logger logger = LogManager.getLogger(this.getClass()); @Argument(fullName = ReadFilterArgumentDefinitions.FILTER_TOO_SHORT_NAME, doc = "Minimum number of aligned bases", optional = true) - public int minimumSequenceLength = 30; + private int minAlignedBases = 30; @Argument(fullName = ReadFilterArgumentDefinitions.DONT_REQUIRE_SOFT_CLIPS_BOTH_ENDS_NAME, doc = "Allow a read to be filtered out based on having only 1 soft-clipped block. By default, both ends must " + "have a soft-clipped block, setting this flag requires only 1 soft-clipped block", optional = true) - public boolean doNotRequireSoftClipsOnBothEnds; + private boolean doNotRequireSoftClipsOnBothEnds; // Command line parser requires a no-arg constructor public OverclippedReadFilter() {} - public OverclippedReadFilter(final int minimumSequenceLength, final boolean doNotRequireSoftClipsOnBothEnds) { - this.minimumSequenceLength = minimumSequenceLength; + @VisibleForTesting + OverclippedReadFilter(final int minAlignedBases) { + this.minAlignedBases = minAlignedBases; + } + + @VisibleForTesting + OverclippedReadFilter(final int minAlignedBases, final boolean doNotRequireSoftClipsOnBothEnds) { + this.minAlignedBases = minAlignedBases; this.doNotRequireSoftClipsOnBothEnds = doNotRequireSoftClipsOnBothEnds; } @@ -47,7 +56,7 @@ public OverclippedReadFilter(final int minimumSequenceLength, final boolean doNo public boolean test(final GATKRead read) { int alignedLength = 0; int softClipBlocks = 0; - int minSoftClipBlocks = doNotRequireSoftClipsOnBothEnds ? 1 : 2; + final int minSoftClipBlocks = doNotRequireSoftClipsOnBothEnds ? 1 : 2; CigarOperator prevOperator = null; for ( final CigarElement element : read.getCigarElements() ) { @@ -63,6 +72,6 @@ public boolean test(final GATKRead read) { prevOperator = element.getOperator(); } - return(alignedLength >= minimumSequenceLength || softClipBlocks < minSoftClipBlocks); + return (alignedLength >= minAlignedBases || softClipBlocks < minSoftClipBlocks); } } diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/SoftClippedReadFilter.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/SoftClippedReadFilter.java new file mode 100644 index 00000000000..fa0421255fa --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/SoftClippedReadFilter.java @@ -0,0 +1,127 @@ +package org.broadinstitute.hellbender.engine.filters; + +import com.google.common.annotations.VisibleForTesting; +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.CigarOperator; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.help.HelpConstants; +import org.broadinstitute.hellbender.utils.read.GATKRead; + +/** + * Filter out reads where the ratio of soft-clipped bases to total bases exceeds some given value. + * + * Can choose to filter by number of soft-clipped bases, or by the leading/trailing soft-clipped bases only. + */ +@DocumentedFeature( + groupName=HelpConstants.DOC_CAT_READFILTERS, + groupSummary=HelpConstants.DOC_CAT_READFILTERS_SUMMARY, + summary = "Filter out reads that are over-soft-clipped") +public final class SoftClippedReadFilter extends ReadFilter { + static final long serialVersionUID = 1L; + private final Logger logger = LogManager.getLogger(this.getClass()); + + @VisibleForTesting + @Argument(fullName = ReadFilterArgumentDefinitions.INVERT_SOFT_CLIP_RATIO_FILTER, + doc = "Inverts the results from this filter, causing all variants that would pass to fail and visa-versa.", + optional = true + ) + boolean doInvertFilter = false; + + @VisibleForTesting + @Argument(fullName = ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD, + doc = "Threshold ratio of soft clipped bases (anywhere in the cigar string) to total bases in read for read to be filtered.", + optional = true, + mutex = { ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD } + ) + Double minimumSoftClippedRatio = null; + + @VisibleForTesting + @Argument(fullName = ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD, + doc = "Threshold ratio of soft clipped bases (leading / trailing the cigar string) to total bases in read for read to be filtered.", + optional = true, + mutex = {ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD} + ) + Double minimumLeadingTrailingSoftClippedRatio = null; + + // Command line parser requires a no-arg constructor + public SoftClippedReadFilter() {} + + private boolean testMinSoftClippedRatio(final GATKRead read) { + int totalLength = 0; + int numSoftClippedBases = 0; + for ( final CigarElement element : read.getCigarElements() ) { + if (element.getOperator() == CigarOperator.S) { + numSoftClippedBases += element.getLength(); + } + totalLength += element.getLength(); + } + + final double softClipRatio = ((double)numSoftClippedBases / (double)totalLength); + + return softClipRatio > minimumSoftClippedRatio; + } + + private boolean testMinLeadingTrailingSoftClippedRatio(final GATKRead read) { + + if ( read.getCigarElements().size() < 1 ) { + return false; + } + + // Get the index of the last cigar element: + final int lastCigarOpIndex = read.getCigarElements().size() - 1; + + // Calculate the number of bases here: + // Note: we don't want to double-count if the read has only 1 cigar operator. + final int numLeadingTrailingSoftClippedBases = + (read.getCigarElement(0).getOperator() == CigarOperator.S ? read.getCigarElement(0).getLength() : 0) + + + ((lastCigarOpIndex != 0) && (read.getCigarElement(lastCigarOpIndex).getOperator() == CigarOperator.S) + ? read.getCigarElement(lastCigarOpIndex).getLength() + : 0); + + // Determine length: + final int totalLength = read.getCigarElements().stream() + .mapToInt( CigarElement::getLength ) + .sum(); + + // Calculate the ratio: + final double softClipRatio = ((double)numLeadingTrailingSoftClippedBases / (double)totalLength); + + return softClipRatio > minimumLeadingTrailingSoftClippedRatio; + } + + @Override + public boolean test(final GATKRead read) { + + final boolean result; + + // NOTE: Since we have mutex'd the args for the clipping ratios, we only need to see if they + // have been specified. If they have, that's the filter logic we're using. + // If we specified the clipping ratio, we use the min sequence length test: + if ( minimumSoftClippedRatio != null ) { + result = testMinSoftClippedRatio(read); + } + // If we specified the leading/trailing clipping ratio, we use the min sequence length test: + else if ( minimumLeadingTrailingSoftClippedRatio != null ) { + result = testMinLeadingTrailingSoftClippedRatio(read); + } + else { + throw new UserException( + "Must provide one of the following arguments: " + + ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD + "," + + ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD + ); + } + + // Check for if we want to invert our results: + if ( doInvertFilter ) { + return !result; + } + return result; + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/engine/filters/OverclippedReadFilterUnitTest.java b/src/test/java/org/broadinstitute/hellbender/engine/filters/OverclippedReadFilterUnitTest.java index b4e761c7764..9c26ed7c3f1 100644 --- a/src/test/java/org/broadinstitute/hellbender/engine/filters/OverclippedReadFilterUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/engine/filters/OverclippedReadFilterUnitTest.java @@ -24,22 +24,22 @@ public class OverclippedReadFilterUnitTest extends GATKBaseTest { private final SAMFileHeader header= ArtificialReadUtils.createArtificialSamHeaderWithGroups(CHR_COUNT, CHR_START, CHR_SIZE, GROUP_COUNT); + private GATKRead buildSAMRead(final String cigarString) { + final Cigar cigar = TextCigarCodec.decode(cigarString); + return ArtificialReadUtils.createArtificialRead(header, cigar); + } @Test(dataProvider= "OverclippedDataProvider") - public void testOverclippedFilter(final String cigarString, boolean doNotRequireSoftClipsOnBothEnds, final boolean expectedResult) { + public void testOverclippedFilter(final String cigarString, + final boolean doNotRequireSoftClipsOnBothEnds, + final boolean expectedResult) { - final OverclippedReadFilter filter = new OverclippedReadFilter(30, false); - filter.doNotRequireSoftClipsOnBothEnds = doNotRequireSoftClipsOnBothEnds; + final OverclippedReadFilter filter = new OverclippedReadFilter(30, doNotRequireSoftClipsOnBothEnds); final GATKRead read = buildSAMRead(cigarString); Assert.assertEquals(filter.test(read), expectedResult, cigarString); } - private GATKRead buildSAMRead(final String cigarString) { - final Cigar cigar = TextCigarCodec.decode(cigarString); - return ArtificialReadUtils.createArtificialRead(header, cigar); - } - - @DataProvider(name= "OverclippedDataProvider") + @DataProvider(name = "OverclippedDataProvider") public Iterator overclippedDataProvider() { final List result = new LinkedList<>(); diff --git a/src/test/java/org/broadinstitute/hellbender/engine/filters/SoftClippedReadFilterUnitTest.java b/src/test/java/org/broadinstitute/hellbender/engine/filters/SoftClippedReadFilterUnitTest.java new file mode 100644 index 00000000000..668a1c62a92 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/engine/filters/SoftClippedReadFilterUnitTest.java @@ -0,0 +1,139 @@ +package org.broadinstitute.hellbender.engine.filters; + +import htsjdk.samtools.Cigar; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.TextCigarCodec; +import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * Unit tests for the {@link SoftClippedReadFilter} class. + * Created by jonn on 6/13/19. + */ +public class SoftClippedReadFilterUnitTest { + + private static final int CHR_COUNT = 1; + private static final int CHR_START = 1; + private static final int CHR_SIZE = 1000; + private static final int GROUP_COUNT = 5; + + private final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeaderWithGroups(CHR_COUNT, CHR_START, CHR_SIZE, GROUP_COUNT); + + private GATKRead buildSAMRead(final String cigarString) { + final Cigar cigar = TextCigarCodec.decode(cigarString); + return ArtificialReadUtils.createArtificialRead(header, cigar); + } + + @Test(dataProvider= "SoftClipRatioDataProvider") + public void testOverclippedSoftClipRatioFilter(final String cigarString, + final double clipRatio, + final boolean expectedResult) { + + final SoftClippedReadFilter filter = new SoftClippedReadFilter(); + filter.minimumSoftClippedRatio = clipRatio; + + final GATKRead read = buildSAMRead(cigarString); + Assert.assertEquals(filter.test(read), expectedResult, cigarString); + + filter.doInvertFilter = true; + Assert.assertEquals(filter.test(read), !expectedResult, "Inverted case: " + cigarString); + } + + @Test(dataProvider= "SoftClippedLeadingTrailingRatioDataProvider") + public void testSoftClippedLeadingTrailingRatioFilter(final String cigarString, + final double clipRatio, + final boolean expectedResult) { + + final SoftClippedReadFilter filter = new SoftClippedReadFilter(); + filter.minimumLeadingTrailingSoftClippedRatio = clipRatio; + + final GATKRead read = buildSAMRead(cigarString); + Assert.assertEquals(filter.test(read), expectedResult, cigarString); + + filter.doInvertFilter = true; + Assert.assertEquals(filter.test(read), !expectedResult, "Inverted case: " + cigarString); + } + + @DataProvider(name = "SoftClipRatioDataProvider") + public Iterator softClipRatioDataProvider() { + final List testData = new LinkedList<>(); + + // --------------------------------------- + // Null / trivial cases: + testData.add(new Object[] { "", 0.1, false }); + testData.add(new Object[] { "10H", 0.1, false }); + + // --------------------------------------- + // Soft clip ratio test: + + testData.add(new Object[] { "1S1M1S17M", 0.2, false }); // 2/20 = .100 + testData.add(new Object[] { "1S1M2S17M", 0.2, false }); // 3/21 = .143 + testData.add(new Object[] { "1S1M3S17M", 0.2, false }); // 4/22 = .182 + testData.add(new Object[] { "1S1M4S17M", 0.2, true }); // 5/23 = .217 + testData.add(new Object[] { "1S1M5S17M", 0.2, true }); // 6/24 = .250 + testData.add(new Object[] { "1S1M6S17M", 0.2, true }); // 7/25 = .280 + + // --------------------------------------- + // Soft clip placement: + + testData.add(new Object[] { "101S100M", 0.5, true }); + testData.add(new Object[] { "100M101S", 0.5, true }); + testData.add(new Object[] { "25H20S10M20S10M20S10M20S10M20S10M20S25H", 0.5, true }); + + return testData.iterator(); + } + + @DataProvider(name = "SoftClippedLeadingTrailingRatioDataProvider") + public Iterator softClippedLeadingTrailingRatioDataProvider() { + final List testData = new LinkedList<>(); + + // --------------------------------------- + // Null / trivial cases: + testData.add(new Object[] { "", 0.1, false }); + testData.add(new Object[] { "10H", 0.1, false }); + + // --------------------------------------- + // Soft clip ratio test: + + // Non-leading/-trailing + testData.add(new Object[] { "1S1M1S17M", 0.2, false }); // 2/20 = .100 + testData.add(new Object[] { "1S1M2S17M", 0.2, false }); // 3/21 = .143 + testData.add(new Object[] { "1S1M3S17M", 0.2, false }); // 4/22 = .182 + testData.add(new Object[] { "1S1M4S17M", 0.2, false }); // 5/23 = .217 + testData.add(new Object[] { "1S1M5S17M", 0.2, false }); // 6/24 = .250 + testData.add(new Object[] { "1S1M6S17M", 0.2, false }); // 7/25 = .280 + + // Leading: + testData.add(new Object[] { "2S1S1S16M", 0.2, false }); // 2/20 = .100 + testData.add(new Object[] { "3S1S1S16M", 0.2, false }); // 3/21 = .143 + testData.add(new Object[] { "4S1S1S16M", 0.2, false }); // 4/22 = .182 + testData.add(new Object[] { "5S1S1S16M", 0.2, true }); // 5/23 = .217 + testData.add(new Object[] { "6S1S1S16M", 0.2, true }); // 6/24 = .250 + testData.add(new Object[] { "7S1S1S16M", 0.2, true }); // 7/25 = .280 + + // Trailing: + testData.add(new Object[] { "1M1S16M2S", 0.2, false }); // 2/20 = .100 + testData.add(new Object[] { "1M1S16M3S", 0.2, false }); // 3/21 = .143 + testData.add(new Object[] { "1M1S16M4S", 0.2, false }); // 4/22 = .182 + testData.add(new Object[] { "1M1S16M5S", 0.2, true }); // 5/23 = .217 + testData.add(new Object[] { "1M1S16M6S", 0.2, true }); // 6/24 = .250 + testData.add(new Object[] { "1M1S16M7S", 0.2, true }); // 7/25 = .280 + + // --------------------------------------- + // Soft clip placement: + + testData.add(new Object[] { "101S100M", 0.5, true }); + testData.add(new Object[] { "100M101S", 0.5, true }); + testData.add(new Object[] { "25H20S10M20S10M20S10M20S10M20S10M20S25H", 0.5, false }); + + return testData.iterator(); + } + +}