diff --git a/.gitignore b/.gitignore index 73f7a4ae53..dd961eb0f9 100644 --- a/.gitignore +++ b/.gitignore @@ -16,7 +16,7 @@ jacoco.data .gradle build *.swp - +.vscode # OSX file system stuff .DS_STORE diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowArgumentCollection.java b/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowArgumentCollection.java index 2e51a46e6f..116faf0c00 100644 --- a/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowArgumentCollection.java +++ b/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowArgumentCollection.java @@ -4,12 +4,18 @@ public class MarkDuplicatesForFlowArgumentCollection { + public enum FLOW_DUPLICATE_SELECTION_STRATEGY { + FLOW_QUALITY_SUM_STRATEGY, + FLOW_END_QUALITY_STRATEGY + } @Argument(doc = "enable parameters and behavior specific to flow based reads.", optional = true) public boolean FLOW_MODE = false; - @Argument(doc = "Use specific quality summing strategy for flow based reads. The strategy ensures that the same " + - "(and correct) quality value is used for all bases of the same homopolymer.", optional = true) - public boolean FLOW_QUALITY_SUM_STRATEGY = false; + @Argument(doc = "Use specific quality summing strategy for flow based reads. Two strategies are available: " + + "FLOW_QUALITY_SUM_STRATEG: The selects the read with the best total homopolymer quality." + + " FLOW_END_QUALITY_STRATEGY: The strategy selects the read with the best homopolymer quality close to the end (10 bases) of the read. " + + " The latter strategy is recommended for samples with high duplication rate ", optional = true) + public FLOW_DUPLICATE_SELECTION_STRATEGY FLOW_DUP_STRATEGY = FLOW_DUPLICATE_SELECTION_STRATEGY.FLOW_QUALITY_SUM_STRATEGY; @Argument(doc = "Make the end location of single end read be significant when considering duplicates, " + "in addition to the start location, which is always significant (i.e. require single-ended reads to start and" + @@ -26,6 +32,11 @@ public class MarkDuplicatesForFlowArgumentCollection { "(for this argument, \"read end\" means 3' end)", optional = true) public int UNPAIRED_END_UNCERTAINTY = 0; + @Argument(doc = "Maximal difference of the read start position that counted as equal. Useful for flow based " + + "reads where the end position might vary due to sequencing errors. " + + "(for this argument, \"read start\" means 5' end in the direction of sequencing)", optional = true) + public int UNPAIRED_START_UNCERTAINTY = 0; + @Argument(doc = "Skip first N flows, starting from the read's start, when considering duplicates. Useful for flow based reads where sometimes there " + "is noise in the first flows " + "(for this argument, \"read start\" means 5' end).", optional = true) diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelper.java b/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelper.java index 2a1e408691..2d3611a7b4 100644 --- a/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelper.java +++ b/src/main/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelper.java @@ -30,6 +30,7 @@ import htsjdk.samtools.SAMRecord; import htsjdk.samtools.util.Log; import picard.sam.markduplicates.util.ReadEndsForMarkDuplicates; +import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodes; import java.util.ArrayList; import java.util.List; @@ -59,6 +60,7 @@ public class MarkDuplicatesForFlowHelper implements MarkDuplicatesHelper { public static final char[] CLIPPING_TAG_CONTAINS_AQ = {'A', 'Q'}; public static final char[] CLIPPING_TAG_CONTAINS_QZ = {'Q', 'Z'}; + public static final int DIST_FROM_END = 10; // instance of hosting MarkDuplicates private final MarkDuplicates md; @@ -105,19 +107,24 @@ public void generateDuplicateIndexes(final boolean useBarcodes, final boolean in ReadEndsForMarkDuplicates firstOfNextChunk = null; int nextChunkRead1Coordinate2Min = Integer.MAX_VALUE; int nextChunkRead1Coordinate2Max = Integer.MIN_VALUE; + int nextChunkRead1Coordinate1Min = Integer.MAX_VALUE; + int nextChunkRead1Coordinate1Max = Integer.MIN_VALUE; + final List nextChunk = new ArrayList<>(200); boolean containsPairs = false; boolean containsFrags = false; for (final ReadEndsForMarkDuplicates next : md.fragSort) { if (firstOfNextChunk != null && areComparableForDuplicatesWithEndSignificance(firstOfNextChunk, next, useBarcodes, - nextChunkRead1Coordinate2Min, nextChunkRead1Coordinate2Max)) { + nextChunkRead1Coordinate2Min, nextChunkRead1Coordinate2Max, nextChunkRead1Coordinate1Min, nextChunkRead1Coordinate1Max)) { nextChunk.add(next); containsPairs = containsPairs || next.isPaired(); containsFrags = containsFrags || !next.isPaired(); if ( next.read2Coordinate != END_INSIGNIFICANT_VALUE) { nextChunkRead1Coordinate2Min = Math.min(nextChunkRead1Coordinate2Min, next.read2Coordinate); nextChunkRead1Coordinate2Max = Math.max(nextChunkRead1Coordinate2Max, next.read2Coordinate); + nextChunkRead1Coordinate1Min = Math.min(nextChunkRead1Coordinate1Min, next.read1Coordinate); + nextChunkRead1Coordinate1Max = Math.max(nextChunkRead1Coordinate1Max, next.read1Coordinate); if ( firstOfNextChunk.read2Coordinate == END_INSIGNIFICANT_VALUE) firstOfNextChunk = next; @@ -129,6 +136,7 @@ public void generateDuplicateIndexes(final boolean useBarcodes, final boolean in nextChunk.clear(); nextChunk.add(next); firstOfNextChunk = next; + nextChunkRead1Coordinate1Min = nextChunkRead1Coordinate1Max = next.read1Coordinate; if ( next.read2Coordinate != END_INSIGNIFICANT_VALUE) nextChunkRead1Coordinate2Min = nextChunkRead1Coordinate2Max = next.read2Coordinate; else { @@ -172,9 +180,7 @@ public ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final } // adjust score - if ( md.flowBasedArguments.FLOW_QUALITY_SUM_STRATEGY ) { - ends.score = computeFlowDuplicateScore(rec, ends.read2Coordinate); - } + ends.score = computeFlowDuplicateScore(rec, ends.read2Coordinate); return ends; } @@ -184,22 +190,53 @@ public ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final */ @Override public short getReadDuplicateScore(final SAMRecord rec, final ReadEndsForMarkDuplicates pairedEnds) { - if (md.flowBasedArguments.FLOW_QUALITY_SUM_STRATEGY ) { + if (md.flowBasedArguments.FLOW_MODE){ return computeFlowDuplicateScore(rec, pairedEnds.read2Coordinate); } else { return md.getReadDuplicateScore(rec, pairedEnds); } } + //This method is identical to MarkDuplicates.areComparableForDuplicates but allows working with readStartUncertainty + protected boolean areComparableForDuplicates(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, + final boolean compareRead2, final boolean useBarcodes) { + boolean areComparable = lhs.libraryId == rhs.libraryId; + + if (useBarcodes && areComparable) { // areComparable is useful here to avoid the casts below + final ReadEndsForMarkDuplicatesWithBarcodes lhsWithBarcodes = (ReadEndsForMarkDuplicatesWithBarcodes) lhs; + final ReadEndsForMarkDuplicatesWithBarcodes rhsWithBarcodes = (ReadEndsForMarkDuplicatesWithBarcodes) rhs; + areComparable = lhsWithBarcodes.barcode == rhsWithBarcodes.barcode && + lhsWithBarcodes.readOneBarcode == rhsWithBarcodes.readOneBarcode && + lhsWithBarcodes.readTwoBarcode == rhsWithBarcodes.readTwoBarcode; + } + + if (areComparable) { + areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex && + lhs.orientation == rhs.orientation; + } + + if (areComparable && compareRead2) { + areComparable = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex && + lhs.read2Coordinate == rhs.read2Coordinate; + } + + return areComparable; + } + /** * This method is identical in function to areComparableForDuplicates except that it accomodates for * the possible significance of the end side of the reads (w/ or wo/ uncertainty). This is only * applicable for flow mode invocation. */ private boolean areComparableForDuplicatesWithEndSignificance(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, final boolean useBarcodes, - final int lhsRead1Coordinate2Min, final int lhsRead1Coordinate2Max) { - boolean areComparable = md.areComparableForDuplicates(lhs, rhs, false, useBarcodes); + final int lhsRead1Coordinate2Min, final int lhsRead1Coordinate2Max, + final int lhsRead1Coordinate1Min, final int lhsRead1Coordinate1Max) { + boolean areComparable = areComparableForDuplicates(lhs, rhs, false, useBarcodes); + if (areComparable) { + areComparable = endCoorInRangeWithUncertainty(lhsRead1Coordinate1Min, lhsRead1Coordinate1Max, + rhs.read1Coordinate, md.flowBasedArguments.UNPAIRED_START_UNCERTAINTY); + } if (areComparable) { areComparable = (!endCoorSignificant(lhs.read2Coordinate, rhs.read2Coordinate) || endCoorInRangeWithUncertainty(lhsRead1Coordinate2Min, lhsRead1Coordinate2Max, @@ -236,8 +273,8 @@ static protected int getFlowSumOfBaseQualities(final SAMRecord rec, final int th final byte[] bases = rec.getReadBases(); // create iteration range and direction - final int startingOffset = !rec.getReadNegativeStrandFlag() ? 0 : bases.length; - final int endOffset = !rec.getReadNegativeStrandFlag() ? bases.length : 0; + final int startingOffset = !rec.getReadNegativeStrandFlag() ? 0 : bases.length - 1; + final int endOffset = !rec.getReadNegativeStrandFlag() ? bases.length : -1; final int iterIncr = !rec.getReadNegativeStrandFlag() ? 1 : -1; // loop on bases, extract qual related to homopolymer from start of homopolymer @@ -257,6 +294,55 @@ static protected int getFlowSumOfBaseQualities(final SAMRecord rec, final int th return score; } + /** + * A quality selection strategy used for flow based reads. + * + * We look at the bases of the reads that are close to the ends of the fragment + * and calculate the minimal quality of the homopolymers. + * + * @param rec - SAMRecord to get a score for + * @param dist - Distance from the end + * @return - calculated score (see method description) + */ + static protected int getFlowSumOfBaseQualitiesNearEnds(final SAMRecord rec, int dist) { + int score = 100; + + // access qualities and bases + final byte[] quals = rec.getBaseQualities(); + final byte[] bases = rec.getReadBases(); + + boolean insideHpol = false; + if (dist > bases.length){ + dist = bases.length; + } + + for ( int i = 0 ; (i < dist) || ( insideHpol ) ; i ++ ) { + final byte base = bases[i]; + if ( (i == bases.length - 1) || ( base != bases[i+1] )) { + insideHpol = false; + } else { + insideHpol = true; + } + + if ( quals[i] < score) { + score = quals[i]; + } + } + + for ( int i = bases.length-1 ; (i > bases.length - 1 - dist) || ( insideHpol ) ; i -- ) { + final byte base = bases[i]; + if ( (i == 0) || ( base != bases[i - 1] )) { + insideHpol = false; + } else { + insideHpol = true; + } + + if ( quals[i] < score) { + score = quals[i]; + } + } + return score; + } private short computeFlowDuplicateScore(final SAMRecord rec, final int end) { if ( end == END_INSIGNIFICANT_VALUE) @@ -265,8 +351,13 @@ private short computeFlowDuplicateScore(final SAMRecord rec, final int end) { Short storedScore = (Short)rec.getTransientAttribute(ATTR_DUPLICATE_SCORE); if ( storedScore == null ) { short score = 0; - - score += (short) Math.min(getFlowSumOfBaseQualities(rec, md.flowBasedArguments.FLOW_EFFECTIVE_QUALITY_THRESHOLD), Short.MAX_VALUE / 2); + if (md.flowBasedArguments.FLOW_DUP_STRATEGY == MarkDuplicatesForFlowArgumentCollection.FLOW_DUPLICATE_SELECTION_STRATEGY.FLOW_QUALITY_SUM_STRATEGY) { + score += (short) Math.min(getFlowSumOfBaseQualities(rec, md.flowBasedArguments.FLOW_EFFECTIVE_QUALITY_THRESHOLD), Short.MAX_VALUE / 2); + } else if (md.flowBasedArguments.FLOW_DUP_STRATEGY == MarkDuplicatesForFlowArgumentCollection.FLOW_DUPLICATE_SELECTION_STRATEGY.FLOW_END_QUALITY_STRATEGY) { + score += (short) Math.min(getFlowSumOfBaseQualitiesNearEnds(rec, DIST_FROM_END), Short.MAX_VALUE / 2); + } else { + throw new IllegalArgumentException("Unknown flow duplicate selection strategy: " + md.flowBasedArguments.FLOW_DUP_STRATEGY); + } score += rec.getReadFailsVendorQualityCheckFlag() ? (short) (Short.MIN_VALUE / 2) : 0; storedScore = score; diff --git a/src/test/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelperTest.java b/src/test/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelperTest.java index cfddeacf0a..b8d033308c 100644 --- a/src/test/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelperTest.java +++ b/src/test/java/picard/sam/markduplicates/MarkDuplicatesForFlowHelperTest.java @@ -166,14 +166,16 @@ public Object[][] forFlowDataProvider() { new TestRecordInfo(76, 12,"76M", true, "AAAC", null), new TestRecordInfo(76, 12, "76M", false, "AACC", null) }, - new String[] { "FLOW_QUALITY_SUM_STRATEGY=false" }, + new String[] { "FLOW_DUP_STRATEGY=FLOW_QUALITY_SUM_STRATEGY" }, new TesterModifier() { @Override public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) { final SAMRecord[] records = tester.getSamRecordSetBuilder().getRecords().toArray(new SAMRecord[0]); records[0].setAttribute("tp", new int[76]); records[1].setAttribute("tp", new int[76]); - records[0].getBaseQualities()[1] = 25; // dip inside AAA + records[0].getBaseQualities()[0] = 25; // dip inside AAA + records[0].getBaseQualities()[2] = 25; // dip inside AAA + } } }, @@ -185,7 +187,25 @@ public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) new TestRecordInfo(76, 12,"76M", false, "AAAC", null), new TestRecordInfo(76, 12, "76M", true, "AACC", null) }, - new String[] { "FLOW_QUALITY_SUM_STRATEGY=true" }, + new String[] { "FLOW_DUP_STRATEGY=FLOW_QUALITY_SUM_STRATEGY" }, + new TesterModifier() { + @Override + public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) { + final SAMRecord[] records = tester.getSamRecordSetBuilder().getRecords().toArray(new SAMRecord[0]); + records[0].setAttribute("tp", new int[76]); + records[1].setAttribute("tp", new int[76]); + records[0].getBaseQualities()[1] = 25; // dip inside AAA + } + } + }, + // testFLOW_END_QUALITY_STRATEGY: flow (homopolymer based) minumum + { + DuplicateScoringStrategy.ScoringStrategy.SUM_OF_BASE_QUALITIES, + new TestRecordInfo[] { + new TestRecordInfo(76, 12,"76M", true, "AAAC", null), + new TestRecordInfo(76, 12, "76M", false, "AACC", null) + }, + new String[] { "FLOW_DUP_STRATEGY=FLOW_END_QUALITY_STRATEGY" }, new TesterModifier() { @Override public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) { @@ -193,6 +213,7 @@ public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) records[0].setAttribute("tp", new int[76]); records[1].setAttribute("tp", new int[76]); records[0].getBaseQualities()[1] = 25; // dip inside AAA + records[1].getBaseQualities()[30] = 10; } } }, @@ -208,6 +229,17 @@ public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) new String[] { "USE_END_IN_UNPAIRED_READS=true", "UNPAIRED_END_UNCERTAINTY=10" }, null }, + // testUNPAIRED_START_UNCERTAINTY: End location is significant and uncertain, end sorted + { + null, + new TestRecordInfo[] { + new TestRecordInfo(74, 12, null, false, null, null), + new TestRecordInfo(64, 22, null, true, null, null), + new TestRecordInfo(54, 32, null, true, null, null) + }, + new String[] { "USE_END_IN_UNPAIRED_READS=true", "UNPAIRED_START_UNCERTAINTY=10" }, null + }, + // testUNPAIRED_END_UNCERTAINTY: End location is significant and uncertain, end not sorted { null, @@ -267,7 +299,7 @@ public void modify(final AbstractMarkDuplicatesCommandLineProgramTester tester) } @Test(dataProvider = "forFlowDataProvider") - public void testForFlow(final DuplicateScoringStrategy.ScoringStrategy scoringStrategy, final TestRecordInfo[] recInfos, final String[] params, TesterModifier modifier) { + public void testForFlowMDCall(final DuplicateScoringStrategy.ScoringStrategy scoringStrategy, final TestRecordInfo[] recInfos, final String[] params, TesterModifier modifier) { // get tester, build records final AbstractMarkDuplicatesCommandLineProgramTester tester = @@ -339,4 +371,33 @@ public void testGetFlowSumOfBaseQualities(final String bases, final byte[] quals Assert.assertEquals(score, expectedScore); } + @DataProvider(name ="getFlowEndBaseQualitiesDataProvider") + public Object[][] getFlowEndBaseQualitiesDataProvider() { + return new Object[][] { + { "AAAA", new byte[] {50,50,50,50}, 2, 50 }, + { "AAAA", new byte[] {50,10,10,50}, 4, 10 }, + { "ACCA", new byte[] {20,10,10,20}, 1, 20 }, + { "AABBCC", new byte[] {50,50,10,10,40,40}, 30, 10 }, + }; + } + + @Test(dataProvider = "getFlowEndBaseQualitiesDataProvider") + public void testGetFlowEndBaseQualities(final String bases, final byte[] quals, final int threshold, final int expectedScore) { + + // build record + final AbstractMarkDuplicatesCommandLineProgramTester tester = getTester(); + tester.getSamRecordSetBuilder().setReadLength(bases.length()); + tester.addMappedFragment(0, 12, false, 50); + + // install bases and quals + final SAMRecord rec = tester.getSamRecordSetBuilder().getRecords().iterator().next(); + System.arraycopy(bases.getBytes(), 0, rec.getReadBases(), 0,bases.length()); + System.arraycopy(quals, 0, rec.getBaseQualities(), 0, quals.length); + + // calculate score + final int score = MarkDuplicatesForFlowHelper.getFlowSumOfBaseQualitiesNearEnds(rec, threshold); + Assert.assertEquals(score, expectedScore); + } + + }