-
Notifications
You must be signed in to change notification settings - Fork 596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Updated MarkDuplicates to use Picard metrics code #4779
Changes from 7 commits
7a91cc3
83aae26
b9eaade
0be29f0
3514193
e21f9bd
66e477e
6c23b51
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
package org.broadinstitute.hellbender.tools.spark.transforms.markduplicates; | ||
|
||
import com.google.common.collect.Iterators; | ||
import htsjdk.samtools.SAMFileHeader; | ||
import htsjdk.samtools.metrics.MetricsFile; | ||
import org.apache.spark.Partitioner; | ||
|
@@ -23,7 +24,7 @@ | |
import org.broadinstitute.hellbender.utils.read.GATKRead; | ||
import org.broadinstitute.hellbender.utils.read.ReadUtils; | ||
import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter; | ||
import org.broadinstitute.hellbender.utils.read.markduplicates.DuplicationMetrics; | ||
import org.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics; | ||
import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy; | ||
import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder; | ||
import picard.sam.markduplicates.util.OpticalDuplicateFinder; | ||
|
@@ -91,6 +92,7 @@ public static JavaRDD<GATKRead> mark(final JavaRDD<GATKRead> reads, final SAMFil | |
final MarkDuplicatesScoringStrategy scoringStrategy, | ||
final OpticalDuplicateFinder opticalDuplicateFinder, | ||
final int numReducers, final boolean dontMarkUnmappedMates) { | ||
final boolean markUnmappedMates = !dontMarkUnmappedMates; | ||
SAMFileHeader headerForTool = header.clone(); | ||
|
||
// If the input isn't queryname sorted, sort it before duplicate marking | ||
|
@@ -107,11 +109,13 @@ public static JavaRDD<GATKRead> mark(final JavaRDD<GATKRead> reads, final SAMFil | |
// Here we combine the original bam with the repartitioned unmarked readnames to produce our marked reads | ||
return sortedReadsForMarking.zipPartitions(repartitionedReadNames, (readsIter, readNamesIter) -> { | ||
final Map<String,Integer> namesOfNonDuplicateReadsAndOpticalCounts = Utils.stream(readNamesIter).collect(Collectors.toMap(Tuple2::_1,Tuple2::_2, (t1,t2) -> {throw new GATKException("Detected multiple mark duplicate records objects corresponding to read with name, this could be the result of readnames spanning more than one partition");})); | ||
return Utils.stream(readsIter).peek(read -> { | ||
return Utils.stream(readsIter) | ||
.peek(read -> read.setIsDuplicate(false)) | ||
.peek(read -> { | ||
// Handle reads that have been marked as non-duplicates (which also get tagged with optical duplicate summary statistics) | ||
if( namesOfNonDuplicateReadsAndOpticalCounts.containsKey(read.getName())) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. spacing around if appears incorrect |
||
read.setIsDuplicate(false); | ||
if (!(dontMarkUnmappedMates && read.isUnmapped())) { | ||
if ( markUnmappedMates || !read.isUnmapped()) { | ||
int dupCount = namesOfNonDuplicateReadsAndOpticalCounts.replace(read.getName(), -1); | ||
if (dupCount > -1) { | ||
((SAMRecordToGATKReadAdapter) read).setTransientAttribute(MarkDuplicatesSparkUtils.OPTICAL_DUPLICATE_TOTAL_ATTRIBUTE_NAME, dupCount); | ||
|
@@ -122,8 +126,10 @@ public static JavaRDD<GATKRead> mark(final JavaRDD<GATKRead> reads, final SAMFil | |
read.setIsDuplicate(false); | ||
// Everything else is a duplicate | ||
} else{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. space after else |
||
if (!(dontMarkUnmappedMates && read.isUnmapped())) { | ||
if ( markUnmappedMates || !read.isUnmapped()) { | ||
read.setIsDuplicate(true); | ||
} else { | ||
read.setIsDuplicate(false); | ||
} | ||
} | ||
}).iterator(); | ||
|
@@ -182,9 +188,9 @@ protected void runTool(final JavaSparkContext ctx) { | |
final JavaRDD<GATKRead> finalReadsForMetrics = mark(reads, header, markDuplicatesSparkArgumentCollection.duplicatesScoringStrategy, finder, getRecommendedNumReducers(), markDuplicatesSparkArgumentCollection.dontMarkUnmappedMates); | ||
|
||
if (metricsFile != null) { | ||
final JavaPairRDD<String, DuplicationMetrics> metricsByLibrary = MarkDuplicatesSparkUtils.generateMetrics( | ||
final JavaPairRDD<String, GATKDuplicationMetrics> metricsByLibrary = MarkDuplicatesSparkUtils.generateMetrics( | ||
header, finalReadsForMetrics); | ||
final MetricsFile<DuplicationMetrics, Double> resultMetrics = getMetricsFile(); | ||
final MetricsFile<GATKDuplicationMetrics, Double> resultMetrics = getMetricsFile(); | ||
MarkDuplicatesSparkUtils.saveMetricsRDD(resultMetrics, header, metricsByLibrary, metricsFile); | ||
} | ||
header.setSortOrder(SAMFileHeader.SortOrder.coordinate); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -323,6 +323,14 @@ public static boolean readHasMappedMate( final GATKRead read ) { | |
return read.isPaired() && ! read.mateIsUnmapped(); | ||
} | ||
|
||
/** | ||
* @param read read to check | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This sounds confusing, could you rephrase? |
||
* @return true if the read is paired and has a mapped mate, otherwise false | ||
*/ | ||
public static boolean readHasMappedMate( final SAMRecord read ) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do spaces go around the argument? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a good question, just looking in this file alone it seems to be wholly inconsistent one way or another. I'm not going to bother changing it because it would imply that there was some principled reason for me to unify ALL the formatting in this file at least which I don't necessarily want to do in this PR. |
||
return read.getReadPairedFlag() && ! read.getMateUnmappedFlag(); | ||
} | ||
|
||
/** | ||
* Check whether the given String represents a legal attribute name according to the SAM spec, | ||
* and throw an exception if it doesn't. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This import appears to not be used.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I realize it isn't your code, but can you remove the other two rogue imports....
line 21
import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource;
and
import picard.sam.markduplicates.util.OpticalDuplicateFinder;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done