responded to comments without deleting the class
jamesemery committed Nov 6, 2018
1 parent ce1b586 commit 298349b
Showing 11 changed files with 66 additions and 144 deletions.
@@ -11,7 +11,6 @@
 import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSparkUtils;
 import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter;
 import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey;
-import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import org.broadinstitute.hellbender.utils.read.markduplicates.sparkrecords.*;
 
 import java.util.Collections;
@@ -93,7 +92,5 @@ private void registerGATKClasses(Kryo kryo) {
         kryo.register(MarkDuplicatesSparkUtils.IndexPair.class, new FieldSerializer(kryo, MarkDuplicatesSparkUtils.IndexPair.class));
         kryo.register(ReadsKey.class, new FieldSerializer(kryo, ReadsKey.class));
         kryo.register(ReadsKey.KeyForFragment.class, new FieldSerializer(kryo, ReadsKey.KeyForFragment.class));
-        kryo.register(ReadsKey.KeyForPair.class, new FieldSerializer(kryo, ReadsKey.KeyForPair.class));
-        kryo.register(SerializableOpticalDuplicatesFinder.class, new FieldSerializer(kryo, SerializableOpticalDuplicatesFinder.class));
-    }
+        kryo.register(ReadsKey.KeyForPair.class, new FieldSerializer(kryo, ReadsKey.KeyForPair.class)); }
 }
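The hunk above drops both the Kryo registration for the deleted SerializableOpticalDuplicatesFinder and its import. For readers unfamiliar with the pattern, here is a minimal, self-contained sketch of the FieldSerializer registration used in registerGATKClasses; ExampleRegistrator and Key are hypothetical stand-ins, not GATK classes:

```java
import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.serializers.FieldSerializer;

// Hypothetical sketch of the registration pattern in the diff above.
// FieldSerializer serializes an object by reflecting over its fields;
// GATK registers one per class that Spark ships between executors.
public class ExampleRegistrator {
    static class Key {          // stand-in for e.g. ReadsKey.KeyForPair
        long keyValue;
        String library;
    }

    @SuppressWarnings({"unchecked", "rawtypes"})
    public void registerExample(final Kryo kryo) {
        kryo.register(Key.class, new FieldSerializer(kryo, Key.class));
    }

    public static void main(String[] args) {
        new ExampleRegistrator().registerExample(new Kryo());
        System.out.println("registered");
    }
}
```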
@@ -10,7 +10,6 @@
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
 import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
-import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
 import org.broadinstitute.hellbender.engine.filters.ReadFilter;
 import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
@@ -21,7 +20,6 @@
 import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSpark;
 import org.broadinstitute.hellbender.utils.read.GATKRead;
 import org.broadinstitute.hellbender.utils.read.ReadsWriteFormat;
-import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
 import picard.sam.markduplicates.util.OpticalDuplicateFinder;
 
 import java.io.IOException;
@@ -68,7 +66,7 @@ protected void runTool(final JavaSparkContext ctx) {
         try (final BwaSparkEngine bwaEngine = new BwaSparkEngine(ctx, referenceArguments.getReferenceFileName(), bwaArgs.indexImageFile, getHeaderForReads(), getReferenceSequenceDictionary())) {
             final ReadFilter filter = makeReadFilter(bwaEngine.getHeader());
             final JavaRDD<GATKRead> alignedReads = bwaEngine.alignPaired(getUnfilteredReads()).filter(filter::test);
-            final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, bwaEngine.getHeader(), new SerializableOpticalDuplicatesFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());
+            final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, bwaEngine.getHeader(), new OpticalDuplicateFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());
             try {
                 ReadsSparkSink.writeReads(ctx, output,
                         referenceArguments.getReferencePath().toAbsolutePath().toUri().toString(),
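This hunk swaps the GATK SerializableOpticalDuplicatesFinder wrapper for picard's OpticalDuplicateFinder, constructed with its no-arg constructor exactly as the + line shows. A brief self-contained sketch of that construction (the printout is illustrative only):

```java
import picard.sam.markduplicates.util.OpticalDuplicateFinder;

public class DefaultFinderExample {
    public static void main(String[] args) {
        // No-arg constructor: picard's default read-name regex and default
        // optical pixel distance. This instance is what the pipeline now
        // passes straight into MarkDuplicatesSpark.mark(...) above; it is
        // serialized into the Spark closure, so it must be serializable.
        final OpticalDuplicateFinder finder = new OpticalDuplicateFinder();
        System.out.println("constructed: " + finder);
    }
}
```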
@@ -31,11 +31,11 @@
 import org.broadinstitute.hellbender.utils.IntervalUtils;
 import org.broadinstitute.hellbender.utils.SimpleInterval;
 import org.broadinstitute.hellbender.utils.read.GATKRead;
-import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import org.broadinstitute.hellbender.utils.recalibration.RecalibrationArgumentCollection;
 import org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport;
 import org.broadinstitute.hellbender.utils.spark.SparkUtils;
 import org.broadinstitute.hellbender.utils.variant.GATKVariant;
+import picard.sam.markduplicates.util.OpticalDuplicateFinder;
 
 import java.util.Collection;
 import java.util.List;
@@ -174,7 +174,7 @@ protected void runTool(final JavaSparkContext ctx) {
             header = getHeaderForReads();
         }
 
-        final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, header, new SerializableOpticalDuplicatesFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());
+        final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, header, new OpticalDuplicateFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());
 
         // always coordinate-sort reads so BQSR can use queryLookaheadBases in FeatureDataSource
         final SAMFileHeader readsHeader = header.clone();
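The unchanged context above ends just as the pipeline clones the header before coordinate-sorting for BQSR. A small htsjdk sketch of that clone-then-set-sort-order pattern (variable names hypothetical):

```java
import htsjdk.samtools.SAMFileHeader;

public class HeaderSortExample {
    public static void main(String[] args) {
        final SAMFileHeader header = new SAMFileHeader();
        header.setSortOrder(SAMFileHeader.SortOrder.queryname);

        // Clone before mutating so callers holding the original header are
        // unaffected; downstream BQSR requires coordinate-sorted input.
        final SAMFileHeader readsHeader = header.clone();
        readsHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);

        System.out.println(header.getSortOrder() + " -> " + readsHeader.getSortOrder());
    }
}
```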
@@ -1,6 +1,5 @@
 package org.broadinstitute.hellbender.tools.spark.transforms.markduplicates;
 
-import htsjdk.samtools.Defaults;
 import htsjdk.samtools.SAMFileHeader;
 import htsjdk.samtools.metrics.MetricsFile;
 import org.apache.spark.Partitioner;
@@ -25,15 +24,13 @@
 import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter;
 import org.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics;
 import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
-import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import org.broadinstitute.hellbender.utils.spark.SparkUtils;
 import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
 import picard.sam.markduplicates.MarkDuplicates;
 import picard.sam.markduplicates.util.OpticalDuplicateFinder;
 import scala.Tuple2;
 
 import java.util.*;
-import java.util.stream.Collectors;
 
 @DocumentedFeature
 @CommandLineProgramProperties(
@@ -66,7 +63,7 @@ public final class MarkDuplicatesSpark extends GATKSparkTool {
             mutex = {MarkDuplicatesSparkArgumentCollection.DUPLICATE_TAGGING_POLICY_LONG_NAME, MarkDuplicatesSparkArgumentCollection.REMOVE_SEQUENCING_DUPLICATE_READS}, optional = true)
     public boolean removeAllDuplicates = false;
 
-    @Argument(fullName = MarkDuplicatesSparkArgumentCollection.REMOVE_SEQUENCING_DUPLICATE_READS, doc = "If true do not write duplicates to the output file instead of writing them with appropriate flags set.",
+    @Argument(fullName = MarkDuplicatesSparkArgumentCollection.REMOVE_SEQUENCING_DUPLICATE_READS, doc = "If true do not write optical/sequencing duplicates to the output file instead of writing them with appropriate flags set.",
             mutex = {MarkDuplicatesSparkArgumentCollection.DUPLICATE_TAGGING_POLICY_LONG_NAME, MarkDuplicatesSparkArgumentCollection.REMOVE_ALL_DUPLICATE_READS}, optional = true)
     public boolean removeSequencingDuplicates = false;
 
@@ -76,9 +73,9 @@ public List<ReadFilter> getDefaultReadFilters() {
     }
 
     // Reads with this marker will be treated as non-duplicates always
-    public static int MARKDUPLICATES_NO_OPTICAL_MARKER = -1;
+    public static int NO_OPTICAL_MARKER = -1;
     // Reads with this marker will be treated and marked as optical duplicates
-    public static int MARKDUPLICATES_OPPTICAL_DUPLICATE_MARKER = -2;
+    public static int OPTICAL_DUPLICATE_MARKER = -2;
 
     /**
      * Main method for marking duplicates, takes an JavaRDD of GATKRead and an associated SAMFileHeader with corresponding
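The rename above also fixes the misspelling in MARKDUPLICATES_OPPTICAL_DUPLICATE_MARKER. Semantics are unchanged: the per-read-name map holds either a non-negative optical-duplicate count or one of these two negative sentinels. A hypothetical sketch of how the three states decode:

```java
import java.util.HashMap;
import java.util.Map;

public class MarkerExample {
    // Values mirror the constants in the diff above.
    static final int NO_OPTICAL_MARKER = -1;        // non-duplicate, no count to record
    static final int OPTICAL_DUPLICATE_MARKER = -2; // mark as optical/sequencing duplicate

    public static void main(String[] args) {
        final Map<String, Integer> byName = new HashMap<>();
        byName.put("readA", 3);                        // representative read, 3 optical dups in group
        byName.put("readB", NO_OPTICAL_MARKER);        // plain non-duplicate
        byName.put("readC", OPTICAL_DUPLICATE_MARKER); // optical duplicate, gets the DT tag

        byName.forEach((name, v) -> {
            if (v == OPTICAL_DUPLICATE_MARKER) {
                System.out.println(name + ": duplicate (optical)");
            } else if (v == NO_OPTICAL_MARKER) {
                System.out.println(name + ": non-duplicate");
            } else {
                System.out.println(name + ": non-duplicate, optical count " + v);
            }
        });
    }
}
```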
@@ -135,16 +132,16 @@ public static JavaRDD<GATKRead> mark(final JavaRDD<GATKRead> reads, final SAMFil
                 .peek(read -> {
                     // Handle reads that have been marked as non-duplicates (which also get tagged with optical duplicate summary statistics)
                     if (namesOfNonDuplicateReadsAndOpticalCounts.containsKey(read.getName())) {
-                        // If its an optical duplicate, mark it.
-                        if (namesOfNonDuplicateReadsAndOpticalCounts.get(read.getName()) == MARKDUPLICATES_OPPTICAL_DUPLICATE_MARKER) {
+                        // If its an optical duplicate, mark it. (Note: we only expect these to exist if optical duplicate marking is on)
+                        if (namesOfNonDuplicateReadsAndOpticalCounts.get(read.getName()) == OPTICAL_DUPLICATE_MARKER) {
                             read.setIsDuplicate(true);
                             read.setAttribute(MarkDuplicates.DUPLICATE_TYPE_TAG, MarkDuplicates.DUPLICATE_TYPE_SEQUENCING);
 
                         // Otherwise treat it normally as a non-duplicate.
                         } else {
                             read.setIsDuplicate(false);
                             if (markUnmappedMates || !read.isUnmapped()) {
-                                int dupCount = namesOfNonDuplicateReadsAndOpticalCounts.replace(read.getName(), MARKDUPLICATES_NO_OPTICAL_MARKER);
+                                int dupCount = namesOfNonDuplicateReadsAndOpticalCounts.replace(read.getName(), NO_OPTICAL_MARKER);
                                 if (dupCount > -1) {
                                     ((SAMRecordToGATKReadAdapter) read).setTransientAttribute(MarkDuplicatesSparkUtils.OPTICAL_DUPLICATE_TOTAL_ATTRIBUTE_NAME, dupCount);
                                 }
@@ -228,7 +225,7 @@ public int getPartition(Object key) {
     protected void runTool(final JavaSparkContext ctx) {
         JavaRDD<GATKRead> reads = getReads();
         final OpticalDuplicateFinder finder = opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null ?
-                new SerializableOpticalDuplicatesFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE) : null;
+                new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null) : null;
         // If we need to remove optical duplicates, set the engine to mark optical duplicates using the DT tag.
         if (removeSequencingDuplicates && markDuplicatesSparkArgumentCollection.taggingPolicy == MarkDuplicates.DuplicateTaggingPolicy.DontTag) {
             markDuplicatesSparkArgumentCollection.taggingPolicy = MarkDuplicates.DuplicateTaggingPolicy.OpticalOnly;
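Here the configured finder is likewise built from picard directly; the added third constructor argument is an optional picard Log, which the tool passes as null. A self-contained sketch with hypothetical stand-ins for the argument-collection values (the regex literal below is illustrative, not picard's actual default):

```java
import picard.sam.markduplicates.util.OpticalDuplicateFinder;

public class ConfiguredFinderExample {
    public static void main(String[] args) {
        // Hypothetical values standing in for the tool's argument collection.
        // A null regex disables optical-duplicate detection, which is why the
        // tool constructs the finder conditionally.
        final String readNameRegex = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
        final int pixelDistance = 2500;

        // Three-argument constructor as used on the + line above; the last
        // argument is an optional picard Log, passed as null by the tool.
        final OpticalDuplicateFinder finder = readNameRegex != null
                ? new OpticalDuplicateFinder(readNameRegex, pixelDistance, null)
                : null;
        System.out.println(finder != null ? "optical detection enabled" : "disabled");
    }
}
```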
@@ -387,7 +387,7 @@ private static Map<MarkDuplicatesSparkRecord.Type, List<MarkDuplicatesSparkRecor
     private static List<Tuple2<IndexPair<String>,Integer>> handlePassthroughs(List<MarkDuplicatesSparkRecord> passthroughs) {
         // Emit the passthrough reads as non-duplicates.
         return passthroughs.stream()
-                .map(pair -> new Tuple2<>(new IndexPair<>(pair.getName(), pair.getPartitionIndex()), MarkDuplicatesSpark.MARKDUPLICATES_NO_OPTICAL_MARKER))
+                .map(pair -> new Tuple2<>(new IndexPair<>(pair.getName(), pair.getPartitionIndex()), MarkDuplicatesSpark.NO_OPTICAL_MARKER))
                 .collect(Collectors.toList());
     }

@@ -426,7 +426,7 @@ private static int countOpticalDuplicates(OpticalDuplicateFinder finder, List<Pa
             if (opticalDuplicateFlags[i]) {
                 numOpticalDuplicates++;
                 if (opticalDuplicateList != null) {
-                    opticalDuplicateList.add(new Tuple2<>(new IndexPair<>(scored.get(i).getName(), scored.get(i).getPartitionIndex()), MarkDuplicatesSpark.MARKDUPLICATES_OPPTICAL_DUPLICATE_MARKER));
+                    opticalDuplicateList.add(new Tuple2<>(new IndexPair<>(scored.get(i).getName(), scored.get(i).getPartitionIndex()), MarkDuplicatesSpark.OPTICAL_DUPLICATE_MARKER));
                 }
             }
         }
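countOpticalDuplicates (only partially shown) tallies the boolean flags that the finder reports per duplicate-group member. A minimal sketch of that tally with a hard-coded stand-in for the finder's output:

```java
public class TallyExample {
    public static void main(String[] args) {
        // Stand-in for the boolean[] that picard's optical-duplicate search
        // returns: one flag per group member, true where that read is an
        // optical duplicate of the group's representative.
        final boolean[] opticalDuplicateFlags = {false, true, true, false};

        int numOpticalDuplicates = 0;
        for (final boolean flag : opticalDuplicateFlags) {
            if (flag) {
                numOpticalDuplicates++;  // same tally the hunk above performs
            }
        }
        System.out.println(numOpticalDuplicates + " optical duplicates"); // prints 2
    }
}
```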
@@ -17,27 +17,27 @@
 
 @DocumentedFeature
 @CommandLineProgramProperties(
-        summary = "Compares the base qualities of two SAM/BAM/CRAM files. The reads in the two files must have " +
-                "exactly the same names and appear in the same order.",
+        summary = "Compares the base qualities, cigars, alignment information, and samflags of reads between two SAM/BAM/CRAM files." +
+                " The reads in the two files must have exactly the same names and appear in the same order.",
         oneLineSummary = "Compares the base qualities of two SAM/BAM/CRAM files",
         programGroup = DiagnosticsAndQCProgramGroup.class
 )
 public class CompareReads extends GATKTool {
-    @Argument(doc = "If output is given, the tool will return a bam with all the mismatching duplicate groups in the first specified file",
-            shortName = "I1", fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, optional = true)
+    @Argument(doc = "The first sam file against which to compare equality",
+            shortName = "I1", fullName = "input1", optional = false)
     protected String input1;
 
-    @Argument(doc = "If output is given, the tool will return a bam with all the mismatching duplicate groups in the second specified input file",
-            shortName = "I2", fullName = "output2", optional = true)
+    @Argument(doc = "The second sam file against which to compare equality",
+            shortName = "I2", fullName = "input2", optional = false)
     protected String input2;
 
 
     @Override
     public void traverse() {
         List<String> errorMessages = new ArrayList<>();
 
-        try( ReadsDataSource reads1 = new ReadsDataSource(IOUtils.getPath(input1));
-             ReadsDataSource reads2 = new ReadsDataSource(IOUtils.getPath(input2));) {
+        try(ReadsDataSource reads1 = new ReadsDataSource(IOUtils.getPath(input1));
+            ReadsDataSource reads2 = new ReadsDataSource(IOUtils.getPath(input2));) {
             final Iterator<GATKRead> it1 = reads1.iterator();
             final Iterator<GATKRead> it2 = reads2.iterator();
             while (it1.hasNext() && it2.hasNext()) {
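CompareReads walks the two inputs' iterators in lockstep inside a single try-with-resources block. A self-contained sketch of that lockstep-comparison pattern over plain strings; the error messages are hypothetical, not the tool's actual wording:

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class LockstepCompareExample {
    public static void main(String[] args) {
        final Iterator<String> it1 = Arrays.asList("read1", "read2").iterator();
        final Iterator<String> it2 = Arrays.asList("read1", "read3").iterator();
        final List<String> errorMessages = new ArrayList<>();

        // Lockstep walk: both inputs must present the same records in the
        // same order, mirroring the tool's requirement on read names.
        while (it1.hasNext() && it2.hasNext()) {
            final String r1 = it1.next();
            final String r2 = it2.next();
            if (!r1.equals(r2)) {
                errorMessages.add("Mismatch: " + r1 + " vs " + r2);
            }
        }
        if (it1.hasNext() || it2.hasNext()) {
            errorMessages.add("Inputs have different numbers of records");
        }
        errorMessages.forEach(System.out::println);
    }
}
```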

This file was deleted.

