diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKRegistrator.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKRegistrator.java
index 6cbf84ab5d2..bff76a36715 100644
--- a/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKRegistrator.java
+++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKRegistrator.java
@@ -11,7 +11,6 @@
 import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSparkUtils;
 import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter;
 import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey;
-import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import org.broadinstitute.hellbender.utils.read.markduplicates.sparkrecords.*;
 
 import java.util.Collections;
@@ -93,7 +92,5 @@ private void registerGATKClasses(Kryo kryo) {
         kryo.register(MarkDuplicatesSparkUtils.IndexPair.class, new FieldSerializer(kryo, MarkDuplicatesSparkUtils.IndexPair.class));
         kryo.register(ReadsKey.class, new FieldSerializer(kryo, ReadsKey.class));
         kryo.register(ReadsKey.KeyForFragment.class, new FieldSerializer(kryo, ReadsKey.KeyForFragment.class));
-        kryo.register(ReadsKey.KeyForPair.class, new FieldSerializer(kryo, ReadsKey.KeyForPair.class));
-        kryo.register(SerializableOpticalDuplicatesFinder.class, new FieldSerializer(kryo, SerializableOpticalDuplicatesFinder.class));
-    }
+        kryo.register(ReadsKey.KeyForPair.class, new FieldSerializer(kryo, ReadsKey.KeyForPair.class));
     }
 }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/BwaAndMarkDuplicatesPipelineSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/BwaAndMarkDuplicatesPipelineSpark.java
index 006bbe5a73b..3a80995ed16 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/BwaAndMarkDuplicatesPipelineSpark.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/BwaAndMarkDuplicatesPipelineSpark.java
@@ -10,7 +10,6 @@
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
 import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
-import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
 import org.broadinstitute.hellbender.engine.filters.ReadFilter;
 import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
@@ -21,7 +20,6 @@
 import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSpark;
 import org.broadinstitute.hellbender.utils.read.GATKRead;
 import org.broadinstitute.hellbender.utils.read.ReadsWriteFormat;
-import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
 import picard.sam.markduplicates.util.OpticalDuplicateFinder;
 
 import java.io.IOException;
@@ -68,7 +66,7 @@ protected void runTool(final JavaSparkContext ctx) {
         try (final BwaSparkEngine bwaEngine = new BwaSparkEngine(ctx, referenceArguments.getReferenceFileName(), bwaArgs.indexImageFile, getHeaderForReads(), getReferenceSequenceDictionary())) {
             final ReadFilter filter = makeReadFilter(bwaEngine.getHeader());
             final JavaRDD<GATKRead> alignedReads = bwaEngine.alignPaired(getUnfilteredReads()).filter(filter::test);
-            final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, bwaEngine.getHeader(), new SerializableOpticalDuplicatesFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());
+            final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, bwaEngine.getHeader(), new OpticalDuplicateFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());
             try {
                 ReadsSparkSink.writeReads(ctx, output,
                         referenceArguments.getReferencePath().toAbsolutePath().toUri().toString(),
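A note on the registrator change above: dropping the wrapper class also drops its Kryo registration, and the surviving registrations all use the same pattern. Below is a minimal sketch of that pattern (hypothetical `RegistratorSketch`, not GATK code; assumes Kryo and Picard on the classpath). Kryo's `FieldSerializer` reads and writes an object's fields reflectively, so a registered class does not need to implement `java.io.Serializable`:

```java
import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.serializers.FieldSerializer;
import picard.sam.markduplicates.util.OpticalDuplicateFinder;

public class RegistratorSketch {
    // Registers a class with an explicit FieldSerializer, mirroring the
    // kryo.register(X.class, new FieldSerializer(kryo, X.class)) calls above.
    @SuppressWarnings({"rawtypes", "unchecked"})
    static void registerWithFieldSerializer(final Kryo kryo, final Class<?> clazz) {
        kryo.register(clazz, new FieldSerializer(kryo, clazz));
    }

    public static void main(String[] args) {
        final Kryo kryo = new Kryo();
        // With field-level serialization, Picard's OpticalDuplicateFinder can be
        // shipped through Spark without a Serializable wrapper subclass.
        registerWithFieldSerializer(kryo, OpticalDuplicateFinder.class);
    }
}
```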
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSpark.java
index 002940a4557..3b9061a03a6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSpark.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/ReadsPipelineSpark.java
@@ -31,11 +31,11 @@
 import org.broadinstitute.hellbender.utils.IntervalUtils;
 import org.broadinstitute.hellbender.utils.SimpleInterval;
 import org.broadinstitute.hellbender.utils.read.GATKRead;
-import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import org.broadinstitute.hellbender.utils.recalibration.RecalibrationArgumentCollection;
 import org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport;
 import org.broadinstitute.hellbender.utils.spark.SparkUtils;
 import org.broadinstitute.hellbender.utils.variant.GATKVariant;
+import picard.sam.markduplicates.util.OpticalDuplicateFinder;
 
 import java.util.Collection;
 import java.util.List;
@@ -174,7 +174,7 @@ protected void runTool(final JavaSparkContext ctx) {
             header = getHeaderForReads();
         }
 
-        final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, header, new SerializableOpticalDuplicatesFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());
+        final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, header, new OpticalDuplicateFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());
 
         // always coordinate-sort reads so BQSR can use queryLookaheadBases in FeatureDataSource
         final SAMFileHeader readsHeader = header.clone();
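With the wrapper gone, both pipelines construct Picard's `OpticalDuplicateFinder` directly. A small sketch of the two constructor shapes this diff uses (the trailing `null` is Picard's optional `Log` argument; the regex constant and pixel distance are the values used in the test changes further down):

```java
import picard.sam.markduplicates.util.OpticalDuplicateFinder;

public class FinderConstructionSketch {
    public static void main(String[] args) {
        // Default read-name regex and pixel distance:
        final OpticalDuplicateFinder defaultFinder = new OpticalDuplicateFinder();

        // Explicit regex and pixel distance, no logger (the trailing null):
        final OpticalDuplicateFinder tunedFinder = new OpticalDuplicateFinder(
                OpticalDuplicateFinder.DEFAULT_READ_NAME_REGEX, 2500, null);

        // opticalDuplicatePixelDistance is a public field, as the round-trip
        // test in this diff relies on.
        System.out.println(tunedFinder.opticalDuplicatePixelDistance);
    }
}
```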
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java
index f7ab7cb166c..3f5116ea207 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java
@@ -1,6 +1,5 @@
 package org.broadinstitute.hellbender.tools.spark.transforms.markduplicates;
 
-import htsjdk.samtools.Defaults;
 import htsjdk.samtools.SAMFileHeader;
 import htsjdk.samtools.metrics.MetricsFile;
 import org.apache.spark.Partitioner;
@@ -25,7 +24,6 @@
 import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter;
 import org.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics;
 import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
-import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import org.broadinstitute.hellbender.utils.spark.SparkUtils;
 import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
 import picard.sam.markduplicates.MarkDuplicates;
@@ -33,7 +31,6 @@
 import scala.Tuple2;
 
 import java.util.*;
-import java.util.stream.Collectors;
 
 @DocumentedFeature
 @CommandLineProgramProperties(
@@ -66,7 +63,7 @@ public final class MarkDuplicatesSpark extends GATKSparkTool {
             mutex = {MarkDuplicatesSparkArgumentCollection.DUPLICATE_TAGGING_POLICY_LONG_NAME, MarkDuplicatesSparkArgumentCollection.REMOVE_SEQUENCING_DUPLICATE_READS},
             optional = true)
     public boolean removeAllDuplicates = false;
 
-    @Argument(fullName = MarkDuplicatesSparkArgumentCollection.REMOVE_SEQUENCING_DUPLICATE_READS, doc = "If true do not write duplicates to the output file instead of writing them with appropriate flags set.",
+    @Argument(fullName = MarkDuplicatesSparkArgumentCollection.REMOVE_SEQUENCING_DUPLICATE_READS, doc = "If true, do not write optical/sequencing duplicates to the output file, instead of writing them out with the appropriate flags set.",
             mutex = {MarkDuplicatesSparkArgumentCollection.DUPLICATE_TAGGING_POLICY_LONG_NAME, MarkDuplicatesSparkArgumentCollection.REMOVE_ALL_DUPLICATE_READS},
             optional = true)
     public boolean removeSequencingDuplicates = false;
@@ -76,9 +73,9 @@ public List<ReadFilter> getDefaultReadFilters() {
     }
 
     // Reads with this marker will be treated as non-duplicates always
-    public static int MARKDUPLICATES_NO_OPTICAL_MARKER = -1;
+    public static int NO_OPTICAL_MARKER = -1;
     // Reads with this marker will be treated and marked as optical duplicates
-    public static int MARKDUPLICATES_OPPTICAL_DUPLICATE_MARKER = -2;
+    public static int OPTICAL_DUPLICATE_MARKER = -2;
 
     /**
      * Main method for marking duplicates, takes an JavaRDD of GATKRead and an associated SAMFileHeader with corresponding
@@ -135,8 +132,8 @@ public static JavaRDD<GATKRead> mark(final JavaRDD<GATKRead> reads, final SAMFileHeader header,
                 .peek(read -> {
                     // Handle reads that have been marked as non-duplicates (which also get tagged with optical duplicate summary statistics)
                     if (namesOfNonDuplicateReadsAndOpticalCounts.containsKey(read.getName())) {
-                        // If its an optical duplicate, mark it.
-                        if (namesOfNonDuplicateReadsAndOpticalCounts.get(read.getName()) == MARKDUPLICATES_OPPTICAL_DUPLICATE_MARKER) {
+                        // If it's an optical duplicate, mark it. (Note: we only expect these to exist if optical duplicate marking is on)
+                        if (namesOfNonDuplicateReadsAndOpticalCounts.get(read.getName()) == OPTICAL_DUPLICATE_MARKER) {
                             read.setIsDuplicate(true);
                             read.setAttribute(MarkDuplicates.DUPLICATE_TYPE_TAG, MarkDuplicates.DUPLICATE_TYPE_SEQUENCING);
@@ -144,7 +141,7 @@ public static JavaRDD<GATKRead> mark(final JavaRDD<GATKRead> reads, final SAMFileHeader header,
                         } else {
                             read.setIsDuplicate(false);
                             if (markUnmappedMates || !read.isUnmapped()) {
-                                int dupCount = namesOfNonDuplicateReadsAndOpticalCounts.replace(read.getName(), MARKDUPLICATES_NO_OPTICAL_MARKER);
+                                int dupCount = namesOfNonDuplicateReadsAndOpticalCounts.replace(read.getName(), NO_OPTICAL_MARKER);
                                 if (dupCount > -1) {
                                     ((SAMRecordToGATKReadAdapter) read).setTransientAttribute(MarkDuplicatesSparkUtils.OPTICAL_DUPLICATE_TOTAL_ATTRIBUTE_NAME, dupCount);
                                 }
@@ -228,7 +225,7 @@ public int getPartition(Object key) {
     protected void runTool(final JavaSparkContext ctx) {
         JavaRDD<GATKRead> reads = getReads();
         final OpticalDuplicateFinder finder = opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null ?
-                new SerializableOpticalDuplicatesFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE) : null;
+                new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null) : null;
 
         // If we need to remove optical duplicates, set the engine to mark optical duplicates using the DT tag.
         if (removeSequencingDuplicates && markDuplicatesSparkArgumentCollection.taggingPolicy == MarkDuplicates.DuplicateTaggingPolicy.DontTag) {
             markDuplicatesSparkArgumentCollection.taggingPolicy = MarkDuplicates.DuplicateTaggingPolicy.OpticalOnly;
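The renamed constants are sentinels in the name-to-optical-count map that `mark()` consults: a non-negative value is a real optical-duplicate count attached to a representative read, `NO_OPTICAL_MARKER` (-1) means plain non-duplicate with no statistics, and `OPTICAL_DUPLICATE_MARKER` (-2) means the read itself gets flagged as an optical/sequencing duplicate. A self-contained sketch of that protocol with stand-in types (not the GATK implementation):

```java
import java.util.HashMap;
import java.util.Map;

public class MarkerProtocolSketch {
    static final int NO_OPTICAL_MARKER = -1;        // treat as non-duplicate, no stats to attach
    static final int OPTICAL_DUPLICATE_MARKER = -2; // flag as an optical/sequencing duplicate

    public static void main(String[] args) {
        final Map<String, Integer> namesToCounts = new HashMap<>();
        namesToCounts.put("read1", 3);                        // non-duplicate with 3 optical dups in its group
        namesToCounts.put("read2", OPTICAL_DUPLICATE_MARKER); // an optical duplicate itself

        for (Map.Entry<String, Integer> e : namesToCounts.entrySet()) {
            if (e.getValue() == OPTICAL_DUPLICATE_MARKER) {
                System.out.println(e.getKey() + ": flag as duplicate, tag DT=SQ");
            } else {
                // replace() hands back the previous count and overwrites it with the
                // sentinel, so the count is only attached to one read of the pair
                // (value-only mutation, safe during entrySet iteration).
                final int dupCount = namesToCounts.replace(e.getKey(), NO_OPTICAL_MARKER);
                if (dupCount > -1) {
                    System.out.println(e.getKey() + ": non-duplicate, optical count " + dupCount);
                }
            }
        }
    }
}
```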
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java
index 28df09c968e..6ed2d7601dd 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java
@@ -387,7 +387,7 @@ private static List<Tuple2<IndexPair<String>, Integer>> handlePassthroughs(List passthroughs) {
         // Emit the passthrough reads as non-duplicates.
         return passthroughs.stream()
-                .map(pair -> new Tuple2<>(new IndexPair<>(pair.getName(), pair.getPartitionIndex()), MarkDuplicatesSpark.MARKDUPLICATES_NO_OPTICAL_MARKER))
+                .map(pair -> new Tuple2<>(new IndexPair<>(pair.getName(), pair.getPartitionIndex()), MarkDuplicatesSpark.NO_OPTICAL_MARKER))
                 .collect(Collectors.toList());
     }
 
@@ -426,7 +426,7 @@ private static int countOpticalDuplicates(OpticalDuplicateFinder finder, List scored) {
-                    opticalDuplicateList.add(new Tuple2<>(new IndexPair<>(scored.get(i).getName(), scored.get(i).getPartitionIndex()), MarkDuplicatesSpark.MARKDUPLICATES_OPPTICAL_DUPLICATE_MARKER));
+                    opticalDuplicateList.add(new Tuple2<>(new IndexPair<>(scored.get(i).getName(), scored.get(i).getPartitionIndex()), MarkDuplicatesSpark.OPTICAL_DUPLICATE_MARKER));
                 }
             }
         }
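In `handlePassthroughs`, every passthrough read is emitted as a (name/partition, `NO_OPTICAL_MARKER`) pair so the downstream `peek` in `mark()` treats it as a non-duplicate. A simplified, JDK-only sketch of that emission (`AbstractMap.SimpleEntry` standing in for `scala.Tuple2`, plain names standing in for `IndexPair`):

```java
import java.util.AbstractMap.SimpleEntry;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class PassthroughSketch {
    static final int NO_OPTICAL_MARKER = -1;

    // Every passthrough read becomes a (name, sentinel) pair.
    static List<SimpleEntry<String, Integer>> handlePassthroughs(final List<String> readNames) {
        return readNames.stream()
                .map(name -> new SimpleEntry<>(name, NO_OPTICAL_MARKER))
                .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        System.out.println(handlePassthroughs(Arrays.asList("read1", "read2")));
    }
}
```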
+ + " The reads in the two files must have exactly the same names and appear in the same order.", oneLineSummary = "Compares the base qualities of two SAM/BAM/CRAM files", programGroup = DiagnosticsAndQCProgramGroup.class ) public class CompareReads extends GATKTool { - @Argument(doc = "If output is given, the tool will return a bam with all the mismatching duplicate groups in the first specified file", - shortName = "I1", fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, optional = true) + @Argument(doc = "The first sam file against which to compare equality", + shortName = "I1", fullName = "input1", optional = false) protected String input1; - @Argument(doc = "If output is given, the tool will return a bam with all the mismatching duplicate groups in the second specified input file", - shortName = "I2", fullName = "output2", optional = true) + @Argument(doc = "The second sam file against which to compare equality", + shortName = "I2", fullName = "input2", optional = false) protected String input2; @@ -36,8 +36,8 @@ public class CompareReads extends GATKTool { public void traverse() { List errorMessages = new ArrayList<>(); - try( ReadsDataSource reads1 = new ReadsDataSource(IOUtils.getPath(input1)); - ReadsDataSource reads2 = new ReadsDataSource(IOUtils.getPath(input2));) { + try(ReadsDataSource reads1 = new ReadsDataSource(IOUtils.getPath(input1)); + ReadsDataSource reads2 = new ReadsDataSource(IOUtils.getPath(input2));) { final Iterator it1 = reads1.iterator(); final Iterator it2 = reads2.iterator(); while (it1.hasNext() && it2.hasNext()) { diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/SerializableOpticalDuplicatesFinder.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/SerializableOpticalDuplicatesFinder.java deleted file mode 100644 index 5e390c3a3c9..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/SerializableOpticalDuplicatesFinder.java +++ /dev/null @@ -1,29 +0,0 @@ -package org.broadinstitute.hellbender.utils.read.markduplicates; - -import com.esotericsoftware.kryo.Kryo; -import com.esotericsoftware.kryo.io.Input; -import com.esotericsoftware.kryo.io.Output; -import org.broadinstitute.hellbender.tools.spark.pathseq.PSKmerBloomFilter; -import picard.sam.markduplicates.util.OpticalDuplicateFinder; - -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.io.Serializable; - -/** - * An intermediate class flagged as being serializable so that the Picard OpticalDuplicateFinder can be serialized despite - * it not being marked as such. Since there is no internal state beyond initialization settings in the picard OpticalDuplicates - * finder this should not cause serialization problems. 
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/SerializableOpticalDuplicatesFinder.java b/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/SerializableOpticalDuplicatesFinder.java
deleted file mode 100644
index 5e390c3a3c9..00000000000
--- a/src/main/java/org/broadinstitute/hellbender/utils/read/markduplicates/SerializableOpticalDuplicatesFinder.java
+++ /dev/null
@@ -1,29 +0,0 @@
-package org.broadinstitute.hellbender.utils.read.markduplicates;
-
-import com.esotericsoftware.kryo.Kryo;
-import com.esotericsoftware.kryo.io.Input;
-import com.esotericsoftware.kryo.io.Output;
-import org.broadinstitute.hellbender.tools.spark.pathseq.PSKmerBloomFilter;
-import picard.sam.markduplicates.util.OpticalDuplicateFinder;
-
-import java.io.IOException;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
-import java.io.Serializable;
-
-/**
- * An intermediate class flagged as being serializable so that the Picard OpticalDuplicateFinder can be serialized despite
- * it not being marked as such. Since there is no internal state beyond initialization settings in the picard OpticalDuplicates
- * finder this should not cause serialization problems.
- */
-public class SerializableOpticalDuplicatesFinder extends OpticalDuplicateFinder implements Serializable{
-    private static final long serialVersionUID = 1L;
-
-    public SerializableOpticalDuplicatesFinder(String read_name_regex, int optical_duplicate_pixel_distance) {
-        super(read_name_regex, optical_duplicate_pixel_distance, null);
-    }
-
-    public SerializableOpticalDuplicatesFinder() {
-        super();
-    }
-}
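For the record, the deleted wrapper relied on a standard Java idiom: a subclass may declare `implements Serializable` even though its superclass does not, provided the superclass keeps an accessible no-arg constructor for rebuilding its portion of the object during deserialization. A hypothetical stripped-down version of the same idiom:

```java
import java.io.Serializable;

// Stands in for picard's OpticalDuplicateFinder, which is not Serializable.
class ThirdPartyFinder {
    ThirdPartyFinder() {}                                   // no-arg constructor required by the idiom
    ThirdPartyFinder(String regex, int pixelDistance) {}
}

// The subclass opts in to java.io serialization on the superclass's behalf.
class SerializableFinder extends ThirdPartyFinder implements Serializable {
    private static final long serialVersionUID = 1L;

    SerializableFinder(String regex, int pixelDistance) {
        super(regex, pixelDistance);
    }

    SerializableFinder() {
        super();
    }
}
```

With Spark serializing through Kryo, which field-serializes classes whether or not they implement `java.io.Serializable`, the wrapper no longer earns its keep, which is presumably why this PR deletes both the class and its Kryo registration and passes `OpticalDuplicateFinder` around directly.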
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java
index 1e49bb7d3c1..312855b194e 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/MarkDuplicatesSparkIntegrationTest.java
@@ -103,9 +103,8 @@ public void testMarkDuplicatesSparkIntegrationTestLocal(
             Map<String, List<Object>> metricsExpected) throws IOException {
 
         ArgumentsBuilder args = new ArgumentsBuilder();
-        args.add("--"+ StandardArgumentDefinitions.INPUT_LONG_NAME);
-        args.add(input.getPath());
-        args.add("--"+StandardArgumentDefinitions.OUTPUT_LONG_NAME);
+        args.addArgument(StandardArgumentDefinitions.INPUT_LONG_NAME, input.getPath());
+        args.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME);
 
         File outputFile = createTempFile("markdups", ".bam");
         outputFile.delete();
@@ -140,21 +139,15 @@ public void testMarkDuplicatesSparkIntegrationTestLocal(
         } catch (final FileNotFoundException ex) {
             System.err.println("Metrics file not found: " + ex);
         }
-        final List<GATKDuplicationMetrics> nonEmptyMetrics = metricsOutput.getMetrics().stream().filter(
-                metric ->
-                        metric.UNPAIRED_READS_EXAMINED != 0L ||
-                        metric.READ_PAIRS_EXAMINED != 0L ||
-                        metric.UNMAPPED_READS != 0L ||
-                        metric.UNPAIRED_READ_DUPLICATES != 0L ||
-                        metric.READ_PAIR_DUPLICATES != 0L ||
-                        metric.READ_PAIR_OPTICAL_DUPLICATES != 0L ||
-                        (metric.PERCENT_DUPLICATION != null && metric.PERCENT_DUPLICATION != 0.0 && !Double.isNaN(metric.PERCENT_DUPLICATION)) ||
-                        (metric.ESTIMATED_LIBRARY_SIZE != null && metric.ESTIMATED_LIBRARY_SIZE != 0L)
-        ).collect(Collectors.toList());
+        final List<GATKDuplicationMetrics> nonEmptyMetrics = getGatkDuplicationMetrics(metricsOutput);
 
         Assert.assertEquals(nonEmptyMetrics.size(), metricsExpected.size(), "Wrong number of metrics with non-zero fields.");
-        for (int i = 0; i < nonEmptyMetrics.size(); i++ ){
+        compareMetricsToExpected(metricsExpected, nonEmptyMetrics);
+    }
+
+    protected void compareMetricsToExpected(Map<String, List<Object>> metricsExpected, List<GATKDuplicationMetrics> nonEmptyMetrics) {
+        for (int i = 0; i < nonEmptyMetrics.size(); i++) {
             final GATKDuplicationMetrics observedMetrics = nonEmptyMetrics.get(i);
             List<Object> expectedList = metricsExpected.get(observedMetrics.LIBRARY);
             Assert.assertNotNull(expectedList, "Unexpected library found: " + observedMetrics.LIBRARY);
@@ -168,12 +161,27 @@ public void testMarkDuplicatesSparkIntegrationTestLocal(
 
             //Note: IntelliJ does not like it when a parameter for a test is null (can't print it and skips the test)
             //so we work around it by passing in an 'expected 0L' and only comparing to it if the actual value is non-null
-            if (observedMetrics.ESTIMATED_LIBRARY_SIZE != null && (Long)expectedList.get(7) != 0L) {
+            if (observedMetrics.ESTIMATED_LIBRARY_SIZE != null && (Long) expectedList.get(7) != 0L) {
                 Assert.assertEquals(observedMetrics.ESTIMATED_LIBRARY_SIZE, expectedList.get(7));
             }
         }
     }
+
+    protected List<GATKDuplicationMetrics> getGatkDuplicationMetrics(MetricsFile<GATKDuplicationMetrics, Double> metricsOutput) {
+        return metricsOutput.getMetrics().stream().filter(
+                metric ->
+                        metric.UNPAIRED_READS_EXAMINED != 0L ||
+                        metric.READ_PAIRS_EXAMINED != 0L ||
+                        metric.UNMAPPED_READS != 0L ||
+                        metric.UNPAIRED_READ_DUPLICATES != 0L ||
+                        metric.READ_PAIR_DUPLICATES != 0L ||
+                        metric.READ_PAIR_OPTICAL_DUPLICATES != 0L ||
+                        (metric.PERCENT_DUPLICATION != null && metric.PERCENT_DUPLICATION != 0.0 && !Double.isNaN(metric.PERCENT_DUPLICATION)) ||
+                        (metric.ESTIMATED_LIBRARY_SIZE != null && metric.ESTIMATED_LIBRARY_SIZE != 0L)
+        ).collect(Collectors.toList());
+    }
+
 
     @Test( dataProvider = "md")
     public void testMarkDuplicatesSparkMarkingOpticalDuplicatesWithTagging(
             final File input, final long totalExpected, final long dupsExpected,
@@ -207,17 +215,7 @@ public void testMarkDuplicatesSparkMarkingOpticalDuplicatesWithTagging(
         } catch (final FileNotFoundException ex) {
             System.err.println("Metrics file not found: " + ex);
         }
-        final List<GATKDuplicationMetrics> nonEmptyMetrics = metricsOutput.getMetrics().stream().filter(
-                metric ->
-                        metric.UNPAIRED_READS_EXAMINED != 0L ||
-                        metric.READ_PAIRS_EXAMINED != 0L ||
-                        metric.UNMAPPED_READS != 0L ||
-                        metric.UNPAIRED_READ_DUPLICATES != 0L ||
-                        metric.READ_PAIR_DUPLICATES != 0L ||
-                        metric.READ_PAIR_OPTICAL_DUPLICATES != 0L ||
-                        (metric.PERCENT_DUPLICATION != null && metric.PERCENT_DUPLICATION != 0.0 && !Double.isNaN(metric.PERCENT_DUPLICATION)) ||
-                        (metric.ESTIMATED_LIBRARY_SIZE != null && metric.ESTIMATED_LIBRARY_SIZE != 0L)
-        ).collect(Collectors.toList());
+        final List<GATKDuplicationMetrics> nonEmptyMetrics = getGatkDuplicationMetrics(metricsOutput);
 
         int totalReads = 0;
@@ -282,17 +280,7 @@ public void testMarkDuplicatesSparkMarkingAllDuplicatesWithTagging(
         } catch (final FileNotFoundException ex) {
             System.err.println("Metrics file not found: " + ex);
         }
-        final List<GATKDuplicationMetrics> nonEmptyMetrics = metricsOutput.getMetrics().stream().filter(
-                metric ->
-                        metric.UNPAIRED_READS_EXAMINED != 0L ||
-                        metric.READ_PAIRS_EXAMINED != 0L ||
-                        metric.UNMAPPED_READS != 0L ||
-                        metric.UNPAIRED_READ_DUPLICATES != 0L ||
-                        metric.READ_PAIR_DUPLICATES != 0L ||
-                        metric.READ_PAIR_OPTICAL_DUPLICATES != 0L ||
-                        (metric.PERCENT_DUPLICATION != null && metric.PERCENT_DUPLICATION != 0.0 && !Double.isNaN(metric.PERCENT_DUPLICATION)) ||
-                        (metric.ESTIMATED_LIBRARY_SIZE != null && metric.ESTIMATED_LIBRARY_SIZE != 0L)
-        ).collect(Collectors.toList());
+        final List<GATKDuplicationMetrics> nonEmptyMetrics = getGatkDuplicationMetrics(metricsOutput);
 
         int totalReads = 0;
@@ -374,41 +362,15 @@ public void testMarkDuplicatesSparkDeletingDuplicateReads(
         } catch (final FileNotFoundException ex) {
             System.err.println("Metrics file not found: " + ex);
         }
-        final List<GATKDuplicationMetrics> nonEmptyMetrics = metricsOutput.getMetrics().stream().filter(
-                metric ->
-                        metric.UNPAIRED_READS_EXAMINED != 0L ||
-                        metric.READ_PAIRS_EXAMINED != 0L ||
-                        metric.UNMAPPED_READS != 0L ||
-                        metric.UNPAIRED_READ_DUPLICATES != 0L ||
-                        metric.READ_PAIR_DUPLICATES != 0L ||
-                        metric.READ_PAIR_OPTICAL_DUPLICATES != 0L ||
-                        (metric.PERCENT_DUPLICATION != null && metric.PERCENT_DUPLICATION != 0.0 && !Double.isNaN(metric.PERCENT_DUPLICATION)) ||
-                        (metric.ESTIMATED_LIBRARY_SIZE != null && metric.ESTIMATED_LIBRARY_SIZE != 0L)
-        ).collect(Collectors.toList());
+        final List<GATKDuplicationMetrics> nonEmptyMetrics = getGatkDuplicationMetrics(metricsOutput);
 
         // Assert that the metrics haven't changed at all
         Assert.assertEquals(nonEmptyMetrics.size(), metricsExpected.size(), "Wrong number of metrics with non-zero fields.");
-        for (int i = 0; i < nonEmptyMetrics.size(); i++ ){
-            final GATKDuplicationMetrics observedMetrics = nonEmptyMetrics.get(i);
-            List<Object> expectedList = metricsExpected.get(observedMetrics.LIBRARY);
-            Assert.assertNotNull(expectedList, "Unexpected library found: " + observedMetrics.LIBRARY);
-            Assert.assertEquals(observedMetrics.UNPAIRED_READS_EXAMINED, expectedList.get(0));
-            Assert.assertEquals(observedMetrics.READ_PAIRS_EXAMINED, expectedList.get(1));
-            Assert.assertEquals(observedMetrics.UNMAPPED_READS, expectedList.get(2));
-            Assert.assertEquals(observedMetrics.UNPAIRED_READ_DUPLICATES, expectedList.get(3));
-            Assert.assertEquals(observedMetrics.READ_PAIR_DUPLICATES, expectedList.get(4));
-            Assert.assertEquals(observedMetrics.READ_PAIR_OPTICAL_DUPLICATES, expectedList.get(5));
-            Assert.assertEquals(observedMetrics.PERCENT_DUPLICATION, expectedList.get(6));
-
-            //Note: IntelliJ does not like it when a parameter for a test is null (can't print it and skips the test)
-            //so we work around it by passing in an 'expected 0L' and only comparing to it if the actual value is non-null
-            if (observedMetrics.ESTIMATED_LIBRARY_SIZE != null && (Long)expectedList.get(7) != 0L) {
-                Assert.assertEquals(observedMetrics.ESTIMATED_LIBRARY_SIZE, expectedList.get(7));
-            }
-        }
+        compareMetricsToExpected(metricsExpected, nonEmptyMetrics);
     }
 
+
     @Test( dataProvider = "md")
     public void testMarkDuplicatesSparkDeletingOpticalDuplicateReads(
             final File input, final long totalExpected, final long dupsExpected,
@@ -451,17 +413,7 @@ public void testMarkDuplicatesSparkDeletingOpticalDuplicateReads(
         } catch (final FileNotFoundException ex) {
             System.err.println("Metrics file not found: " + ex);
         }
-        final List<GATKDuplicationMetrics> nonEmptyMetrics = metricsOutput.getMetrics().stream().filter(
-                metric ->
-                        metric.UNPAIRED_READS_EXAMINED != 0L ||
-                        metric.READ_PAIRS_EXAMINED != 0L ||
-                        metric.UNMAPPED_READS != 0L ||
-                        metric.UNPAIRED_READ_DUPLICATES != 0L ||
-                        metric.READ_PAIR_DUPLICATES != 0L ||
-                        metric.READ_PAIR_OPTICAL_DUPLICATES != 0L ||
-                        (metric.PERCENT_DUPLICATION != null && metric.PERCENT_DUPLICATION != 0.0 && !Double.isNaN(metric.PERCENT_DUPLICATION)) ||
-                        (metric.ESTIMATED_LIBRARY_SIZE != null && metric.ESTIMATED_LIBRARY_SIZE != 0L)
-        ).collect(Collectors.toList());
+        final List<GATKDuplicationMetrics> nonEmptyMetrics = getGatkDuplicationMetrics(metricsOutput);
 
         // Asserting
         int expectedOpticalDuplicatesGroups = 0;
@@ -470,7 +422,7 @@ public void testMarkDuplicatesSparkDeletingOpticalDuplicateReads(
             List<Object> expectedList = metricsExpected.get(observedMetrics.LIBRARY);
             expectedOpticalDuplicatesGroups += (Long) expectedList.get(5);
         }
-        // NOTE: this test will fail if we add a more comprehensive example set with optical duplicates containing secondary/supplementary reads
+        // NOTE: this test will fail if we add a more comprehensive example set with optical duplicates containing secondary/supplementary reads, because of how the test counts optical duplicate reads.
         Assert.assertEquals(totalReads, totalExpected - expectedOpticalDuplicatesGroups*2, "Wrong number of reads in output BAM");
         Assert.assertEquals(duplicateReads, dupsExpected - expectedOpticalDuplicatesGroups*2, "Wrong number of duplicate reads in output BAM");
     }
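The same multi-line "non-empty metrics" filter previously appeared verbatim in five tests; it is now the single `getGatkDuplicationMetrics` helper, and the per-library assertion loop becomes `compareMetricsToExpected`. A compact illustration of the extraction on a stand-in metric type (hypothetical names, not the test code):

```java
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class NonEmptyMetricsSketch {
    static class Metric {
        long readPairsExamined;
        long unmappedReads;
    }

    // One predicate in one place; each test calls this instead of repeating
    // the multi-line inline filter.
    static List<Metric> nonEmpty(final List<Metric> metrics) {
        return metrics.stream()
                .filter(m -> m.readPairsExamined != 0L || m.unmappedReads != 0L)
                .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        final Metric empty = new Metric();
        final Metric used = new Metric();
        used.readPairsExamined = 42L;
        System.out.println(nonEmpty(Arrays.asList(empty, used)).size()); // prints 1
    }
}
```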
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUnitTest.java
index 3f84fc3549a..2bf22b43791 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUnitTest.java
@@ -8,7 +8,6 @@
 import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource;
 import org.broadinstitute.hellbender.utils.read.GATKRead;
 import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
-import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import picard.sam.markduplicates.MarkDuplicates;
 import picard.sam.markduplicates.util.OpticalDuplicateFinder;
 import org.broadinstitute.hellbender.GATKBaseTest;
@@ -41,7 +40,7 @@ public void markDupesTest(final String input, final long totalExpected, final long dupsExpected) {
         SAMFileHeader header = readSource.getHeader(input, null);
         OpticalDuplicatesArgumentCollection opticalDuplicatesArgumentCollection = new OpticalDuplicatesArgumentCollection();
         final OpticalDuplicateFinder finder = opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null ?
-                new SerializableOpticalDuplicatesFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE) : null;
+                new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null) : null;
 
         JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(reads, header, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES, finder, 1, false, MarkDuplicates.DuplicateTaggingPolicy.DontTag);
         Assert.assertEquals(markedReads.count(), totalExpected);
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtilsUnitTest.java
index abc38cc36ef..63cc3282e8c 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtilsUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtilsUnitTest.java
@@ -18,7 +18,6 @@
 import org.broadinstitute.hellbender.utils.read.GATKRead;
 import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter;
 import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
-import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import org.testng.Assert;
 import org.testng.annotations.Test;
 import picard.sam.markduplicates.MarkDuplicates;
@@ -125,8 +124,8 @@ public void testSortOrderPartitioningCorrectness() throws IOException {
         sortedHeader.setSortOrder(SAMFileHeader.SortOrder.queryname);
 
         // Using the header flagged as unsorted will result in the reads being sorted again
-        JavaRDD<GATKRead> unsortedReadsMarked = MarkDuplicatesSpark.mark(unsortedReads,unsortedHeader, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES,new SerializableOpticalDuplicatesFinder(),100,true,MarkDuplicates.DuplicateTaggingPolicy.DontTag);
-        JavaRDD<GATKRead> sortedReadsMarked = MarkDuplicatesSpark.mark(pariedEndsQueryGrouped,sortedHeader, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES,new SerializableOpticalDuplicatesFinder(),1, true, MarkDuplicates.DuplicateTaggingPolicy.DontTag);
+        JavaRDD<GATKRead> unsortedReadsMarked = MarkDuplicatesSpark.mark(unsortedReads,unsortedHeader, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES,new OpticalDuplicateFinder(),100,true,MarkDuplicates.DuplicateTaggingPolicy.DontTag);
+        JavaRDD<GATKRead> sortedReadsMarked = MarkDuplicatesSpark.mark(pariedEndsQueryGrouped,sortedHeader, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES,new OpticalDuplicateFinder(),1, true, MarkDuplicates.DuplicateTaggingPolicy.DontTag);
 
         Iterator<GATKRead> sortedReadsFinal = sortedReadsMarked.sortBy(GATKRead::commonToString, false, 1).collect().iterator();
         Iterator<GATKRead> unsortedReadsFinal = unsortedReadsMarked.sortBy(GATKRead::commonToString, false, 1).collect().iterator();
@@ -172,9 +171,9 @@ public void testChangingContigsOnHeaderlessSAMRecord() {
         final SparkConf conf = new SparkConf().set("spark.kryo.registrator",
                 "org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSparkUtilsUnitTest$TestGATKRegistrator");
         final SAMRecord read = ((SAMRecordToGATKReadAdapter) ArtificialReadUtils.createHeaderlessSamBackedRead("read1", "1", 100, 50)).getEncapsulatedSamRecord();
-        final SerializableOpticalDuplicatesFinder finder = new SerializableOpticalDuplicatesFinder(OpticalDuplicateFinder.DEFAULT_READ_NAME_REGEX,2500);
+        final OpticalDuplicateFinder finder = new OpticalDuplicateFinder(OpticalDuplicateFinder.DEFAULT_READ_NAME_REGEX,2500, null);
 
-        final SerializableOpticalDuplicatesFinder roundTrippedRead = SparkTestUtils.roundTripInKryo(finder, SerializableOpticalDuplicatesFinder.class, conf);
+        final OpticalDuplicateFinder roundTrippedRead = SparkTestUtils.roundTripInKryo(finder, OpticalDuplicateFinder.class, conf);
         Assert.assertEquals(roundTrippedRead.opticalDuplicatePixelDistance, finder.opticalDuplicatePixelDistance);
     }
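The round-trip assertion above now exercises Picard's `OpticalDuplicateFinder` directly through `SparkTestUtils.roundTripInKryo`. A simplified stand-in for what such a round trip does, using raw Kryo rather than the GATK/Spark helper:

```java
import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;

public class KryoRoundTripSketch {
    // Serialize a value to bytes and immediately deserialize it back;
    // surviving the round trip is the property the test asserts on.
    static <T> T roundTrip(final Kryo kryo, final T value, final Class<T> clazz) {
        final ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (Output out = new Output(bytes)) {
            kryo.writeObject(out, value);
        }
        try (Input in = new Input(new ByteArrayInputStream(bytes.toByteArray()))) {
            return kryo.readObject(in, clazz);
        }
    }

    public static void main(String[] args) {
        final Kryo kryo = new Kryo();
        kryo.setRegistrationRequired(false); // depending on Kryo version/config
        System.out.println(roundTrip(kryo, "hello", String.class)); // prints hello
    }
}
```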
diff --git a/src/testUtils/java/org/broadinstitute/hellbender/testutils/ArgumentsBuilder.java b/src/testUtils/java/org/broadinstitute/hellbender/testutils/ArgumentsBuilder.java
index f39c86b170e..f01cc05e91e 100644
--- a/src/testUtils/java/org/broadinstitute/hellbender/testutils/ArgumentsBuilder.java
+++ b/src/testUtils/java/org/broadinstitute/hellbender/testutils/ArgumentsBuilder.java
@@ -124,6 +124,15 @@ public ArgumentsBuilder addArgument(final String argumentName, final String argumentValue) {
         return this;
     }
 
+    /**
+     * Add an argument that takes no value to this builder
+     */
+    public ArgumentsBuilder addArgument(final String argumentName) {
+        Utils.nonNull(argumentName);
+        add("--" + argumentName);
+        return this;
+    }
+
     /**
      * Add an argument with a given value to this builder without splitting the value string into multiple arguments at whitespace.
      * This is specifically to allow arguments with values that contain whitespace.
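Usage of the new no-value overload next to the existing name-plus-value one, mirroring the integration-test change earlier in this diff (assumes the GATK test utilities on the classpath; the output file value is appended separately by the test):

```java
import java.io.File;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;

public class ArgumentsBuilderUsageSketch {
    static ArgumentsBuilder markDuplicatesArgs(final File input) {
        return new ArgumentsBuilder()
                // name + value overload: emits "--input <path>"
                .addArgument(StandardArgumentDefinitions.INPUT_LONG_NAME, input.getPath())
                // new bare overload: emits just "--output", no value
                .addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME);
    }
}
```

Both overloads return `this`, so the builder keeps chaining cleanly, which is what lets the three separate `args.add(...)` calls in the test collapse to two.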