responded to comments without deleting the class
jamesemery committed Nov 6, 2018
1 parent ce1b586 commit 298349b
Showing 11 changed files with 66 additions and 144 deletions.
@@ -11,7 +11,6 @@
 import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSparkUtils;
 import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter;
 import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey;
-import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import org.broadinstitute.hellbender.utils.read.markduplicates.sparkrecords.*;
 
 import java.util.Collections;
@@ -93,7 +92,5 @@ private void registerGATKClasses(Kryo kryo) {
         kryo.register(MarkDuplicatesSparkUtils.IndexPair.class, new FieldSerializer(kryo, MarkDuplicatesSparkUtils.IndexPair.class));
         kryo.register(ReadsKey.class, new FieldSerializer(kryo, ReadsKey.class));
         kryo.register(ReadsKey.KeyForFragment.class, new FieldSerializer(kryo, ReadsKey.KeyForFragment.class));
-        kryo.register(ReadsKey.KeyForPair.class, new FieldSerializer(kryo, ReadsKey.KeyForPair.class));
-        kryo.register(SerializableOpticalDuplicatesFinder.class, new FieldSerializer(kryo, SerializableOpticalDuplicatesFinder.class));
-    }
+        kryo.register(ReadsKey.KeyForPair.class, new FieldSerializer(kryo, ReadsKey.KeyForPair.class)); }
 }
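The hunk above drops both the Kryo registration for the deleted SerializableOpticalDuplicatesFinder and its import. For readers unfamiliar with the pattern, here is a minimal, self-contained sketch of the FieldSerializer registration used in registerGATKClasses; ExampleRegistrator and Key are hypothetical stand-ins, not GATK classes:

```java
import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.serializers.FieldSerializer;

// Hypothetical sketch of the registration pattern in the diff above.
// FieldSerializer serializes an object by reflecting over its fields;
// GATK registers one per class that Spark ships between executors.
public class ExampleRegistrator {
    static class Key {          // stand-in for e.g. ReadsKey.KeyForPair
        long keyValue;
        String library;
    }

    @SuppressWarnings({"unchecked", "rawtypes"})
    public void registerExample(final Kryo kryo) {
        kryo.register(Key.class, new FieldSerializer(kryo, Key.class));
    }

    public static void main(String[] args) {
        new ExampleRegistrator().registerExample(new Kryo());
        System.out.println("registered");
    }
}
```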
@@ -10,7 +10,6 @@
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.cmdline.argumentcollections.SequenceDictionaryValidationArgumentCollection;
 import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
-import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
 import org.broadinstitute.hellbender.engine.filters.ReadFilter;
 import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
@@ -21,7 +20,6 @@
 import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSpark;
 import org.broadinstitute.hellbender.utils.read.GATKRead;
 import org.broadinstitute.hellbender.utils.read.ReadsWriteFormat;
-import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
 import picard.sam.markduplicates.util.OpticalDuplicateFinder;
 
 import java.io.IOException;
@@ -68,7 +66,7 @@ protected void runTool(final JavaSparkContext ctx) {
         try (final BwaSparkEngine bwaEngine = new BwaSparkEngine(ctx, referenceArguments.getReferenceFileName(), bwaArgs.indexImageFile, getHeaderForReads(), getReferenceSequenceDictionary())) {
             final ReadFilter filter = makeReadFilter(bwaEngine.getHeader());
             final JavaRDD<GATKRead> alignedReads = bwaEngine.alignPaired(getUnfilteredReads()).filter(filter::test);
-            final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, bwaEngine.getHeader(), new SerializableOpticalDuplicatesFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());
+            final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, bwaEngine.getHeader(), new OpticalDuplicateFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());
             try {
                 ReadsSparkSink.writeReads(ctx, output,
                         referenceArguments.getReferencePath().toAbsolutePath().toUri().toString(),
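This hunk swaps the GATK SerializableOpticalDuplicatesFinder wrapper for picard's OpticalDuplicateFinder, constructed with its no-arg constructor exactly as the + line shows. A brief self-contained sketch of that construction (the printout is illustrative only):

```java
import picard.sam.markduplicates.util.OpticalDuplicateFinder;

public class DefaultFinderExample {
    public static void main(String[] args) {
        // No-arg constructor: picard's default read-name regex and default
        // optical pixel distance. This instance is what the pipeline now
        // passes straight into MarkDuplicatesSpark.mark(...) above; it is
        // serialized into the Spark closure, so it must be serializable.
        final OpticalDuplicateFinder finder = new OpticalDuplicateFinder();
        System.out.println("constructed: " + finder);
    }
}
```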
@@ -31,11 +31,11 @@
 import org.broadinstitute.hellbender.utils.IntervalUtils;
 import org.broadinstitute.hellbender.utils.SimpleInterval;
 import org.broadinstitute.hellbender.utils.read.GATKRead;
-import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import org.broadinstitute.hellbender.utils.recalibration.RecalibrationArgumentCollection;
 import org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport;
 import org.broadinstitute.hellbender.utils.spark.SparkUtils;
 import org.broadinstitute.hellbender.utils.variant.GATKVariant;
+import picard.sam.markduplicates.util.OpticalDuplicateFinder;
 
 import java.util.Collection;
 import java.util.List;
@@ -174,7 +174,7 @@ protected void runTool(final JavaSparkContext ctx) {
             header = getHeaderForReads();
         }
 
-        final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, header, new SerializableOpticalDuplicatesFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());
+        final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(alignedReads, header, new OpticalDuplicateFinder(), markDuplicatesSparkArgumentCollection, getRecommendedNumReducers());
 
         // always coordinate-sort reads so BQSR can use queryLookaheadBases in FeatureDataSource
         final SAMFileHeader readsHeader = header.clone();
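The unchanged context above ends just as the pipeline clones the header before coordinate-sorting for BQSR. A small htsjdk sketch of that clone-then-set-sort-order pattern (variable names hypothetical):

```java
import htsjdk.samtools.SAMFileHeader;

public class HeaderSortExample {
    public static void main(String[] args) {
        final SAMFileHeader header = new SAMFileHeader();
        header.setSortOrder(SAMFileHeader.SortOrder.queryname);

        // Clone before mutating so callers holding the original header are
        // unaffected; downstream BQSR requires coordinate-sorted input.
        final SAMFileHeader readsHeader = header.clone();
        readsHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);

        System.out.println(header.getSortOrder() + " -> " + readsHeader.getSortOrder());
    }
}
```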
@@ -1,6 +1,5 @@
 package org.broadinstitute.hellbender.tools.spark.transforms.markduplicates;
 
-import htsjdk.samtools.Defaults;
 import htsjdk.samtools.SAMFileHeader;
 import htsjdk.samtools.metrics.MetricsFile;
 import org.apache.spark.Partitioner;
@@ -25,15 +24,13 @@
 import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter;
 import org.broadinstitute.hellbender.utils.read.markduplicates.GATKDuplicationMetrics;
 import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy;
-import org.broadinstitute.hellbender.utils.read.markduplicates.SerializableOpticalDuplicatesFinder;
 import org.broadinstitute.hellbender.utils.spark.SparkUtils;
 import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
 import picard.sam.markduplicates.MarkDuplicates;
 import picard.sam.markduplicates.util.OpticalDuplicateFinder;
 import scala.Tuple2;
 
 import java.util.*;
-import java.util.stream.Collectors;
 
 @DocumentedFeature
 @CommandLineProgramProperties(
@@ -66,7 +63,7 @@ public final class MarkDuplicatesSpark extends GATKSparkTool {
             mutex = {MarkDuplicatesSparkArgumentCollection.DUPLICATE_TAGGING_POLICY_LONG_NAME, MarkDuplicatesSparkArgumentCollection.REMOVE_SEQUENCING_DUPLICATE_READS}, optional = true)
     public boolean removeAllDuplicates = false;
 
-    @Argument(fullName = MarkDuplicatesSparkArgumentCollection.REMOVE_SEQUENCING_DUPLICATE_READS, doc = "If true do not write duplicates to the output file instead of writing them with appropriate flags set.",
+    @Argument(fullName = MarkDuplicatesSparkArgumentCollection.REMOVE_SEQUENCING_DUPLICATE_READS, doc = "If true do not write optical/sequencing duplicates to the output file instead of writing them with appropriate flags set.",
             mutex = {MarkDuplicatesSparkArgumentCollection.DUPLICATE_TAGGING_POLICY_LONG_NAME, MarkDuplicatesSparkArgumentCollection.REMOVE_ALL_DUPLICATE_READS}, optional = true)
     public boolean removeSequencingDuplicates = false;
 
@@ -76,9 +73,9 @@ public List<ReadFilter> getDefaultReadFilters() {
     }
 
     // Reads with this marker will be treated as non-duplicates always
-    public static int MARKDUPLICATES_NO_OPTICAL_MARKER = -1;
+    public static int NO_OPTICAL_MARKER = -1;
     // Reads with this marker will be treated and marked as optical duplicates
-    public static int MARKDUPLICATES_OPPTICAL_DUPLICATE_MARKER = -2;
+    public static int OPTICAL_DUPLICATE_MARKER = -2;
 
     /**
      * Main method for marking duplicates, takes an JavaRDD of GATKRead and an associated SAMFileHeader with corresponding
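The rename above also fixes the misspelling in MARKDUPLICATES_OPPTICAL_DUPLICATE_MARKER. Semantics are unchanged: the per-read-name map holds either a non-negative optical-duplicate count or one of these two negative sentinels. A hypothetical sketch of how the three states decode:

```java
import java.util.HashMap;
import java.util.Map;

public class MarkerExample {
    // Values mirror the constants in the diff above.
    static final int NO_OPTICAL_MARKER = -1;        // non-duplicate, no count to record
    static final int OPTICAL_DUPLICATE_MARKER = -2; // mark as optical/sequencing duplicate

    public static void main(String[] args) {
        final Map<String, Integer> byName = new HashMap<>();
        byName.put("readA", 3);                        // representative read, 3 optical dups in group
        byName.put("readB", NO_OPTICAL_MARKER);        // plain non-duplicate
        byName.put("readC", OPTICAL_DUPLICATE_MARKER); // optical duplicate, gets the DT tag

        byName.forEach((name, v) -> {
            if (v == OPTICAL_DUPLICATE_MARKER) {
                System.out.println(name + ": duplicate (optical)");
            } else if (v == NO_OPTICAL_MARKER) {
                System.out.println(name + ": non-duplicate");
            } else {
                System.out.println(name + ": non-duplicate, optical count " + v);
            }
        });
    }
}
```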
@@ -135,16 +132,16 @@ public static JavaRDD<GATKRead> mark(final JavaRDD<GATKRead> reads, final SAMFil
                 .peek(read -> {
                     // Handle reads that have been marked as non-duplicates (which also get tagged with optical duplicate summary statistics)
                     if (namesOfNonDuplicateReadsAndOpticalCounts.containsKey(read.getName())) {
-                        // If its an optical duplicate, mark it.
-                        if (namesOfNonDuplicateReadsAndOpticalCounts.get(read.getName()) == MARKDUPLICATES_OPPTICAL_DUPLICATE_MARKER) {
+                        // If its an optical duplicate, mark it. (Note: we only expect these to exist if optical duplicate marking is on)
+                        if (namesOfNonDuplicateReadsAndOpticalCounts.get(read.getName()) == OPTICAL_DUPLICATE_MARKER) {
                             read.setIsDuplicate(true);
                             read.setAttribute(MarkDuplicates.DUPLICATE_TYPE_TAG, MarkDuplicates.DUPLICATE_TYPE_SEQUENCING);
 
                         // Otherwise treat it normally as a non-duplicate.
                         } else {
                             read.setIsDuplicate(false);
                             if (markUnmappedMates || !read.isUnmapped()) {
-                                int dupCount = namesOfNonDuplicateReadsAndOpticalCounts.replace(read.getName(), MARKDUPLICATES_NO_OPTICAL_MARKER);
+                                int dupCount = namesOfNonDuplicateReadsAndOpticalCounts.replace(read.getName(), NO_OPTICAL_MARKER);
                                 if (dupCount > -1) {
                                     ((SAMRecordToGATKReadAdapter) read).setTransientAttribute(MarkDuplicatesSparkUtils.OPTICAL_DUPLICATE_TOTAL_ATTRIBUTE_NAME, dupCount);
                                 }
@@ -228,7 +225,7 @@ public int getPartition(Object key) {
     protected void runTool(final JavaSparkContext ctx) {
         JavaRDD<GATKRead> reads = getReads();
         final OpticalDuplicateFinder finder = opticalDuplicatesArgumentCollection.READ_NAME_REGEX != null ?
-                new SerializableOpticalDuplicatesFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE) : null;
+                new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null) : null;
         // If we need to remove optical duplicates, set the engine to mark optical duplicates using the DT tag.
         if (removeSequencingDuplicates && markDuplicatesSparkArgumentCollection.taggingPolicy == MarkDuplicates.DuplicateTaggingPolicy.DontTag) {
             markDuplicatesSparkArgumentCollection.taggingPolicy = MarkDuplicates.DuplicateTaggingPolicy.OpticalOnly;
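Here the configured finder is likewise built from picard directly; the added third constructor argument is an optional picard Log, which the tool passes as null. A self-contained sketch with hypothetical stand-ins for the argument-collection values (the regex literal below is illustrative, not picard's actual default):

```java
import picard.sam.markduplicates.util.OpticalDuplicateFinder;

public class ConfiguredFinderExample {
    public static void main(String[] args) {
        // Hypothetical values standing in for the tool's argument collection.
        // A null regex disables optical-duplicate detection, which is why the
        // tool constructs the finder conditionally.
        final String readNameRegex = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*$";
        final int pixelDistance = 2500;

        // Three-argument constructor as used on the + line above; the last
        // argument is an optional picard Log, passed as null by the tool.
        final OpticalDuplicateFinder finder = readNameRegex != null
                ? new OpticalDuplicateFinder(readNameRegex, pixelDistance, null)
                : null;
        System.out.println(finder != null ? "optical detection enabled" : "disabled");
    }
}
```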
@@ -387,7 +387,7 @@ private static Map<MarkDuplicatesSparkRecord.Type, List<MarkDuplicatesSparkRecor
     private static List<Tuple2<IndexPair<String>,Integer>> handlePassthroughs(List<MarkDuplicatesSparkRecord> passthroughs) {
         // Emit the passthrough reads as non-duplicates.
         return passthroughs.stream()
-                .map(pair -> new Tuple2<>(new IndexPair<>(pair.getName(), pair.getPartitionIndex()), MarkDuplicatesSpark.MARKDUPLICATES_NO_OPTICAL_MARKER))
+                .map(pair -> new Tuple2<>(new IndexPair<>(pair.getName(), pair.getPartitionIndex()), MarkDuplicatesSpark.NO_OPTICAL_MARKER))
                 .collect(Collectors.toList());
     }

@@ -426,7 +426,7 @@ private static int countOpticalDuplicates(OpticalDuplicateFinder finder, List<Pa
             if (opticalDuplicateFlags[i]) {
                 numOpticalDuplicates++;
                 if (opticalDuplicateList != null) {
-                    opticalDuplicateList.add(new Tuple2<>(new IndexPair<>(scored.get(i).getName(), scored.get(i).getPartitionIndex()), MarkDuplicatesSpark.MARKDUPLICATES_OPPTICAL_DUPLICATE_MARKER));
+                    opticalDuplicateList.add(new Tuple2<>(new IndexPair<>(scored.get(i).getName(), scored.get(i).getPartitionIndex()), MarkDuplicatesSpark.OPTICAL_DUPLICATE_MARKER));
                 }
             }
         }
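countOpticalDuplicates (only partially shown) tallies the boolean flags that the finder reports per duplicate-group member. A minimal sketch of that tally with a hard-coded stand-in for the finder's output:

```java
public class TallyExample {
    public static void main(String[] args) {
        // Stand-in for the boolean[] that picard's optical-duplicate search
        // returns: one flag per group member, true where that read is an
        // optical duplicate of the group's representative.
        final boolean[] opticalDuplicateFlags = {false, true, true, false};

        int numOpticalDuplicates = 0;
        for (final boolean flag : opticalDuplicateFlags) {
            if (flag) {
                numOpticalDuplicates++;  // same tally the hunk above performs
            }
        }
        System.out.println(numOpticalDuplicates + " optical duplicates"); // prints 2
    }
}
```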
@@ -17,27 +17,27 @@
 
 @DocumentedFeature
 @CommandLineProgramProperties(
-        summary = "Compares the base qualities of two SAM/BAM/CRAM files. The reads in the two files must have " +
-                "exactly the same names and appear in the same order.",
+        summary = "Compares the base qualities, cigars, alignment information, and samflags of reads between two SAM/BAM/CRAM files." +
+                " The reads in the two files must have exactly the same names and appear in the same order.",
         oneLineSummary = "Compares the base qualities of two SAM/BAM/CRAM files",
         programGroup = DiagnosticsAndQCProgramGroup.class
 )
 public class CompareReads extends GATKTool {
-    @Argument(doc = "If output is given, the tool will return a bam with all the mismatching duplicate groups in the first specified file",
-            shortName = "I1", fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, optional = true)
+    @Argument(doc = "The first sam file against which to compare equality",
+            shortName = "I1", fullName = "input1", optional = false)
     protected String input1;
 
-    @Argument(doc = "If output is given, the tool will return a bam with all the mismatching duplicate groups in the second specified input file",
-            shortName = "I2", fullName = "output2", optional = true)
+    @Argument(doc = "The second sam file against which to compare equality",
+            shortName = "I2", fullName = "input2", optional = false)
     protected String input2;
 
 
     @Override
     public void traverse() {
         List<String> errorMessages = new ArrayList<>();
 
-        try( ReadsDataSource reads1 = new ReadsDataSource(IOUtils.getPath(input1));
-             ReadsDataSource reads2 = new ReadsDataSource(IOUtils.getPath(input2));) {
+        try(ReadsDataSource reads1 = new ReadsDataSource(IOUtils.getPath(input1));
+            ReadsDataSource reads2 = new ReadsDataSource(IOUtils.getPath(input2));) {
             final Iterator<GATKRead> it1 = reads1.iterator();
             final Iterator<GATKRead> it2 = reads2.iterator();
             while (it1.hasNext() && it2.hasNext()) {
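CompareReads walks the two inputs' iterators in lockstep inside a single try-with-resources block. A self-contained sketch of that lockstep-comparison pattern over plain strings; the error messages are hypothetical, not the tool's actual wording:

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class LockstepCompareExample {
    public static void main(String[] args) {
        final Iterator<String> it1 = Arrays.asList("read1", "read2").iterator();
        final Iterator<String> it2 = Arrays.asList("read1", "read3").iterator();
        final List<String> errorMessages = new ArrayList<>();

        // Lockstep walk: both inputs must present the same records in the
        // same order, mirroring the tool's requirement on read names.
        while (it1.hasNext() && it2.hasNext()) {
            final String r1 = it1.next();
            final String r2 = it2.next();
            if (!r1.equals(r2)) {
                errorMessages.add("Mismatch: " + r1 + " vs " + r2);
            }
        }
        if (it1.hasNext() || it2.hasNext()) {
            errorMessages.add("Inputs have different numbers of records");
        }
        errorMessages.forEach(System.out::println);
    }
}
```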

This file was deleted.

