From bc6b637ff677268823a522ad8fcfbd4c53b41858 Mon Sep 17 00:00:00 2001
From: James
+ * This tool is a Spark implementation of the tool MarkDuplicates in Picard allowing for better utilization
+ * of available system resources to speed up duplicate marking. This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are
+ * defined as originating from a single fragment of DNA. Duplicates can arise during sample preparation e.g. library
+ * construction using PCR. See also "EstimateLibraryComplexity"
+ * for additional notes on PCR duplication artifacts. Duplicate reads can also result from a single amplification cluster,
+ * incorrectly detected as multiple clusters by the optical sensor of the sequencing instrument. These duplication artifacts are
+ * referred to as optical duplicates.
+ * The MarkDuplicates tool works by comparing sequences in the 5 prime positions of both reads and read-pairs in a SAM/BAM file.
+ * After duplicate reads arecollected, the tool differentiates the primary and duplicate reads using an algorithm that ranks
+ * reads by the sums of their base-quality scores (default method).
+ * The tool's main output is a new SAM or BAM file, in which duplicates have been identified in the SAM flags field for each
+ * read. Duplicates are marked with the hexadecimal value of 0x0400, which corresponds to a decimal value of 1024.
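As a quick illustration of the flag semantics described above, a read's duplicate status can be recovered by testing bit 0x400 (decimal 1024) of the SAM FLAG field. This is a minimal sketch; the helper name is ours, not part of GATK or Picard.

```python
# Sketch: testing the SAM duplicate bit (0x400 / 1024) described above.
# Only the FLAG bit semantics come from the SAM spec; the helper is illustrative.
DUPLICATE_FLAG = 0x400  # decimal 1024

def is_marked_duplicate(flag: int) -> bool:
    """Return True if the SAM FLAG has the PCR/optical duplicate bit set."""
    return bool(flag & DUPLICATE_FLAG)

print(is_marked_duplicate(1024))   # only the duplicate bit set -> True
print(is_marked_duplicate(99))     # typical properly-paired read -> False
print(is_marked_duplicate(1107))   # 1024 + 83: duplicate bit plus pairing bits -> True
```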
+ * If you are not familiar with this type of annotation, please see the following blog post for additional information.
+ * Although the bitwise flag annotation indicates whether a read was marked as a duplicate, it does not identify the type of
+ * duplicate. To do this, a new tag called the duplicate type (DT) tag was recently added as an optional output in
+ * the 'optional field' section of a SAM/BAM file. Invoking the 'duplicate-tagging-policy' option,
+ * you can instruct the program to mark all the duplicates (All), only the optical duplicates (OpticalOnly), or no
+ * duplicates (DontTag). The records within the output SAM/BAM file will have values for the 'DT' tag (depending on the invoked
+ * 'duplicate-tagging-policy'), as either library/PCR-generated duplicates (LB), or sequencing-platform artifact duplicates (SQ).
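A minimal sketch of pulling the 'DT' value out of a raw SAM text record, assuming the tab-separated TAG:TYPE:VALUE layout of optional fields from the SAM spec; the helper and the example record are illustrative, not GATK code.

```python
def get_dt_tag(sam_line: str):
    """Return the optional DT (duplicate type) value from a tab-separated SAM
    record: 'LB' (library/PCR duplicate), 'SQ' (sequencing/optical duplicate),
    or None if the tag is absent."""
    # Optional TAG:TYPE:VALUE fields start after the 11 mandatory columns.
    for field in sam_line.rstrip("\n").split("\t")[11:]:
        tag, _typ, value = field.split(":", 2)
        if tag == "DT":
            return value
    return None

# A made-up minimal record: 11 mandatory columns followed by optional tags.
record = "\t".join(["read1", "1107", "chr1", "100", "60", "50M", "=", "200", "150",
                    "A" * 50, "I" * 50, "DT:Z:SQ", "NM:i:0"])
print(get_dt_tag(record))  # SQ
```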
+ * This tool uses the 'read-name-regex' and the 'optical-duplicate-pixel-distance' options as the primary methods to identify
+ * and differentiate duplicate types. Set 'read-name-regex' to null to skip optical duplicate detection, e.g. for RNA-seq
+ * or other data where duplicate sets are extremely large and estimating library complexity is not an aim.
+ * Note that without optical duplicate counts, library size estimation will be inaccurate.
+ * MarkDuplicates also produces a metrics file indicating the numbers of duplicates for both single- and paired-end reads.
+ * The program can take either coordinate-sorted or query-sorted inputs; however, it is recommended that the input be
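The optical-duplicate criterion described above can be sketched as follows, assuming Illumina-style read names that encode lane:tile:x:y. The regex, helper names, and the 100-pixel default are illustrative stand-ins for the tool's actual 'read-name-regex' and 'optical-duplicate-pixel-distance' handling.

```python
import re

# Hedged sketch: two duplicates are "optical" if their cluster coordinates,
# parsed from the read name, fall within the pixel distance on the same
# lane and tile. Read-name layout assumed: instrument:run:flowcell:lane:tile:x:y.
READ_NAME_RE = re.compile(r"^[^:]+:[^:]+:[^:]+:(\d+):(\d+):(\d+):(\d+)$")

def cluster_coords(read_name: str):
    """Parse (lane, tile, x, y) from an Illumina-style read name, or None."""
    m = READ_NAME_RE.match(read_name)
    return tuple(int(g) for g in m.groups()) if m else None

def is_optical_duplicate(name_a: str, name_b: str, pixel_distance: int = 100) -> bool:
    """True if both reads sit on the same lane and tile within pixel_distance
    in both x and y (reads with unparseable names are never optical)."""
    a, b = cluster_coords(name_a), cluster_coords(name_b)
    if a is None or b is None:
        return False
    return ((a[0], a[1]) == (b[0], b[1])
            and abs(a[2] - b[2]) <= pixel_distance
            and abs(a[3] - b[3]) <= pixel_distance)

print(is_optical_duplicate("M1:42:FC1:1:1101:5000:6000",
                           "M1:42:FC1:1:1101:5050:6020"))  # True: same tile, within 100px
print(is_optical_duplicate("M1:42:FC1:1:1101:5000:6000",
                           "M1:42:FC1:1:1102:5000:6000"))  # False: different tile
```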
+ * query-sorted or query-grouped as the tool will have to perform an extra sort operation on the data in order to associate
+ * reads from the input bam with their mates. If desired, duplicates can be removed using the 'remove-all-duplicates' and 'remove-sequencing-duplicates' options.
- * This tool is a Spark implementation of the tool MarkDuplicates in Picard allowing for better utilization
- * of available system resources to speed up duplicate marking.
+ * This is a Spark implementation of the MarkDuplicates tool from Picard that allows the tool to be run in
+ * parallel on multiple cores on a local machine or multiple machines on a Spark cluster while still matching
+ * the output of the single-core Picard version. Since the tool requires holding all of the readnames in memory
+ * while it groups the paired-down read information, it is recommended running this tool on a machine/configuration
+ * with at least 8 GB of memory for a typical 30x bam.
 * This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are
* defined as originating from a single fragment of DNA. Duplicates can arise during sample preparation e.g. library
@@ -46,7 +49,7 @@
 * referred to as optical duplicates.
 * The MarkDuplicates tool works by comparing sequences in the 5 prime positions of both reads and read-pairs in a SAM/BAM file.
- * After duplicate reads arecollected, the tool differentiates the primary and duplicate reads using an algorithm that ranks
+ * After duplicate reads are collected, the tool differentiates the primary and duplicate reads using an algorithm that ranks
 * reads by the sums of their base-quality scores (default method).
 * The tool's main output is a new SAM or BAM file, in which duplicates have been identified in the SAM flags field for each
@@ -80,6 +83,19 @@
* -M marked_dup_metrics.txt
*
*
+ * This is a Spark implementation of the MarkDuplicates tool from Picard that allows the tool to be run in
* parallel on multiple cores on a local machine or multiple machines on a Spark cluster while still matching
* the output of the single-core Picard version. Since the tool requires holding all of the readnames in memory
- * while it groups the paired-down read information, it is recommended running this tool on a machine/configuration
- * with at least 8 GB of memory for a typical 30x bam.
 * Usage example:
+ *
+ * gatk MarkDuplicatesSpark \\
+ * -I input.bam \\
+ * -O marked_duplicates.bam \\
+ * -M marked_dup_metrics.txt
+ *
+ * Please see
+ * MarkDuplicates
+ * for detailed explanations of the output metrics.
+ *
+ */
@DocumentedFeature
@CommandLineProgramProperties(
summary = "Marks duplicates on Spark",
oneLineSummary = "MarkDuplicates on Spark",
programGroup = ReadDataManipulationProgramGroup.class)
-@BetaFeature
public final class MarkDuplicatesSpark extends GATKSparkTool {
private static final long serialVersionUID = 1L;
From 92c417105bd9e027511869ce80cc64fbef877c3e Mon Sep 17 00:00:00 2001
From: James
+ * MarkDuplicates run on a Spark cluster of 5 machines
+ *
+ * gatk MarkDuplicatesSpark \\
+ *
* Please see
* MarkDuplicates
* for detailed explanations of the output metrics.
From 0be10b484be25871ccbec989ec651fda13d0827f Mon Sep 17 00:00:00 2001
From: James
+ * -I input.bam \\
+ * -O marked_duplicates.bam \\
+ * -M marked_dup_metrics.txt \\
+ * -- \\
+ * --spark-runner SPARK \\
+ * --spark-master
+ * --num-executors 5 \\
+ * --executor-cores 8
+ *
 * This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are
 * defined as originating from a single fragment of DNA. Duplicates can arise during sample preparation e.g. library
@@ -83,6 +83,15 @@
 * -M marked_dup_metrics.txt
 *
 *
+ *
+ * gatk MarkDuplicatesSpark \\
+ * -I input.bam \\
+ * -O marked_duplicates.bam \\
+ * -M marked_dup_metrics.txt \\
+ * --conf 'spark.executor.cores=5'
+ *
* gatk MarkDuplicatesSpark \\