Skip to content

Commit

Permalink
update the wdl, parallelize LearnReadOrientationModel
Browse files Browse the repository at this point in the history
  • Loading branch information
takutosato committed Jan 31, 2019
1 parent 3756d2a commit bb1b14f
Show file tree
Hide file tree
Showing 6 changed files with 303 additions and 195 deletions.
217 changes: 79 additions & 138 deletions scripts/mutect2_wdl/mutect2.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,38 @@ workflow Mutect2 {
}

Int m2_output_size = tumor_bam_size / scatter_count

if (run_ob_mm_filter) {
scatter (subintervals in SplitIntervals.interval_files ) {
call CollectF1R2Counts {
input:
gatk_docker = gatk_docker,
ref_fasta = ref_fasta,
ref_fai = ref_fai,
ref_dict = ref_dict,
preemptible_attempts = preemptible_attempts,
tumor_bam = tumor_bam,
tumor_bai = tumor_bai,
gatk_override = gatk_override,
disk_space = tumor_bam_size + ref_size + disk_pad,
intervals = subintervals,
extra_intervals = ob_mm_filter_training_intervals,
max_retries = max_retries
}
}

call LearnReadOrientationModel {
input:
alt_tables = CollectF1R2Counts.alt_table,
ref_histograms = CollectF1R2Counts.ref_histogram,
alt_histograms = CollectF1R2Counts.alt_histograms,
gatk_override = gatk_override,
gatk_docker = gatk_docker,
preemptible_attempts = preemptible_attempts,
max_retries = max_retries
}
}

scatter (subintervals in SplitIntervals.interval_files ) {
call M2 {
input:
Expand All @@ -214,6 +246,8 @@ workflow Mutect2 {
m2_extra_args = m2_extra_args,
make_bamout = make_bamout_or_default,
artifact_prior_table = LearnReadOrientationModel.artifact_prior_table,
variants_for_contamination = variants_for_contamination,
variants_for_contamination_index = variants_for_contamination_index,
compress = compress,
gga_vcf = gga_vcf,
gga_vcf_idx = gga_vcf_idx,
Expand All @@ -222,45 +256,6 @@ workflow Mutect2 {
disk_space = tumor_bam_size + normal_bam_size + ref_size + gnomad_vcf_size + m2_output_size + disk_pad
}

if (defined(variants_for_contamination)) {
call GetPileupSummaries as TumorPileups {
input:
gatk_override = gatk_override,
intervals = subintervals,
ref_fasta = ref_fasta,
ref_fai = ref_fai,
ref_dict = ref_dict,
preemptible_attempts = preemptible_attempts,
max_retries = max_retries,
gatk_docker = gatk_docker,
bam = tumor_bam,
bai = tumor_bai,
variants_for_contamination = variants_for_contamination,
variants_for_contamination_index = variants_for_contamination_index,
disk_space = tumor_bam_size + ceil(size(variants_for_contamination, "GB") * small_input_to_output_multiplier) + disk_pad
}

if (defined(normal_bam)){
call GetPileupSummaries as NormalPileups {
input:
gatk_override = gatk_override,
intervals = subintervals,
ref_fasta = ref_fasta,
ref_fai = ref_fai,
ref_dict = ref_dict,
preemptible_attempts = preemptible_attempts,
max_retries = max_retries,
gatk_docker = gatk_docker,
bam = normal_bam,
bai = normal_bai,
variants_for_contamination = variants_for_contamination,
variants_for_contamination_index = variants_for_contamination_index,
disk_space = normal_bam_size + ceil(size(variants_for_contamination, "GB") * small_input_to_output_multiplier) + disk_pad

}
}
}

Float sub_vcf_size = size(M2.unfiltered_vcf, "GB")
Float sub_bamout_size = size(M2.output_bamOut, "GB")
}
Expand Down Expand Up @@ -322,39 +317,10 @@ workflow Mutect2 {
}
}

if (run_ob_mm_filter) {
call CollectF1R2Counts {
input:
gatk_docker = gatk_docker,
ref_fasta = ref_fasta,
ref_fai = ref_fai,
ref_dict = ref_dict,
preemptible_attempts = preemptible_attempts,
tumor_bam = tumor_bam,
tumor_bai = tumor_bai,
gatk_override = gatk_override,
disk_space = tumor_bam_size + ref_size + disk_pad,
intervals = if defined(ob_mm_filter_training_intervals) then ob_mm_filter_training_intervals else intervals,
max_retries = max_retries
}

call LearnReadOrientationModel {
input:
alt_table = CollectF1R2Counts.alt_table,
ref_histogram = CollectF1R2Counts.ref_histogram,
alt_histograms = CollectF1R2Counts.alt_histograms,
tumor_sample = CollectF1R2Counts.tumor_sample,
gatk_override = gatk_override,
gatk_docker = gatk_docker,
preemptible_attempts = preemptible_attempts,
max_retries = max_retries
}
}

if (defined(variants_for_contamination)) {
call MergePileupSummaries as MergeTumorPileups {
input:
input_tables = TumorPileups.pileups,
input_tables = M2.tumor_pileups,
output_name = output_basename,
ref_dict = ref_dict,
gatk_override = gatk_override,
Expand All @@ -367,7 +333,7 @@ workflow Mutect2 {
if (defined(normal_bam)){
call MergePileupSummaries as MergeNormalPileups {
input:
input_tables = NormalPileups.pileups,
input_tables = M2.normal_pileups,
output_name = output_basename,
ref_dict = ref_dict,
gatk_override = gatk_override,
Expand All @@ -381,7 +347,6 @@ workflow Mutect2 {
call CalculateContamination {
input:
gatk_override = gatk_override,
intervals = intervals,
ref_fasta = ref_fasta,
ref_fai = ref_fai,
ref_dict = ref_dict,
Expand Down Expand Up @@ -589,7 +554,9 @@ task M2 {
Boolean compress
File? gga_vcf
File? gga_vcf_idx
File? artifact_prior_table
File? artifact_prior_table
File? variants_for_contamination
File? variants_for_contamination_index

String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf"
String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
Expand Down Expand Up @@ -639,6 +606,21 @@ task M2 {
${true='--bam-output bamout.bam' false='' make_bamout} \
${"--orientation-bias-artifact-priors " + artifact_prior_table} \
${m2_extra_args}

### GetPileupSummaries
# These must be created, even if they remain empty, as cromwell doesn't support optional output
touch tumor-pileups.table
touch normal-pileups.table

if [[ -f "${variants_for_contamination}" ]]; then
gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
-V ${variants_for_contamination} -L ${variants_for_contamination} -O tumor-pileups.table

if [[ -f ${normal_bam} ]]; then
gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${normal_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
-V ${variants_for_contamination} -L ${variants_for_contamination} -O normal-pileups.table
fi
fi
>>>

runtime {
Expand All @@ -657,6 +639,8 @@ task M2 {
File output_bamOut = "bamout.bam"
String tumor_sample = read_string("tumor_name.txt")
String normal_sample = read_string("normal_name.txt")
File tumor_pileups = "tumor-pileups.table"
File normal_pileups = "normal-pileups.table"
}
}

Expand Down Expand Up @@ -764,54 +748,6 @@ task MergeBamOuts {
}
}

# Runs GATK GetPileupSummaries over the sites in variants_for_contamination,
# producing a pileup table consumed downstream (e.g. by CalculateContamination).
task GetPileupSummaries {
    # inputs
    File? intervals                    # optional scatter intervals; intersected with the variant sites when given
    File ref_fasta
    File ref_fai
    File ref_dict
    File bam                           # alignment (tumor or normal) to pile up
    File bai
    File? variants_for_contamination   # VCF of sites at which to summarize pileups
    File? variants_for_contamination_index

    File? gatk_override                # local GATK jar to use instead of the one in the docker image

    # runtime
    Int? preemptible_attempts
    Int? max_retries
    String gatk_docker
    Int? disk_space
    Int? mem

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else 3000
    # Reserve 500 MB of headroom beyond the JVM heap for non-JVM overhead.
    Int command_mem = machine_mem - 500

    command {
        set -e

        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        # When intervals are supplied, INTERSECTION with -L variants restricts
        # pileups to variant sites that fall inside this shard's intervals.
        gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
            -V ${variants_for_contamination} -L ${variants_for_contamination} -O pileups.table
    }

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        # Request the full machine memory, not just the JVM heap (command_mem);
        # otherwise the 500 MB headroom reserved above is never actually provisioned.
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + " HDD"
        preemptible: select_first([preemptible_attempts, 10])
        maxRetries: select_first([max_retries, 3])
    }

    output {
        File pileups = "pileups.table"
    }

}

task MergePileupSummaries {
# input_tables needs to be optional because GetPileupSummaries is in an if-block
Array[File?] input_tables
Expand Down Expand Up @@ -912,6 +848,7 @@ task CollectF1R2Counts {

File? gatk_override
File? intervals
File? extra_intervals

# runtime
Int? max_retries
Expand All @@ -938,9 +875,10 @@ task CollectF1R2Counts {
gatk --java-options "-Xmx${command_mem}m" CollectF1R2Counts \
-I ${tumor_bam} -R ${ref_fasta} \
${"-L " + intervals} \
-alt-table "$tumor_name-alt.tsv" \
-ref-hist "$tumor_name-ref.metrics" \
-alt-hist "$tumor_name-alt-depth1.metrics"
${"-isr INTERSECTION -L " + extra_intervals} \
--alt-table "$tumor_name-alt.tsv" \
--ref-hist "$tumor_name-ref.metrics" \
--alt-hist "$tumor_name-alt-depth1.metrics"
}

runtime {
Expand All @@ -963,13 +901,12 @@ task CollectF1R2Counts {

# Learning step of the orientation bias mixture model, which is the recommended orientation bias filter as of September 2018
task LearnReadOrientationModel {
File alt_table
File ref_histogram
File? alt_histograms
Array[File] alt_tables
Array[File] ref_histograms
Array[File?] alt_histograms
Int alt_hist_length = length(alt_histograms)

File? gatk_override
File? intervals
String tumor_sample

# runtime
Int? max_retries
Expand All @@ -988,11 +925,15 @@ task LearnReadOrientationModel {
set -e
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

if [[ "${alt_hist_length}" > 0 ]]; then
ah_flag="-ah"
fi

gatk --java-options "-Xmx${command_mem}m" LearnReadOrientationModel \
-alt-table ${alt_table} \
-ref-hist ${ref_histogram} \
-alt-hist ${alt_histograms} \
-O "${tumor_sample}-artifact-prior-table.tsv"
-at ${sep=" -at " alt_tables} \
-rh ${sep=" -rh " ref_histograms} \
$ah_flag ${sep=" -ah " alt_histograms} \
-O "artifact-prior-table.tsv"
}

runtime {
Expand All @@ -1006,14 +947,13 @@ task LearnReadOrientationModel {
}

output {
File artifact_prior_table = "${tumor_sample}-artifact-prior-table.tsv"
File artifact_prior_table = "artifact-prior-table.tsv"
}

}

task CalculateContamination {
# inputs
File? intervals
File ref_fasta
File ref_fai
File ref_dict
Expand Down Expand Up @@ -1093,10 +1033,11 @@ task Filter {
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

gatk --java-options "-Xmx${command_mem}m" FilterMutectCalls -V ${unfiltered_vcf} \
-O ${output_vcf} \
${"--contamination-table " + contamination_table} \
${"--tumor-segmentation " + maf_segments} \
${m2_extra_filtering_args}
-O ${output_vcf} \
${"--contamination-table " + contamination_table} \
${"--tumor-segmentation " + maf_segments} \
${"-L" + intervals} \
${m2_extra_filtering_args}
}

runtime {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import htsjdk.samtools.util.Histogram;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.exceptions.UserException;
Expand Down Expand Up @@ -44,13 +44,16 @@
@CommandLineProgramProperties(
summary = "Collect F1R2 read counts for the Mutect2 orientation bias mixture model filter",
oneLineSummary = "Collect F1R2 read counts for the Mutect2 orientation bias mixture model filter",
programGroup = CoverageAnalysisProgramGroup.class
programGroup = ShortVariantDiscoveryProgramGroup.class
)

public class CollectF1R2Counts extends LocusWalker {
public static final String ALT_DATA_TABLE_LONG_NAME = "alt-table";
public static final String ALT_DATA_TABLE_SHORT_NAME = "at";
public static final String ALT_DEPTH1_HISTOGRAM_LONG_NAME = "alt-hist";
public static final String ALT_DEPTH1_HISTOGRAM_SHORT_NAME = "ah";
public static final String REF_SITE_METRICS_LONG_NAME = "ref-hist";
public static final String REF_SITE_METRICS_SHORT_NAME = "rh";
public static final String MIN_MEDIAN_MQ_LONG_NAME = "median-mq";
public static final String MIN_BASE_QUALITY_LONG_NAME = "min-bq";
public static final String MAX_DEPTH_LONG_NAME = "max-depth";
Expand All @@ -61,13 +64,13 @@ public class CollectF1R2Counts extends LocusWalker {
@Argument(fullName = MIN_BASE_QUALITY_LONG_NAME, doc = "exclude bases below this quality from pileup", optional = true)
private int MINIMUM_BASE_QUALITY = 20;

@Argument(fullName = ALT_DATA_TABLE_LONG_NAME, doc = "a tab-separated output table of pileup data over alt sites")
@Argument(shortName = ALT_DATA_TABLE_SHORT_NAME, fullName = ALT_DATA_TABLE_LONG_NAME, doc = "a tab-separated output table of pileup data over alt sites")
private File altDataTable = null;

@Argument(fullName = REF_SITE_METRICS_LONG_NAME, doc = "a metrics file with overall summary metrics and reference context-specific depth histograms")
@Argument(shortName = REF_SITE_METRICS_SHORT_NAME, fullName = REF_SITE_METRICS_LONG_NAME, doc = "a metrics file with overall summary metrics and reference context-specific depth histograms")
private File refMetricsOutput = null;

@Argument(fullName = ALT_DEPTH1_HISTOGRAM_LONG_NAME, doc = "a histogram of alt sites with alt depth = 1")
@Argument(shortName = ALT_DEPTH1_HISTOGRAM_SHORT_NAME, fullName = ALT_DEPTH1_HISTOGRAM_LONG_NAME, doc = "a histogram of alt sites with alt depth = 1")
private File altMetricsOutput = null;

@Argument(fullName = MAX_DEPTH_LONG_NAME, doc = "sites with depth higher than this value will be grouped", optional = true)
Expand Down
Loading

0 comments on commit bb1b14f

Please sign in to comment.