Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GatherPileupSummaries #5599

Merged
merged 1 commit into from
Mar 21, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 151 additions & 67 deletions scripts/mutect2_wdl/mutect2.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,38 @@ workflow Mutect2 {
}

Int m2_output_size = tumor_bam_size / scatter_count

if (run_ob_mm_filter) {
scatter (subintervals in SplitIntervals.interval_files ) {
call CollectF1R2Counts {
input:
gatk_docker = gatk_docker,
ref_fasta = ref_fasta,
ref_fai = ref_fai,
ref_dict = ref_dict,
preemptible_attempts = preemptible_attempts,
tumor_bam = tumor_bam,
tumor_bai = tumor_bai,
gatk_override = gatk_override,
disk_space = tumor_bam_size + ref_size + disk_pad,
intervals = subintervals,
extra_intervals = ob_mm_filter_training_intervals,
max_retries = max_retries
}
}

call LearnReadOrientationModel {
input:
alt_tables = CollectF1R2Counts.alt_table,
ref_histograms = CollectF1R2Counts.ref_histograms,
alt_histograms = CollectF1R2Counts.alt_histograms,
gatk_override = gatk_override,
gatk_docker = gatk_docker,
preemptible_attempts = preemptible_attempts,
max_retries = max_retries
}
}

scatter (subintervals in SplitIntervals.interval_files ) {
call M2 {
input:
Expand All @@ -225,6 +257,8 @@ workflow Mutect2 {
m2_extra_args = m2_extra_args,
make_bamout = make_bamout_or_default,
artifact_prior_table = LearnReadOrientationModel.artifact_prior_table,
variants_for_contamination = variants_for_contamination,
variants_for_contamination_index = variants_for_contamination_index,
compress = compress,
gga_vcf = gga_vcf,
gga_vcf_idx = gga_vcf_idx,
Expand Down Expand Up @@ -294,52 +328,44 @@ workflow Mutect2 {
}
}

if (run_ob_mm_filter) {
call CollectF1R2Counts {
if (defined(variants_for_contamination)) {
call MergePileupSummaries as MergeTumorPileups {
input:
gatk_docker = gatk_docker,
ref_fasta = ref_fasta,
ref_fai = ref_fai,
input_tables = M2.tumor_pileups,
output_name = output_basename,
ref_dict = ref_dict,
preemptible_attempts = preemptible_attempts,
tumor_bam = tumor_bam,
tumor_bai = tumor_bai,
gatk_override = gatk_override,
disk_space = tumor_bam_size + ref_size + disk_pad,
intervals = if defined(ob_mm_filter_training_intervals) then ob_mm_filter_training_intervals else intervals,
max_retries = max_retries
}

call LearnReadOrientationModel {
input:
alt_table = CollectF1R2Counts.alt_table,
ref_histogram = CollectF1R2Counts.ref_histogram,
alt_histograms = CollectF1R2Counts.alt_histograms,
tumor_sample = CollectF1R2Counts.tumor_sample,
gatk_override = gatk_override,
gatk_docker = gatk_docker,
preemptible_attempts = preemptible_attempts,
max_retries = max_retries
max_retries = max_retries,
disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad
}

if (defined(normal_bam)){
call MergePileupSummaries as MergeNormalPileups {
input:
input_tables = M2.normal_pileups,
output_name = output_basename,
ref_dict = ref_dict,
gatk_override = gatk_override,
gatk_docker = gatk_docker,
preemptible_attempts = preemptible_attempts,
max_retries = max_retries,
disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad
}
}
}

if (defined(variants_for_contamination)) {
call CalculateContamination {
input:
gatk_override = gatk_override,
intervals = intervals,
ref_fasta = ref_fasta,
ref_fai = ref_fai,
ref_dict = ref_dict,
preemptible_attempts = preemptible_attempts,
max_retries = max_retries,
gatk_docker = gatk_docker,
tumor_bam = tumor_bam,
tumor_bai = tumor_bai,
normal_bam = normal_bam,
normal_bai = normal_bai,
variants_for_contamination = variants_for_contamination,
variants_for_contamination_index = variants_for_contamination_index,
tumor_pileups = MergeTumorPileups.merged_table,
normal_pileups = MergeNormalPileups.merged_table,
disk_space = tumor_bam_size + normal_bam_size + ceil(size(variants_for_contamination, "GB") * small_input_to_output_multiplier) + disk_pad
}
}
Expand Down Expand Up @@ -472,6 +498,7 @@ workflow Mutect2 {
File? bamout = MergeBamOuts.merged_bam_out
File? bamout_index = MergeBamOuts.merged_bam_out_index
File? maf_segments = CalculateContamination.maf_segments
File? read_orientation_model_params = LearnReadOrientationModel.artifact_prior_table
}
}

Expand Down Expand Up @@ -548,6 +575,8 @@ task M2 {
File? gga_vcf
File? gga_vcf_idx
File? artifact_prior_table
File? variants_for_contamination
File? variants_for_contamination_index

String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf"
String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
Expand Down Expand Up @@ -580,7 +609,7 @@ task M2 {
gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${tumor_bam} -O tumor_name.txt -encode
tumor_command_line="-I ${tumor_bam} -tumor `cat tumor_name.txt`"

if [[ -f "${normal_bam}" ]]; then
if [[ ! -z "${normal_bam}" ]]; then
gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${normal_bam} -O normal_name.txt -encode
normal_command_line="-I ${normal_bam} -normal `cat normal_name.txt`"
fi
Expand All @@ -597,6 +626,21 @@ task M2 {
${true='--bam-output bamout.bam' false='' make_bamout} \
${"--orientation-bias-artifact-priors " + artifact_prior_table} \
${m2_extra_args}

### GetPileupSummaries
# These must be created, even if they remain empty, as cromwell doesn't support optional output
touch tumor-pileups.table
touch normal-pileups.table

if [[ ! -z "${variants_for_contamination}" ]]; then
gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
-V ${variants_for_contamination} -L ${variants_for_contamination} -O tumor-pileups.table

if [[ ! -z "${normal_bam}" ]]; then
gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${normal_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
-V ${variants_for_contamination} -L ${variants_for_contamination} -O normal-pileups.table
fi
fi
>>>

runtime {
Expand All @@ -615,6 +659,8 @@ task M2 {
File output_bamOut = "bamout.bam"
String tumor_sample = read_string("tumor_name.txt")
String normal_sample = read_string("normal_name.txt")
File tumor_pileups = "tumor-pileups.table"
File normal_pileups = "normal-pileups.table"
}
}

Expand Down Expand Up @@ -722,6 +768,51 @@ task MergeBamOuts {
}
}

task MergePileupSummaries {
# input_tables needs to be optional because GetPileupSummaries is in an if-block
Array[File?] input_tables
String output_name
File? gatk_override
File ref_dict

# runtime
String gatk_docker
Int? mem
Int? preemptible_attempts
Int? max_retries
Int? disk_space
Int? cpu
Boolean use_ssd = false

# Mem is in units of GB but our command and memory runtime values are in MB
Int machine_mem = if defined(mem) then mem * 1000 else 3500
Int command_mem = machine_mem - 1000

command {
set -e
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

gatk --java-options "-Xmx${command_mem}m" GatherPileupSummaries \
--sequence-dictionary ${ref_dict} \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

indent

-I ${sep=' -I ' input_tables} \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Double-checking: is this error-free when input tables is empty or not present?

-O ${output_name}.tsv
}

runtime {
docker: gatk_docker
bootDiskSizeGb: 12
memory: machine_mem + " MB"
disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
preemptible: select_first([preemptible_attempts, 10])
maxRetries: select_first([max_retries, 3])
cpu: select_first([cpu, 1])
}

output {
File merged_table = "${output_name}.tsv"
}
}

task CollectSequencingArtifactMetrics {
# inputs
File ref_fasta
Expand Down Expand Up @@ -777,6 +868,7 @@ task CollectF1R2Counts {

File? gatk_override
File? intervals
File? extra_intervals

# runtime
Int? max_retries
Expand All @@ -803,9 +895,10 @@ task CollectF1R2Counts {
gatk --java-options "-Xmx${command_mem}m" CollectF1R2Counts \
-I ${tumor_bam} -R ${ref_fasta} \
${"-L " + intervals} \
-alt-table "$tumor_name-alt.tsv" \
-ref-hist "$tumor_name-ref.metrics" \
-alt-hist "$tumor_name-alt-depth1.metrics"
${"-isr INTERSECTION -L " + extra_intervals} \
--alt-table "$tumor_name-alt.tsv" \
--ref-hist "$tumor_name-ref.metrics" \
--alt-hist "$tumor_name-alt-depth1.metrics"
}

runtime {
Expand All @@ -820,21 +913,20 @@ task CollectF1R2Counts {

output {
File alt_table = glob("*-alt.tsv")[0]
File ref_histogram = glob("*-ref.metrics")[0]
File ref_histograms = glob("*-ref.metrics")[0]
File alt_histograms = glob("*-alt-depth1.metrics")[0]
String tumor_sample = read_string("tumor_name.txt")
}
}

# Learning step of the orientation bias mixture model, which is the recommended orientation bias filter as of September 2018
task LearnReadOrientationModel {
File alt_table
File ref_histogram
File? alt_histograms
Array[File] alt_tables
Array[File] ref_histograms
Array[File?] alt_histograms
Int alt_hist_length = length(alt_histograms)

File? gatk_override
File? intervals
String tumor_sample

# runtime
Int? max_retries
Expand All @@ -853,11 +945,15 @@ task LearnReadOrientationModel {
set -e
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

if [[ "${alt_hist_length}" > 0 ]]; then
ah_flag="--alt-hist"
fi

gatk --java-options "-Xmx${command_mem}m" LearnReadOrientationModel \
-alt-table ${alt_table} \
-ref-hist ${ref_histogram} \
-alt-hist ${alt_histograms} \
-O "${tumor_sample}-artifact-prior-table.tsv"
--alt-table ${sep=" --alt-table " alt_tables} \
--ref-hist ${sep=" --ref-hist " ref_histograms} \
$ah_flag ${sep=" --alt-hist " alt_histograms} \
-O "artifact-prior-table.tsv"
}

runtime {
Expand All @@ -871,23 +967,18 @@ task LearnReadOrientationModel {
}

output {
File artifact_prior_table = "${tumor_sample}-artifact-prior-table.tsv"
File artifact_prior_table = "artifact-prior-table.tsv"
}

}

task CalculateContamination {
# inputs
File? intervals
File ref_fasta
File ref_fai
File ref_dict
File tumor_bam
File tumor_bai
File? normal_bam
File? normal_bai
File? variants_for_contamination
File? variants_for_contamination_index
File tumor_pileups
File? normal_pileups

File? gatk_override

Expand All @@ -907,16 +998,9 @@ task CalculateContamination {

export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

if [[ -f "${normal_bam}" ]]; then
gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -I ${normal_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
-V ${variants_for_contamination} -L ${variants_for_contamination} -O normal_pileups.table
NORMAL_CMD="-matched normal_pileups.table"
fi

gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \
-V ${variants_for_contamination} -L ${variants_for_contamination} -O pileups.table
gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I pileups.table -O contamination.table --tumor-segmentation segments.table $NORMAL_CMD
}
gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I ${tumor_pileups} \
-O contamination.table --tumor-segmentation segments.table ${"-matched " + normal_pileups}
}

runtime {
docker: gatk_docker
Expand All @@ -928,7 +1012,6 @@ task CalculateContamination {
}

output {
File pileups = "pileups.table"
File contamination_table = "contamination.table"
File maf_segments = "segments.table"
}
Expand Down Expand Up @@ -968,10 +1051,11 @@ task Filter {
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

gatk --java-options "-Xmx${command_mem}m" FilterMutectCalls -V ${unfiltered_vcf} \
-O ${output_vcf} \
${"--contamination-table " + contamination_table} \
${"--tumor-segmentation " + maf_segments} \
${m2_extra_filtering_args}
-O ${output_vcf} \
${"--contamination-table " + contamination_table} \
${"--tumor-segmentation " + maf_segments} \
${"-L" + intervals} \
${m2_extra_filtering_args}
}

runtime {
Expand Down
Loading