## Copyright Broad Institute, 2019
##
## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool
## from GATK4 in GVCF mode on a single sample according to GATK Best Practices.
## When executed, the workflow scatters HaplotypeCaller over the intervals in the
## provided intervals list file and then merges the per-interval results. The
## output is a single GVCF file which can be used by the joint-discovery workflow.
##
## Requirements/expectations :
## - One analysis-ready BAM file for a single sample (as identified in RG:SM)
## - Set of variant calling intervals lists for the scatter, provided in a file
##
## Outputs :
## - One GVCF file and its index
##
## Cromwell version support
## - Successfully tested on v37
## - Does not work on versions < v23 due to output syntax
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the dockers
## for detailed licensing information pertaining to the included programs.
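##
## EXAMPLE USAGE (illustrative, not part of the workflow):
## A minimal Cromwell inputs JSON might look like the following; the file paths
## are placeholders, and the optional *_override inputs can be omitted to accept
## the defaults declared below.
##
##   {
##     "HaplotypeCallerGvcf_GATK4.input_bam": "gs://my-bucket/NA12878.bam",
##     "HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://my-bucket/NA12878.bai",
##     "HaplotypeCallerGvcf_GATK4.ref_fasta": "gs://my-bucket/Homo_sapiens_assembly38.fasta",
##     "HaplotypeCallerGvcf_GATK4.ref_fasta_index": "gs://my-bucket/Homo_sapiens_assembly38.fasta.fai",
##     "HaplotypeCallerGvcf_GATK4.ref_dict": "gs://my-bucket/Homo_sapiens_assembly38.dict",
##     "HaplotypeCallerGvcf_GATK4.scattered_calling_intervals_list": "gs://my-bucket/scattered_intervals.txt"
##   }
##
## which can then be run with, for example:
##   java -jar cromwell.jar run haplotypecaller-gvcf-gatk4.wdl -i inputs.json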
# WORKFLOW DEFINITION
workflow HaplotypeCallerGvcf_GATK4 {
  File input_bam
  File input_bam_index
  File ref_dict
  File ref_fasta
  File ref_fasta_index
  File scattered_calling_intervals_list
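
  # The optional inputs below follow an override pattern: if the caller leaves
  # them unset, select_first falls back to the defaults declared here (GVCF mode
  # on, Broad's public GATK and genomes-in-the-cloud images, in-image tool paths).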
  Boolean? make_gvcf
  Boolean making_gvcf = select_first([make_gvcf,true])

  String? gatk_docker_override
  String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"])
  String? gatk_path_override
  String gatk_path = select_first([gatk_path_override, "/gatk/gatk"])
  String? gitc_docker_override
  String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"])
  String? samtools_path_override
  String samtools_path = select_first([samtools_path_override, "samtools"])
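
  # Read the intervals list file into an array of interval-file paths, one per line.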
  Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list)

  # Is the input a CRAM file?
  Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram"
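
  # Derive output names from the input file name; GVCF mode appends ".g.vcf.gz",
  # plain VCF mode appends ".vcf.gz".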
  String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam")
  String vcf_basename = sample_basename
  String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz"
  String output_filename = vcf_basename + output_suffix
  if ( is_cram ) {
    call CramToBamTask {
      input:
        input_cram = input_bam,
        sample_name = sample_basename,
        ref_dict = ref_dict,
        ref_fasta = ref_fasta,
        ref_fasta_index = ref_fasta_index,
        docker = gitc_docker,
        samtools_path = samtools_path
    }
  }
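
  # If the input was a CRAM, the select_first calls below pick up the converted
  # BAM and its index; otherwise the original BAM inputs are used directly.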
  # Call variants in parallel over grouped calling intervals
  scatter (interval_file in scattered_calling_intervals) {
    # Generate GVCF by interval
    call HaplotypeCaller {
      input:
        input_bam = select_first([CramToBamTask.output_bam, input_bam]),
        input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]),
        interval_list = interval_file,
        output_filename = output_filename,
        ref_dict = ref_dict,
        ref_fasta = ref_fasta,
        ref_fasta_index = ref_fasta_index,
        make_gvcf = making_gvcf,
        docker = gatk_docker,
        gatk_path = gatk_path
    }
  }
  # Merge per-interval GVCFs
  call MergeGVCFs {
    input:
      input_vcfs = HaplotypeCaller.output_vcf,
      input_vcfs_indexes = HaplotypeCaller.output_vcf_index,
      output_filename = output_filename,
      docker = gatk_docker,
      gatk_path = gatk_path
  }
  # Outputs that will be retained when execution is complete
  output {
    File output_vcf = MergeGVCFs.output_vcf
    File output_vcf_index = MergeGVCFs.output_vcf_index
  }
}
# TASK DEFINITIONS
task CramToBamTask {
  # Command parameters
  File ref_fasta
  File ref_fasta_index
  File ref_dict
  File input_cram
  String sample_name

  # Runtime parameters
  String docker
  Int? machine_mem_gb
  Int? disk_space_gb
  Boolean use_ssd = false
  Int? preemptible_attempts
  String samtools_path
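
  # Size the disk for the converted BAM (assumed here to be the CRAM size
  # divided by 0.60, i.e. ~1.7x), plus the reference files and 20 GB of headroom.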
  Float output_bam_size = size(input_cram, "GB") / 0.60
  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
  Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20

  command {
    set -e
    set -o pipefail
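
    # Decode the CRAM against the reference, re-encode as BAM, index it, and
    # rename the index to the <sample>.bai convention expected downstream.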
    ${samtools_path} view -h -T ${ref_fasta} ${input_cram} |
    ${samtools_path} view -b -o ${sample_name}.bam -
    ${samtools_path} index -b ${sample_name}.bam
    mv ${sample_name}.bam.bai ${sample_name}.bai
  }

  runtime {
    docker: docker
    memory: select_first([machine_mem_gb, 15]) + " GB"
    disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }

  output {
    File output_bam = "${sample_name}.bam"
    File output_bai = "${sample_name}.bai"
  }
}
# HaplotypeCaller per-sample in GVCF mode
task HaplotypeCaller {
  File input_bam
  File input_bam_index
  File interval_list
  String output_filename
  File ref_dict
  File ref_fasta
  File ref_fasta_index
  Float? contamination
  Boolean make_gvcf

  String gatk_path
  String? java_options
  String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"])

  # Runtime parameters
  String docker
  Int? mem_gb
  Int? disk_space_gb
  Boolean use_ssd = false
  Int? preemptible_attempts
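
  # Give the JVM 1 GB less than the machine memory so there is headroom for
  # native (non-heap) overhead.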
  Int machine_mem_gb = select_first([mem_gb, 7])
  Int command_mem_gb = machine_mem_gb - 1

  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
  Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20
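
  # Call variants over this interval. -contamination falls back to 0 when the
  # optional input is unset, and -ERC GVCF is emitted only when make_gvcf is true.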
  command <<<
    set -e

    ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \
      HaplotypeCaller \
      -R ${ref_fasta} \
      -I ${input_bam} \
      -L ${interval_list} \
      -O ${output_filename} \
      -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf}
  >>>

  runtime {
    docker: docker
    memory: machine_mem_gb + " GB"
    disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }

  output {
    File output_vcf = "${output_filename}"
    File output_vcf_index = "${output_filename}.tbi"
  }
}
# Merge GVCFs generated per-interval for the same sample
task MergeGVCFs {
  Array[File] input_vcfs
  Array[File] input_vcfs_indexes
  String output_filename

  String gatk_path

  # Runtime parameters
  String docker
  Int? mem_gb
  Int? disk_space_gb
  Boolean use_ssd = false
  Int? preemptible_attempts

  Int machine_mem_gb = select_first([mem_gb, 3])
  Int command_mem_gb = machine_mem_gb - 1
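
  # MergeVcfs concatenates the per-interval shards; ${sep=' --INPUT ' input_vcfs}
  # expands the array so each shard gets its own --INPUT argument.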
  command <<<
    set -e

    ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \
      MergeVcfs \
      --INPUT ${sep=' --INPUT ' input_vcfs} \
      --OUTPUT ${output_filename}
  >>>

  runtime {
    docker: docker
    memory: machine_mem_gb + " GB"
    disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }

  output {
    File output_vcf = "${output_filename}"
    File output_vcf_index = "${output_filename}.tbi"
  }
}