-
Notifications
You must be signed in to change notification settings - Fork 6
/
shortReadsqc.wdl
287 lines (254 loc) · 7.74 KB
/
shortReadsqc.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
# Short reads QC workflow
version 1.0

workflow ShortReadsQC {
    input {
        # Task containers.
        String container = "bfoster1/img-omics:0.1.9"
        String bbtools_container = "microbiomedata/bbtools:38.96"
        String workflow_container = "microbiomedata/workflowmeta:1.1.1"
        # Project identifier; colons replaced so the prefix is filesystem-safe.
        String proj
        String prefix = sub(proj, ":", "_")
        # Either one file (single/interleaved) or two files (paired R1/R2).
        Array[String] input_files
        String database = "/refdata/"
    }

    # Paired inputs: stage both read files and interleave them.
    if (length(input_files) > 1) {
        call stage_interleave {
            input:
                container = bbtools_container,
                memory = "10G",
                input_fastq1 = input_files[0],
                input_fastq2 = input_files[1]
        }
    }

    # Single input: stage it as-is.
    if (length(input_files) == 1) {
        call stage_single {
            input:
                container = container,
                input_file = input_files[0]
        }
    }

    # Estimate RQC runtime at an hour per compressed GB.
    call rqcfilter as qc {
        input:
            input_fastq = if length(input_files) > 1 then stage_interleave.reads_fastq else stage_single.reads_fastq,
            threads = "16",
            database = database,
            memory = "60G",
            container = bbtools_container
    }

    call make_info_file {
        input:
            info_file = qc.info_file,
            container = container,
            prefix = prefix
    }

    call finish_rqc {
        input:
            container = workflow_container,
            prefix = prefix,
            filtered = qc.filtered,
            filtered_stats = qc.stat,
            filtered_stats2 = qc.stat2
    }

    output {
        File filtered_final = finish_rqc.filtered_final
        File filtered_stats_final = finish_rqc.filtered_stats_final
        File filtered_stats2_final = finish_rqc.filtered_stats2_final
        File rqc_info = make_info_file.rqc_info
    }
}
task stage_single {
    # Stage a single fastq file into the task working directory, downloading
    # it when the input is an http(s) URL, otherwise symlinking (with a copy
    # fallback). Also records the task start timestamp.
    input {
        String container
        String target = "raw.fastq.gz"
        String input_file
    }
    command <<<
        set -euo pipefail
        # Capture the start time BEFORE any staging work, so `start` really is
        # the start of the task (previously it was recorded after the download).
        date --iso-8601=seconds > start.txt
        if [ $(echo "~{input_file}" | egrep -c "https*:") -gt 0 ] ; then
            wget "~{input_file}" -O "~{target}"
        else
            # Symlink when possible; fall back to a copy (e.g. across filesystems).
            ln -s "~{input_file}" "~{target}" || cp "~{input_file}" "~{target}"
        fi
    >>>
    output {
        File reads_fastq = "~{target}"
        String start = read_string("start.txt")
    }
    runtime {
        memory: "1 GiB"
        cpu: 2
        maxRetries: 1
        docker: container
    }
}
task stage_interleave {
    # Stage a paired-end read set (R1 + R2), downloading when the inputs are
    # http(s) URLs, then interleave them into a single fastq with reformat.sh
    # and verify the pairing of the result.
    input {
        String container
        String memory
        String target_reads_1 = "raw_reads_1.fastq.gz"
        String target_reads_2 = "raw_reads_2.fastq.gz"
        String output_interleaved = "raw_interleaved.fastq.gz"
        String input_fastq1
        String input_fastq2
    }
    command <<<
        set -euo pipefail
        # Capture the start time BEFORE any staging work, so `start` really is
        # the start of the task (previously it was recorded mid-command).
        date --iso-8601=seconds > start.txt
        if [ $(echo "~{input_fastq1}" | egrep -c "https*:") -gt 0 ] ; then
            wget "~{input_fastq1}" -O "~{target_reads_1}"
            wget "~{input_fastq2}" -O "~{target_reads_2}"
        else
            # Symlink when possible; fall back to a copy (e.g. across filesystems).
            ln -s "~{input_fastq1}" "~{target_reads_1}" || cp "~{input_fastq1}" "~{target_reads_1}"
            ln -s "~{input_fastq2}" "~{target_reads_2}" || cp "~{input_fastq2}" "~{target_reads_2}"
        fi
        reformat.sh -Xmx~{memory} in1=~{target_reads_1} in2=~{target_reads_2} out=~{output_interleaved}
        # Validate that read1/read2 were ordered correctly for interleaving.
        reformat.sh -Xmx~{memory} verifypaired=t in=~{output_interleaved}
    >>>
    output {
        File reads_fastq = "~{output_interleaved}"
        String start = read_string("start.txt")
    }
    runtime {
        memory: "10 GiB"
        cpu: 2
        maxRetries: 1
        docker: container
    }
}
task rqcfilter {
    # Quality-filter reads with BBTools rqcfilter2.sh: adapter trimming, quality
    # trimming, phix/host (human, dog, cat, mouse)/microbe contaminant removal,
    # and clumpify deduplication. The filterStats.txt report is also converted
    # to JSON for downstream consumption.
    input{
        File? input_fastq
        String container
        String database
        # RQCFilter reference data bundle inside the mounted database directory.
        String rqcfilterdata = database + "/RQCFilterData"
        Boolean chastityfilter_flag=true
        String? memory
        String? threads
        String filename_outlog="stdout.log"
        String filename_errlog="stderr.log"
        String filename_stat="filtered/filterStats.txt"
        String filename_stat2="filtered/filterStats2.txt"
        String filename_stat_json="filtered/filterStats.json"
        String filename_reproduce="filtered/reproduce.sh"
        # Fall back to the host's CPU count when no thread count is supplied.
        String system_cpu="$(grep \"model name\" /proc/cpuinfo | wc -l)"
        String jvm_threads=select_first([threads,system_cpu])
        String chastityfilter= if (chastityfilter_flag) then "cf=t" else "cf=f"
    }
    runtime {
        docker: container
        memory: "70 GB"
        cpu: 16
    }
    command<<<
export TIME="time result\ncmd:%C\nreal %es\nuser %Us \nsys %Ss \nmemory:%MKB \ncpu %P"
set -euo pipefail

# stdout/stderr are teed so the logs are captured as task outputs as well.
rqcfilter2.sh \
    ~{if (defined(memory)) then "-Xmx" + memory else "-Xmx60G"} \
    -da \
    threads=~{jvm_threads} \
    ~{chastityfilter} \
    jni=t \
    in=~{input_fastq} \
    path=filtered \
    rna=f \
    trimfragadapter=t \
    qtrim=r \
    trimq=0 \
    maxns=3 \
    maq=3 \
    minlen=51 \
    mlf=0.33 \
    phix=t \
    removehuman=t \
    removedog=t \
    removecat=t \
    removemouse=t \
    khist=t \
    removemicrobes=t \
    sketch \
    kapa=t \
    clumpify=t \
    barcodefilter=f \
    trimpolyg=5 \
    usejni=f \
    rqcfilterdata=~{rqcfilterdata} \
    > >(tee -a ~{filename_outlog}) \
    2> >(tee -a ~{filename_errlog} >&2)

# Convert the key=value filterStats report into JSON.
# 'Ratio' entries are floats; everything else is treated as an integer count.
python <<CODE
import json
d = dict()
with open("~{filename_stat}") as f:
    for line in f:
        line = line.rstrip()
        if not line:
            continue
        # Split on the first '=' only, in case a value contains '='.
        key, value = line.split('=', 1)
        d[key] = float(value) if 'Ratio' in key else int(value)
with open("~{filename_stat_json}", 'w') as outfile:
    json.dump(d, outfile)
CODE
>>>
    output {
        File stdout = filename_outlog
        File stderr = filename_errlog
        File stat = filename_stat
        File stat2 = filename_stat2
        File info_file = filename_reproduce
        # NOTE(review): assumes rqcfilter2 leaves exactly one *fastq.gz in
        # `filtered/` — confirm for paired outputs.
        File filtered = glob("filtered/*fastq.gz")[0]
        File json_out = filename_stat_json
    }
}
task make_info_file {
    # Build a short provenance file (<prefix>_readsQC.info) from the header of
    # the rqcfilter reproduce.sh script, appending a BBTools citation.
    input{
        File info_file
        String prefix
        String container
    }
    command<<<
        set -euo pipefail
        info_out="~{prefix}_readsQC.info"
        # Lines 2-5 of reproduce.sh describe the rqcfilter invocation; strip
        # absolute input paths, drop '#', and tag BBTools with a citation mark.
        sed -n 2,5p ~{info_file} 2>&1 | \
            perl -ne 's:in=/.*/(.*) :in=$1:; s/#//; s/BBTools/BBTools(1)/; print;' > "$info_out"
        echo -e "\n(1) B. Bushnell: BBTools software package, http://bbtools.jgi.doe.gov/" >> "$info_out"
    >>>
    output {
        File rqc_info = "~{prefix}_readsQC.info"
    }
    runtime {
        memory: "1 GiB"
        cpu: 1
        maxRetries: 1
        docker: container
    }
}
task finish_rqc {
    # Rename the QC outputs to project-prefixed filenames and generate the
    # QA stats JSON from the filter statistics.
    input {
        File filtered_stats
        File filtered_stats2
        File filtered
        String container
        String prefix
    }
    command<<<
        set -euo pipefail
        # NOTE(review): `end` is captured but never written to an output —
        # confirm whether it should be exposed like `start` in the stage tasks.
        end=`date --iso-8601=seconds`
        # Generate QA objects; symlink when possible, fall back to a copy
        # (matches the stage tasks' ln/cp pattern).
        ln -s ~{filtered} ~{prefix}_filtered.fastq.gz || cp ~{filtered} ~{prefix}_filtered.fastq.gz
        ln -s ~{filtered_stats} ~{prefix}_filterStats.txt || cp ~{filtered_stats} ~{prefix}_filterStats.txt
        ln -s ~{filtered_stats2} ~{prefix}_filterStats2.txt || cp ~{filtered_stats2} ~{prefix}_filterStats2.txt
        # Generate stats but rename some fields until the script is fixed.
        /scripts/rqcstats.py ~{filtered_stats} > stats.json
        cp stats.json ~{prefix}_qa_stats.json
    >>>
    output {
        File filtered_final = "~{prefix}_filtered.fastq.gz"
        File filtered_stats_final = "~{prefix}_filterStats.txt"
        File filtered_stats2_final = "~{prefix}_filterStats2.txt"
    }
    runtime {
        docker: container
        memory: "1 GiB"
        cpu: 1
    }
}