/*
* Copyright (c) 2017-2018, Centre for Genomic Regulation (CRG).
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* This Source Code Form is "Incompatible With Secondary Licenses", as
* defined by the Mozilla Public License, v. 2.0.
*/
/*
* 'CalliNGS-NF' - A Nextflow pipeline for variant calling with NGS data
*
* This pipeline reproduces the steps of the GATK best practices workflow
* for SNP calling on RNA-seq data:
* https://software.broadinstitute.org/gatk/guide/article?id=3891
*
* Anna Vlasova
* Emilio Palumbo
* Paolo Di Tommaso
* Evan Floden
*/
/*
* Define the default parameters
*/
params.genome = "$baseDir/data/genome.fa"
params.variants = "$baseDir/data/known_variants.vcf.gz"
params.blacklist = "$baseDir/data/blacklist.bed"
params.reads = "$baseDir/data/reads/rep1_{1,2}.fq.gz"
params.results = "results"
params.gatk = '/usr/local/bin/GenomeAnalysisTK.jar'
params.gatk_launch = "java -jar $params.gatk"
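/*
 * Any of the parameters above can be overridden on the command line, for example
 * (hypothetical paths, shown only for illustration):
 *
 *   nextflow run main.nf --reads '/data/sampleA_{1,2}.fq.gz' --results my_results
 */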
log.info """\
C A L L I N G S - N F v 1.0
================================
genome : $params.genome
reads : $params.reads
variants : $params.variants
blacklist: $params.blacklist
results : $params.results
gatk : $params.gatk
"""
/*
* Defines the reads channel and the shortcut for the GATK app path
*/
GATK = params.gatk_launch
reads_ch = Channel.fromFilePairs(params.reads)
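// fromFilePairs groups mate files by their shared prefix; with the default `params.reads`
// glob it emits tuples such as: [ 'rep1', [ rep1_1.fq.gz, rep1_2.fq.gz ] ]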
/**********
* PART 1: Data preparation
*
* Process 1A: Create a FASTA genome index (.fai) with samtools for GATK
*/
process '1A_prepare_genome_samtools' {
tag "$genome.baseName"
input:
path genome from params.genome
output:
path "${genome}.fai" into genome_index_ch
script:
"""
samtools faidx ${genome}
"""
}
/*
* Process 1B: Create a FASTA genome sequence dictionary with Picard for GATK
*/
process '1B_prepare_genome_picard' {
tag "$genome.baseName"
label 'mem_xlarge'
input:
path genome from params.genome
output:
path "${genome.baseName}.dict" into genome_dict_ch
script:
"""
PICARD=`which picard.jar`
java -jar \$PICARD CreateSequenceDictionary R= $genome O= ${genome.baseName}.dict
"""
}
/*
* Process 1C: Create STAR genome index file.
*/
process '1C_prepare_star_genome_index' {
tag "$genome.baseName"
input:
path genome from params.genome
output:
path "genome_dir" into genome_dir_ch
script:
"""
mkdir genome_dir
STAR --runMode genomeGenerate \
--genomeDir genome_dir \
--genomeFastaFiles ${genome} \
--runThreadN ${task.cpus}
"""
}
/*
* Process 1D: Create a file containing the filtered and recoded set of variants
*/
process '1D_prepare_vcf_file' {
tag "$variantsFile.baseName"
input:
path variantsFile from params.variants
path blacklisted from params.blacklist
output:
tuple \
path("${variantsFile.baseName}.filtered.recode.vcf.gz"), \
path("${variantsFile.baseName}.filtered.recode.vcf.gz.tbi") into prepared_vcf_ch
script:
"""
vcftools --gzvcf $variantsFile -c \
--exclude-bed ${blacklisted} \
--recode | bgzip -c \
> ${variantsFile.baseName}.filtered.recode.vcf.gz
tabix ${variantsFile.baseName}.filtered.recode.vcf.gz
"""
}
/*
* END OF PART 1
*********/
/**********
* PART 2: STAR RNA-Seq Mapping
*
* Process 2: Align RNA-Seq reads to the genome with STAR
*/
process '2_rnaseq_mapping_star' {
tag "$replicateId"
input:
path genome from params.genome
path genomeDir from genome_dir_ch
tuple val(replicateId), path(reads) from reads_ch
output:
tuple \
val(replicateId), \
path('Aligned.sortedByCoord.out.bam'), \
path('Aligned.sortedByCoord.out.bam.bai') into aligned_bam_ch
script:
"""
# 1st pass: align reads to the genome and collect splice junctions (SJ.out.tab)
STAR --genomeDir $genomeDir \
--readFilesIn $reads \
--runThreadN ${task.cpus} \
--readFilesCommand zcat \
--outFilterType BySJout \
--alignSJoverhangMin 8 \
--alignSJDBoverhangMin 1 \
--outFilterMismatchNmax 999
# 2nd pass: rebuild the genome index using the splice-junction table (SJ.out.tab) to improve alignments
mkdir genomeDir
STAR --runMode genomeGenerate \
--genomeDir genomeDir \
--genomeFastaFiles $genome \
--sjdbFileChrStartEnd SJ.out.tab \
--sjdbOverhang 75 \
--runThreadN ${task.cpus}
# Final read alignments
STAR --genomeDir genomeDir \
--readFilesIn $reads \
--runThreadN ${task.cpus} \
--readFilesCommand zcat \
--outFilterType BySJout \
--alignSJoverhangMin 8 \
--alignSJDBoverhangMin 1 \
--outFilterMismatchNmax 999 \
--outSAMtype BAM SortedByCoordinate \
--outSAMattrRGline ID:$replicateId LB:library PL:illumina PU:machine SM:GM12878
# Index the BAM file
samtools index Aligned.sortedByCoord.out.bam
"""
}
/*
* END OF PART 2
******/
/**********
* PART 3: GATK Prepare Mapped Reads
*
* Process 3: Split reads that contain Ns in their CIGAR string.
* Creates k+1 new reads (where k is the number of N cigar elements)
* that correspond to the segments of the original read beside/between
* the splicing events represented by the Ns in the original CIGAR.
*/
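/*
 * Illustrative example (not from the pipeline data): a spliced 100 bp read with
 * CIGAR 50M2000N50M contains k = 1 N element, so SplitNCigarReads emits
 * k + 1 = 2 reads, one for each 50M exonic segment flanking the 2000 bp gap.
 */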
process '3_rnaseq_gatk_splitNcigar' {
tag "$replicateId"
label 'mem_large'
input:
path genome from params.genome
path index from genome_index_ch
path genome_dict from genome_dict_ch
tuple val(replicateId), path(bam), path(index) from aligned_bam_ch
output:
tuple val(replicateId), path('split.bam'), path('split.bai') into splitted_bam_ch
script:
"""
# SplitNCigarReads and reassign mapping qualities
$GATK -T SplitNCigarReads \
-R $genome -I $bam \
-o split.bam \
-rf ReassignOneMappingQuality \
-RMQF 255 -RMQT 60 \
-U ALLOW_N_CIGAR_READS \
--fix_misencoded_quality_scores
"""
}
/*
* END OF PART 3
******/
/***********
* PART 4: GATK Base Quality Score Recalibration Workflow
*
* Process 4: Recalibrate base quality scores to correct for systematic errors,
* then select uniquely mapped alignments and index them
*
*/
process '4_rnaseq_gatk_recalibrate' {
tag "$replicateId"
label 'mem_large'
input:
path genome from params.genome
path index from genome_index_ch
path dict from genome_dict_ch
tuple val(replicateId), path(bam), path(index) from splitted_bam_ch
tuple path(variants_file), path(variants_file_index) from prepared_vcf_ch
output:
tuple \
val(sampleId), \
path("${replicateId}.final.uniq.bam"), \
path("${replicateId}.final.uniq.bam.bai") into (final_output_ch, bam_for_ASE_ch)
script:
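// Derive the sample id by stripping a trailing replicate digit,
// e.g. replicates 'rep1' and 'rep2' (hypothetical names) both become sample 'rep'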
sampleId = replicateId.replaceAll(/[12]$/,'')
"""
# Base recalibration to detect systematic errors in base quality scores
$GATK -T BaseRecalibrator \
--default_platform illumina \
-cov ReadGroupCovariate \
-cov QualityScoreCovariate \
-cov CycleCovariate \
-knownSites ${variants_file} \
-cov ContextCovariate \
-R ${genome} -I ${bam} \
--downsampling_type NONE \
-nct ${task.cpus} \
-o final.rnaseq.grp
$GATK -T PrintReads \
-R ${genome} -I ${bam} \
-BQSR final.rnaseq.grp \
-nct ${task.cpus} \
-o final.bam
# Select only unique alignments, no multimaps
(samtools view -H final.bam; samtools view final.bam| grep -w 'NH:i:1') \
|samtools view -Sb - > ${replicateId}.final.uniq.bam
# Index BAM files
samtools index ${replicateId}.final.uniq.bam
"""
}
/*
* END OF PART 4
******/
/***********
* PART 5: GATK Variant Calling
*
* Process 5: Call variants with GATK HaplotypeCaller.
* Calls SNPs and indels simultaneously via local de-novo assembly of
* haplotypes in an active region.
* Filter called variants with GATK VariantFiltration.
*/
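/*
 * Note: `final_output_ch.groupTuple()` gathers all replicate BAMs that share a
 * sample id (e.g. hypothetical replicates 'rep1' and 'rep2' collapse to 'rep'),
 * so HaplotypeCaller runs once per sample over the combined alignments.
 */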
process '5_rnaseq_call_variants' {
tag "$sampleId"
label 'mem_large'
input:
path genome from params.genome
path index from genome_index_ch
path dict from genome_dict_ch
tuple val(sampleId), path(bam), path(bai) from final_output_ch.groupTuple()
output:
tuple val(sampleId), path('final.vcf') into vcf_files
script:
"""
# fix absolute path in dict file
sed -i 's@UR:file:.*${genome}@UR:file:${genome}@g' $dict
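# Write the grouped replicate BAM paths into a list file; GATK 3.x accepts a '*.list' of inputs via -I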
echo "${bam.join('\n')}" > bam.list
# Variant calling
$GATK -T HaplotypeCaller \
-R $genome -I bam.list \
-dontUseSoftClippedBases \
-stand_call_conf 20.0 \
-o output.gatk.vcf.gz
# Variant filtering
$GATK -T VariantFiltration \
-R $genome -V output.gatk.vcf.gz \
-window 35 -cluster 3 \
-filterName FS -filter "FS > 30.0" \
-filterName QD -filter "QD < 2.0" \
-o final.vcf
"""
}
/*
* END OF PART 5
******/
/***********
* PART 6: Post-process variants file and prepare for Allele-Specific Expression and RNA Editing Analysis
*
* Process 6A: Post-process the VCF result
*/
process '6A_post_process_vcf' {
tag "$sampleId"
publishDir "$params.results/$sampleId"
input:
tuple val(sampleId), path('final.vcf') from vcf_files
tuple path('filtered.recode.vcf.gz'), path('filtered.recode.vcf.gz.tbi') from prepared_vcf_ch
output:
tuple val(sampleId), path('final.vcf'), path('commonSNPs.diff.sites_in_files') into vcf_and_snps_ch
script:
'''
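# Keep PASS variants with read depth (DP) >= 8, then compare those sites against the
# filtered known-variants file (vcftools --diff-site)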
grep -v '#' final.vcf | awk '$7~/PASS/' |perl -ne 'chomp($_); ($dp)=$_=~/DP\\=(\\d+)\\;/; if($dp>=8){print $_."\\n"};' > result.DP8.vcf
vcftools --vcf result.DP8.vcf --gzdiff filtered.recode.vcf.gz --diff-site --out commonSNPs
'''
}
/*
* Process 6B: Prepare variants file for allele specific expression (ASE) analysis
*/
process '6B_prepare_vcf_for_ase' {
tag "$sampleId"
publishDir "$params.results/$sampleId"
input:
tuple val(sampleId), path('final.vcf'), path('commonSNPs.diff.sites_in_files') from vcf_and_snps_ch
output:
tuple val(sampleId), path('known_snps.vcf') into vcf_for_ASE
path 'AF.histogram.pdf' into gghist_pdfs
script:
'''
awk 'BEGIN{OFS="\t"} $4~/B/{print $1,$2,$3}' commonSNPs.diff.sites_in_files > test.bed
vcftools --vcf final.vcf --bed test.bed --recode --keep-INFO-all --stdout > known_snps.vcf
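# From each genotype column take the 2nd ':'-separated sub-field (the AD ref,alt depths in
# GATK output) and compute the alternate-allele fraction alt/(ref+alt), excluding sites where it equals 1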
grep -v '#' known_snps.vcf | awk -F '\\t' '{print $10}' \
|awk -F ':' '{print $2}'|perl -ne 'chomp($_); \
@v=split(/\\,/,$_); if($v[0]!=0 ||$v[1] !=0)\
{print $v[1]/($v[1]+$v[0])."\\n"; }' |awk '$1!=1' \
>AF.4R
gghist.R -i AF.4R -o AF.histogram.pdf
'''
}
/*
* Group data for allele-specific expression.
*
* The `bam_for_ASE_ch` channel emits tuples with the following structure, holding the final BAM/BAI files:
*
* ( sample_id, file_bam, file_bai )
*
* The `vcf_for_ASE` channel emits tuples having the following structure, holding the VCF file:
*
* ( sample_id, file_vcf )
*
* The BAMs are grouped by sample id and joined with the VCF that shares the same id. The result
* is a channel named `grouped_vcf_bam_bai_ch` that emits the following tuples:
*
* ( sample_id, file_vcf, List[file_bam], List[file_bai] )
*/
bam_for_ASE_ch
.groupTuple()
.phase(vcf_for_ASE)
.map{ left, right ->
def sampleId = left[0]
def bam = left[1]
def bai = left[2]
def vcf = right[1]
tuple(sampleId, vcf, bam, bai)
}
.set { grouped_vcf_bam_bai_ch }
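/*
 * For example (hypothetical replicate names), the channel would emit:
 * ( 'rep', known_snps.vcf, [rep1.final.uniq.bam, rep2.final.uniq.bam], [rep1.final.uniq.bam.bai, rep2.final.uniq.bam.bai] )
 */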
/*
* Process 6C: Allele-Specific Expression analysis with GATK ASEReadCounter.
* Calculates allele counts at a set of positions after applying
* filters that are tuned for enabling allele-specific expression
* (ASE) analysis
*/
process '6C_ASE_knownSNPs' {
tag "$sampleId"
publishDir "$params.results/$sampleId"
label 'mem_large'
input:
path genome from params.genome
path index from genome_index_ch
path dict from genome_dict_ch
tuple val(sampleId), path(vcf), path(bam), path(bai) from grouped_vcf_bam_bai_ch
output:
path "ASE.tsv"
script:
"""
echo "${bam.join('\n')}" > bam.list
$GATK -R ${genome} \
-T ASEReadCounter \
-o ASE.tsv \
-I bam.list \
-sites ${vcf}
"""
}
/*
* END OF PART 6
******/