#! /usr/bin/env nextflow
// Copyright (C) 2020 IARC/WHO
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
params.help = null
log.info ""
log.info "--------------------------------------------------------"
log.info " Imputation 1.0 : Pipeline for the imputation of a target dataset against a reference "
log.info "--------------------------------------------------------"
log.info "Copyright (C) IARC/WHO"
log.info "This program comes with ABSOLUTELY NO WARRANTY; for details see LICENSE"
log.info "This is free software, and you are welcome to redistribute it"
log.info "under certain conditions; see LICENSE for details."
log.info "--------------------------------------------------------"
log.info ""
if (params.help) {
log.info "--------------------------------------------------------"
log.info " USAGE : nextflow run IARCbioinfo/Imputation-nf -r v1.0 -profile singularity --target my_target --input data/ --output result/ "
log.info "--------------------------------------------------------"
log.info ""
log.info "nextflow run IARCbioinfo/Imputation-nf [-r vX.X -profile singularity] [OPTIONS]"
log.info ""
log.info "Mandatory arguments:"
log.info ""
log.info "--target pattern pattern of the target dataset which do the link with the plink files (.bed/.bim./fam)"
log.info ""
log.info "Optional arguments:"
log.info "--<OPTION> <TYPE> <DESCRIPTION>"
log.info "--input FOLDER Folder where you can find your data to run the pipeline"
log.info "--script FOLDER Folder where you can find the auxiliary scripts of the pipeline"
log.info "--output FOLDER Folder in which the pipeline outputs while be saved"
log.info "--VCFref FOLDER Folder to use as VCF reference"
log.info "--BCFref FOLDER Folder to use as BCF reference"
log.info "--M3VCFref FOLDER Folder to use as M3VCF reference"
log.info "--legend FILE File to use as .legend"
log.info "--fasta FILE File to use as fasta reference"
log.info "--chain FILE File to use as liftover conversion"
log.info "--geno1 FLOAT Value for the first genotyping call rate plink option"
log.info "--geno2 FLOAT Value for the second genotyping call rate plink option"
log.info "--maf FLOAT Minor Allele Frequencie thresold for the data filtering step"
log.info "--pihat FLOAT PI_HAT thresold for the data filtering step"
log.info "--hwe FLOAT Hardy-Weinberg Equilibrium thresold for the data filtering step"
log.info "--conversion [hg38/hg18/hg19] Option to convert data to hg38 version of the genome. Choose 'hg18' to convert your data from hg18 to hg38 or 'hg19' to convert your data from hg19 to hg38. Standard value is 'hg38'."
log.info "--cloud [off/on] Option to run the imputation on cloud Michighan and/or TOPMed server. You have to give your Michighan and/or TOPMed token to run it (Add --token_Michighan and/or --token_TOPMed)."
log.info "--token_Michighan FILE File where you can find your Michighan token as string"
log.info "--token_TOPMed FILE File where you can find your TOPMed token as string"
log.info "--QC_cloud FOLDER Folder where you can find the VCF file imputed from the Michighan or TOPMed server"
log.info ""
log.info "Flags:"
log.info "--<FLAG> <DESCRIPTION>"
log.info ""
exit 0
} else {
/* Software information */
log.info "help: ${params.help}"
}
// -- Option :
params.target = null
params.origin = "hapmap_r23a"
params.geno1 = 0.03
params.geno2 = 0.03
params.maf = 0.01
params.pihat = 0.185
params.hwe = 1e-8
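// The thresholds above are defaults and can be overridden on the command line,
// e.g. (illustrative values only):
// nextflow run IARCbioinfo/Imputation-nf -profile singularity --target my_target --input data/ --output result/ --maf 0.05 --hwe 1e-6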
// -- Path :
params.input = null
params.output = null
params.targetDir = params.input+params.target+'/'
params.folder = params.input+'files/'
params.legend = params.folder+'ALL.chr_GRCh38.genotypes.20170504.legend'
legend_file = file( params.legend )
params.fasta = params.folder+'GRCh38_full_analysis_set_plus_decoy_hla.fa'
params.fasta_fai = params.folder+'GRCh38_full_analysis_set_plus_decoy_hla.fa.fai'
params.originDir = params.folder+params.origin+'/'
params.ref=params.folder+"ref/"
params.VCFref = params.ref+"vcf/*"
params.BCFref = params.ref+"bcf/"
params.M3VCFref = params.ref+"m3vcf/"
params.conversion = "hg38"
params.chain = params.folder
params.cloud = "off"
params.token_Michighan = null
params.token_TOPMed = null
params.imputationbot_password = null
params.QC_cloud = null
// -- Pipeline :
//if(params.QC_cloud==null){
process UpdateHG38{
input:
file data from Channel.fromPath(params.targetDir+'*').collect()
file data from Channel.fromPath(params.folder+'HRC-1000G-check-bim-NoReadKey.pl').collect()
output:
file ('*-updated.{bed,bim,fam}') into TargetUpdate
file ('*-updated.bim') into TargetQC2
shell:
'''
############################################################################################
## -- 0 : Update the build of the target dataset : hg18/hg19 --> hg38
if [ !{params.conversion} != "hg38" ] ; then
awk '{print "chr" $1, $4 -1, $4, $2 }' !{params.target}.bim | sed 's/chr23/chrX/' | sed 's/chr24/chrY/' > dataset.tolift
if [ !{params.conversion} == "hg18" ] ; then
liftOver dataset.tolift !{params.chain}hg18ToHg38.over.chain dataset1 dataset_NCBI36.unMapped
fi
if [ !{params.conversion} == "hg19" ] ; then
liftOver dataset.tolift !{params.chain}hg19ToHg38.over.chain dataset1 dataset_NCBI36.unMapped
fi
awk '{print $4}' dataset1 > dataset1.snps
plink --bfile !{params.target} --extract dataset1.snps --make-bed --out dataset1
mkdir old
mv !{params.target}* old/
awk '{print $4, $3}' dataset1 > dataset1.pos
plink --bfile dataset1 --update-map dataset1.pos --make-bed --out !{params.target}
fi
plink --freq --bfile !{params.target} --allow-no-sex --make-bed --out dataset3
perl HRC-1000G-check-bim-NoReadKey.pl -b !{params.target}.bim -f dataset3.frq -r !{params.legend} -g -x -n
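# Run-plink.sh is generated by the Rayner HRC-1000G check script above and contains the
# plink correction commands; the grep below drops the step(s) forcing reference alleles
# (lines containing "real-ref-alleles") before applying the remaining updates.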
grep -v "real-ref-alleles" Run-plink.sh> Run-plink-update.sh
bash Run-plink-update.sh
'''}
process Admixture{
publishDir params.output+params.target+'/admixture/', mode: 'copy',
saveAs: {filename ->
if (filename == "target4.bim") null
else if (filename == "target4.bed") null
else if (filename == "target4.fam") null
else filename
}
input:
file data from Channel.fromPath(params.folder+'relationships_w_pops_121708.txt').collect()
file data from Channel.fromPath(params.originDir+"*").collect()
file data from TargetUpdate.collect()
output:
file ('target4.{bed,bim,fam}') into Merge
file ('target4.{bed,bim,fam}') into Merge2
file ('out_pop_admixture/') into Admixture
file ('out_pop_admixture/') into Admixture2
file ('admixture_results_withGroups.txt') into Target
shell:
'''
## -- 1 : Retrieve common SNPs between SNPs in the reference list and the target data.
## -- Ref -- ##
awk '{print $2}' !{params.origin}.bim | sort > ref_SNPs.txt
## -- Target -- ##
grep -Fwf AIM_list.txt !{params.target}-updated.bim | awk '{print $2}' > target_SNPs.txt #-updated
plink --bfile !{params.target}-updated --extract target_SNPs.txt --make-bed --out target #-updated
## -- Get common SNPs -- ##
grep -Fwf <(cat ref_SNPs.txt) <(awk '{print $2}' target.bim) > target_common_SNPs.txt
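# Target SNP IDs may carry ':'-delimited suffixes (the -w match treats ':' as a word
# boundary); taking field 1 below recovers the bare reference ID for each common SNP.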
awk -F ":" '{print $1}' target_common_SNPs.txt > ref_common_SNPs.txt
paste target_common_SNPs.txt ref_common_SNPs.txt > change_ID.txt
## -- 2 : Extract the genotypes associated to these common SNPs + merge of the dataset
plink --bfile !{params.origin} --extract target_common_SNPs.txt --make-bed --out ref1
plink --bfile target --extract target_common_SNPs.txt --make-bed --out target1
#plink --bfile target1 --update-map change_ID.txt --update-name --make-bed --out target2
plink --bfile target1 --bmerge ref1 --allow-no-sex --make-bed --out merge1
if [ -e merge1-merge.missnp ]
then
plink --bfile ref1 --exclude merge1-merge.missnp --make-bed --out ref2
plink --bfile target1 --exclude merge1-merge.missnp --make-bed --out target2
plink --bfile ref2 --bmerge target2 --allow-no-sex --make-bed --out merge
else
mv merge1.fam merge.fam
mv merge1.bed merge.bed
mv merge1.bim merge.bim
fi
awk '{print $2}' merge.fam > all_samples.txt
## -- 3 : Associate each reference sample with its population of origin + run admixture
sort -k2 relationships_w_pops_121708.txt > relationships_w_pops_121708_2.txt
awk '{print $2}' !{params.origin}.fam | sort -k1,1 > ref_ind.txt
join -11 -22 ref_ind.txt relationships_w_pops_121708_2.txt | awk '{print $1, $7}' | awk '{if( $2 == "CEU") print $0,1,1; else { if($2 == "YRI") print $0,2,1; else {print $0,3,1}}}' > ind_pop.txt
Rscript !{baseDir}/bin/create_pop_file.r merge.fam ind_pop.txt merge.pop
K=3
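# Supervised ADMIXTURE with K=3 ancestral populations (CEU / YRI / CHB+JPT reference
# samples, labelled in merge.pop); --cv reports cross-validation error, -j40 uses 40 threads.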
admixture --cv merge.bed $K --supervised -j40 | tee log${K}.out
Rscript !{baseDir}/bin/process_admixture.r !{params.target}-updated
############################################################################################
## -- 4 : First filtering step
plink --bfile !{params.target}-updated --geno !{params.geno1} --make-bed --out target3
plink --bfile target3 --maf !{params.maf} --make-bed --out target4
'''}
process Filtering1{
input:
file data from Merge.collect()
file data from Admixture.collect()
val rspop from Channel.from("CEU","CHB_JPT","YRI")
output:
file ('*.{het,imiss.txt,genome,sexcheck}') into QC
file ('target_*.bim') into Bim
file ('target_*.{bed,bim,fam}') into TargetFilter
shell:
'''
pop=!{rspop}
subpop=out_pop_admixture/1000G_checking_$pop/samples_$pop.txt
## -- 5 : Keep only the samples in the ancestry group $pop
plink --bfile target4 --keep ${subpop} --make-bed --out target5_${pop}
## -- 6 : Perform sex checking
plink --bfile target5_${pop} --make-bed --merge-x no-fail --out target6_${pop}
plink --bfile target6_${pop} --make-bed --split-x b37 no-fail --out target_${pop}
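# LD pruning: sliding window of 50 SNPs, step of 5 SNPs, removing one SNP of any pair with r^2 > 0.2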
plink --bfile target_${pop} --indep-pairwise 50 5 0.2 --out target_Independent_SNPs_${pop} # keep independent SNPs
plink --bfile target_${pop} --extract target_Independent_SNPs_${pop}.prune.in --check-sex --out target_sexCheck_${pop}
## -- 7 : Test relatedness between samples
plink --bfile target_${pop} --extract target_Independent_SNPs_${pop}.prune.in --genome --out target_rel_${pop} --min !{params.pihat}
## -- 8 : Test heterozygosity and missing rates
plink --bfile target_${pop} --het --chr 1-22 --out het_${pop}
echo "FID IID obs_HOM N_SNPs prop_HET" > het_${pop}.txt
awk 'NR>1 {print $1,$2,$3,$5,($5-$3)/$5}' het_${pop}.het >> het_${pop}.txt
plink --bfile target_${pop} --missing --out miss_${pop}
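# Join the het and missingness tables on FID+IID; $6 of the .imiss file is F_MISS (per-sample missing call rate)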
awk 'NR==FNR {a[$1,$2]=$5;next}($1,$2) in a{print $1,$2,$6,a[$1,$2]}' het_${pop}.txt miss_${pop}.imiss > het_${pop}.imiss.txt
'''}
process QC1{
publishDir params.output+params.target+'/QC1/', mode: 'copy'
input:
file data from QC.collect()
file target from Target.collect()
output:
file ('postGenotyping_samples_QCs.pdf') into FigureQC1
file ('selected_samples_afterSamplesQCsChecking.txt') into TableQC1
shell:
'''
## -- 9 : Figure QC1
Rscript !{baseDir}/bin/after_genotyping_qc.r
'''}
process Filtering2{
input:
file data from TargetFilter.collect()
file data from Channel.fromPath(params.folder+'HRC-1000G-check-bim-NoReadKey.pl').collect()
val rspop from Channel.from("CEU","CHB_JPT","YRI")
output:
file ('withFreqFiltering_*') into DirFiltering
file ('target_hwe_*.bim') into HWresult
file ('target_freq_*.frq') into FreqResult
file ('ID-target_*-1000G.txt') into FreqResultId
shell:
'''
pop=!{rspop}
if [ $pop = "CEU" ]; then
pop2="EUR"
elif [ $pop = "CHB_JPT" ]; then
pop2="EAS"
else
pop2="AFR"
fi
## -- 10 : AF based filter
plink --freq --bfile target_${pop} --output-chr chr26 --out target_freq_${pop}
perl HRC-1000G-check-bim-NoReadKey.pl -b target_${pop}.bim -f target_freq_${pop}.frq -r !{params.legend} -g -p ${pop2} -x
mkdir withFreqFiltering_${pop}
cp *1000G* Run-plink.sh withFreqFiltering_${pop}
## -- 11 : HWE filtering
plink --bfile target_${pop} --geno !{params.geno2} --make-bed --out target_geno_${pop}
plink --bfile target_geno_${pop} --hwe !{params.hwe} --make-bed --out target_hwe_${pop}
'''}
process Make_SNP_Filtering{
publishDir params.output+params.target+'/SNP_filtering/', mode: 'copy'
input:
file data from DirFiltering.collect()
file data from HWresult.collect()
file data from Bim.collect()
output:
file ('filtered_snps.txt') into SNPsFilter
file ('filtered_snps.txt') into SNPsFilter2
file ('*.pdf') into resultFigure
shell:
'''
## -- 12 : Create list of SNPs to filter
Rscript !{baseDir}/bin/afterGenotyping_SNPs_filtering.r
'''}
process Filtering3{
input:
file data from Merge2.collect()
file data from Channel.fromPath(params.folder+'HRC-1000G-check-bim-NoReadKey.pl').collect()
file data from SNPsFilter.collect()
output:
file ('target5-updated-chr*') into TargetChr
shell:
'''
## -- 13 : Removing SNPs
plink --bfile target4 --exclude filtered_snps.txt --make-bed --out target5
## -- 14 : filter
plink --freq --bfile target5 --out target6
perl HRC-1000G-check-bim-NoReadKey.pl -b target5.bim -f target6.frq -r !{params.legend} -g -x -n
bash Run-plink.sh
'''}
process QC2{
publishDir params.output+params.target+'/QC2/', mode: 'copy'
input:
file data from SNPsFilter2.collect()
file data from FreqResult.collect()
file data from FreqResultId.collect()
val rspop from Channel.from("CEU","CHB_JPT","YRI")
file data from Channel.fromPath(params.folder+'HRC-1000G-check-bim-NoReadKey.pl').collect()
file data from TargetQC2.collect()
output:
file ('*.pdf') into FigureQC2
shell:
'''
head -n1 !{legend_file} >> ref_freq_withHeader.txt
grep -Fwf <(awk '{print $2}' !{params.target}-updated.bim) <(cat !{legend_file}) > ref_freq.txt
cat ref_freq.txt >> ref_freq_withHeader.txt
pop=!{rspop}
if [ $pop = "CEU" ]; then
pop2="EUR"
elif [ $pop = "CHB_JPT" ]; then
pop2="EAS"
else
pop2="AFR"
fi
## -- 15 : Figure QC2
Rscript !{baseDir}/bin/preImputation_QC_plots.r ${pop} ${pop2}
'''}
process Filtering4{
input:
file data from TargetChr.collect()
file data from Channel.fromPath(params.folder+'checkVCF.py').collect()
file data from Channel.fromPath(params.fasta+'*').collect()
val chromosome from 1..22
output:
file ('*-REFfixed.vcf.gz') into FilterFinal
file ('*-REFfixed.vcf.gz') into FilterFinal2
file ('*-REFfixed.vcf.gz') into FilterFinal3
shell:
'''
chr=!{chromosome}
export TMPDIR=/tmp/
## -- 16 : Remove ambiguous strand/unknown SNPs
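# A/T and C/G SNPs are strand-ambiguous (the alleles are their own complements), so they
# cannot be reliably matched to the reference strand and are dropped here.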
awk '{ if (($5=="T" && $6=="A")||($5=="A" && $6=="T")||($5=="C" && $6=="G")||($5=="G" && $6=="C")) print $2, "ambig" ; else print $2 ;}' target5-updated-chr${chr}.bim | grep -v ambig | grep -v -e --- | sort -u > NonAmbiguous${chr}.snplist.txt
plink --bfile target5-updated-chr${chr} --extract NonAmbiguous${chr}.snplist.txt --output-chr chr26 --make-bed --out target6_chr${chr}
## -- 17 : Create VCF
plink --bfile target6_chr${chr} --output-chr chr26 --recode vcf --out target6_chr${chr}_vcf
bcftools sort target6_chr${chr}_vcf.vcf | bgzip -c > chr${chr}.vcf.gz
## -- 18 : Check SNPs
python2 checkVCF.py -r !{params.fasta} -o after_check_${chr} chr${chr}.vcf.gz
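# bcftools norm --check-ref ws: warn (w) about and set/fix (s) REF alleles against the fasta;
# bcftools view -m 2 -M 2 then keeps biallelic sites only.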
bcftools norm --check-ref ws -f !{params.fasta} chr${chr}.vcf.gz | bcftools view -m 2 -M 2 | bgzip -c > chr${chr}-REFfixed.vcf.gz
python2 checkVCF.py -r !{params.fasta} -o after_check2_${chr} chr${chr}-REFfixed.vcf.gz
'''}
// }
//////////////////////////////////////////////////
if(params.cloud=="off"){
if(params.QC_cloud==null){
process Make_Chunks{
input:
val chromosome from 1..22
file data from FilterFinal.collect()
file data from Channel.fromPath(params.folder+'HRC-1000G-check-bim-NoReadKey.pl').collect()
output:
env chunks into NbChunk
env chunks into NbChunk2
tuple env(chunks), val(chromosome) into InfoChrChunk
file ('*.txt') into ChunkSplit
shell:
'''
## -- 19 : Create Chunks
chr=!{chromosome}
bcftools index -f chr${chr}-REFfixed.vcf.gz
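# bcftools isec -n +2 keeps sites present in both the target and the reference panel;
# create_chunks.r then splits these shared sites into chunks (chunk_split_chr${chr}.txt).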
bcftools isec -n +2 chr${chr}-REFfixed.vcf.gz !{params.BCFref}ALL.chr${chr}_GRCh38.genotypes.20170504.bcf | bgzip -c > isec_chr_${chr}.vcf.gz
Rscript !{baseDir}/bin/create_chunks.r ${chr}
chunks=$(wc -l chunk_split_chr${chr}.txt | awk '{print $1}')
'''}
process Make_Multiprocessing{
input:
val merge from InfoChrChunk.toList()
output:
file 'multiprocess.txt' into NbChr
file 'multiprocess.txt' into NbChr2
shell:
'''
#!/usr/bin/env python3
## -- 20 : Preparation of multiprocessing for imputation
file=open('multiprocess.txt','w')
liste=!{merge}
for tuples in liste:
    for i in range(int(tuples[0])):  # chunk count may interpolate as a string, hence int()
        file.write(str(tuples[1]) + '\\n')
file.close()
'''}
process Local_Imputation{
input:
val chunks from NbChunk.map{1.."$it".toInteger()}.flatten()
val chromosomes from NbChr.splitText()
file data from Channel.fromPath(params.folder+'HRC-1000G-check-bim-NoReadKey.pl').collect()
file data from FilterFinal2.collect()
file data from ChunkSplit.collect()
output:
file '*imputed.dose.vcf.gz*' into FileVCF
shell:
'''
chr=!{chromosomes}
chunk=!{chunks}
echo "chr: ${chr} n_chunks: ${chunk}"
cpu=1
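# chunk_split_chr${chr}.txt holds one start/end pair per line; row ${chunk} gives this task's interval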
start=$(awk '{print $1}' <(awk 'NR == c' c="${chunk}" chunk_split_chr${chr}.txt))
end=$(awk '{print $2}' <(awk 'NR == c' c="${chunk}" chunk_split_chr${chr}.txt))
## -- 21 : Phasing
bcftools index -f chr${chr}-REFfixed.vcf.gz
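# Phase the chunk with Eagle against the reference BCF; --bpFlanking 5000000 adds a 5 Mb
# buffer on each side of the chunk to avoid phasing edge effects.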
eagle --vcfRef !{params.BCFref}ALL.chr${chr}_GRCh38.genotypes.20170504.bcf --vcfTarget chr${chr}-REFfixed.vcf.gz --vcfOutFormat v --geneticMapFile /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz --outPrefix chr_${chr}_chunk${chunk}.phased --bpStart ${start} --bpEnd ${end} --bpFlanking 5000000 --chrom chr${chr} --numThreads ${cpu} > chr_${chr}_chunk${chunk}_phasing.logphase
## -- 22 : Imputation
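# Impute with Minimac4 against the m3vcf panel; --window 500000 adds a 500 kb buffer around
# the region, and --format GT,DS,GP outputs genotypes, dosages and genotype probabilities.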
minimac4 --refHaps !{params.M3VCFref}ALL.chr${chr}.m3vcf.gz --haps chr_${chr}_chunk${chunk}.phased.vcf --prefix chr_${chr}_chunk${chunk}.imputed --allTypedSites --format GT,DS,GP --cpus ${cpu} --chr chr${chr} --start $start --end $end --window 500000 > chr_${chr}_chunk${chunk}.logimpute
bcftools index -f chr_${chr}_chunk${chunk}.imputed.dose.vcf.gz
'''}
process Concatenation{
publishDir params.output+params.target+'/Result_Imputation/', mode: 'copy'
cpus 16
input:
val chromosomes from 1..22
file data from FileVCF.collect()
output:
file '*dose.vcf.gz' into Imputation
shell:
'''
chr=!{chromosomes}
mkdir chr_${chr}
ls chr_${chr}_chunk*.imputed.dose.vcf.gz > chr_${chr}_imp_res.txt
## -- 23 : Concatenation
#bcftools concat --threads 16 -f chr_${chr}_imp_res.txt -Ou | bcftools sort --temp-dir chr_${chr} -Ou | bgzip -c > chr${chr}.dose.vcf.gz
bcftools concat --threads 16 -f chr_${chr}_imp_res.txt -Ou | bgzip -c > chr${chr}.dose.vcf.gz
'''}
}
}
//////////////////////////////////////////////////
if(params.cloud=="on"){
if (params.token_Michighan){
process Michighan_Imputation{
publishDir params.output+params.target+'/Result_Imputation/', mode: 'move'
input:
file data from Channel.fromPath(params.imputationbot_password).collect()
file data from FilterFinal2.collect()
output:
file 'job-*/local/*dose.vcf.gz' into Imputation_dose
// file '*info.gz' into Imputation_info
// file 'stat/' into Imputation_stat
// file 'log/' into Imputation_log
when:
params.token_Michighan != null
shell:
'''
pw=$(cat !{params.imputationbot_password})
token=$(cat !{params.token_Michighan})
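# imputationbot registers the server instance with the API token, then submits the VCFs
# and downloads the (password-encrypted) results once the job completes.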
imputationbot add-instance https://imputationserver.sph.umich.edu ${token}
imputationbot impute --files chr*-REFfixed.vcf.gz --refpanel hrc-r1.1 --build hg38 --autoDownload --password ${pw} --population mixed
'''
}
}
if (params.token_TOPMed){
process TOPMed_Imputation{
publishDir params.output+params.target+'/Result_Imputation/', mode: 'move', pattern: "*dose.vcf.gz"
input:
file data from Channel.fromPath(params.imputationbot_password).collect()
file data from FilterFinal2.collect()
output:
file 'job-*/local/*dose.vcf.gz' into Imputation_dose
// file '*info.gz' into Imputation_info
// file 'stat/' into Imputation_stat
// file 'log/' into Imputation_log
when:
params.token_TOPMed != null
shell:
'''
pw=$(cat !{params.imputationbot_password})
token=$(cat !{params.token_TOPMed})
imputationbot add-instance https://imputation.biodatacatalyst.nhlbi.nih.gov ${token}
imputationbot impute --files chr*-REFfixed.vcf.gz --refpanel topmed-r2 --build hg38 --autoDownload --password ${pw} --population mixed
'''
}
}
process QC3_sh{
input:
val population from('ALL','CEU','YRI','CHB_JPT')
each chromosome from 1..22
file data from Channel.fromPath(params.output+params.target+'/Result_Imputation/*dose.vcf.gz').collect()
file data from Admixture2.collect()
file data from Channel.fromPath(params.BCFref+'/*').collect()
output:
file '*.{txt,frq}' into PostImputation_QC_sh_result
// file '*dose.vcf.gz' into Imputation_dose2
shell:
'''
pop=!{population}
chr=!{chromosome}
## -- 24 : QC3
bash !{baseDir}/bin/postImputation_QC.sh ${chr} ${pop}
'''
}
process QC3_R{
publishDir params.QC_cloud==null ? params.output+params.target+'/QC3/' : params.output+params.target+'/QC3_cloud/', mode: 'copy'
cpus 22
input:
val population from('ALL','CEU','YRI','CHB_JPT')
file data from PostImputation_QC_sh_result.collect()
output:
file '*v2.{txt,pdf}' into PostImputation_QC_R_result
file '*CHR.pdf' into PostImputation_QC_R_result2
shell:
'''
pop=!{population}
Rscript !{baseDir}/bin/postImputation_QC_plots.r ${pop} 0.3
'''
}
}