-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathprepare_contamination.nf
112 lines (98 loc) · 3.94 KB
/
prepare_contamination.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
/*
 * Download the reference FASTA for one supported host code and re-compress it
 * with bgzip.
 *
 * Input:  host - short host code; one of hsa, mmu, cli, csa, gga, eco, sc2, t2t.
 * Output: "${host}.fa.gz" - the downloaded sequence, bgzip-compressed.
 *
 * When params.cloudProcess is set the result is published to
 * "${params.databases}/hosts"; otherwise that directory is used as storeDir.
 */
process download_host {
    label 'minimap2'

    if (params.cloudProcess) {
        publishDir "${params.databases}/hosts", mode: params.publish_dir_mode, pattern: "*.fa.gz"
    }
    else {
        storeDir "${params.databases}/hosts"
    }

    input:
    val host

    output:
    path "${host}.fa.gz"

    script:
    """
    # Make a failing zcat (missing or truncated download) fail the whole task
    # instead of being masked by a successful bgzip at the end of the pipe.
    set -o pipefail

    # fetch URL DEST: try wget first and fall back to curl.
    # curl gets --fail so an HTTP error page is never saved as the genome, and
    # --location so it follows redirects.
    fetch() {
        wget "\$1" -O "\$2" || curl --fail --location "\$1" -o "\$2"
    }

    case $host in
        hsa)
            fetch 'ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz' host-temp.fa.gz
            ;;
        mmu)
            fetch 'ftp://ftp.ensembl.org/pub/release-99/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz' host-temp.fa.gz
            ;;
        cli)
            fetch 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/337/935/GCF_000337935.1_Cliv_1.0/GCF_000337935.1_Cliv_1.0_genomic.fna.gz' host-temp.fa.gz
            ;;
        csa)
            fetch 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/409/795/GCF_000409795.2_Chlorocebus_sabeus_1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz' host-temp.fa.gz
            ;;
        gga)
            fetch 'ftp://ftp.ensembl.org/pub/release-99/fasta/gallus_gallus/dna/Gallus_gallus.GRCg6a.dna.toplevel.fa.gz' host-temp.fa.gz
            ;;
        eco)
            fetch 'ftp://ftp.ensemblgenomes.org/pub/release-45/bacteria//fasta/bacteria_90_collection/escherichia_coli_k_12/dna/Escherichia_coli_k_12.ASM80076v1.dna.toplevel.fa.gz' host-temp.fa.gz
            ;;
        sc2)
            # This endpoint serves uncompressed FASTA, so compress it here to
            # produce host-temp.fa.gz like the other branches.
            fetch 'https://www.ebi.ac.uk/ena/browser/api/fasta/MN908947.3?download=true' host-temp.fa
            gzip host-temp.fa
            ;;
        t2t)
            fetch 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz' host-temp.fa.gz
            ;;
        *)
            # Abort: nothing was downloaded, so the re-compression step below
            # would have no input to work on.
            echo "Unknown host ($host)." >&2
            exit 1
            ;;
    esac

    # Re-compress as bgzip (block gzip) using the CPUs allotted to the task.
    zcat < host-temp.fa.gz | bgzip -@ ${task.cpus} -c > ${host}.fa.gz
    """

    stub:
    """
    touch ${host}.fa.gz
    """
}
/*
 * Pass a FASTA file through `seqkit seq` and write the result to checked.fa.gz.
 *
 * Input:  fasta - path to the FASTA file to process.
 * Output: checked.fa.gz - the records as re-emitted by seqkit.
 */
process check_own {
    label 'seqkit'

    input:
    path fasta

    output:
    path 'checked.fa.gz'

    script:
    """
    seqkit seq --out-file checked.fa.gz ${fasta}
    """

    stub:
    """
    touch checked.fa.gz
    """
}
/*
 * Merge all contamination/host FASTA files into one bgzip-compressed database
 * and build its FASTA index.
 *
 * Input:  fastas - one or more FASTA files to combine.
 * Output: fa  - db.fa.gz, the combined sequences (bgzip-compressed).
 *         fai - db.fa.fai, the samtools faidx index for db.fa.gz.
 *
 * Unless params.no_intermediate is set, both files are published to
 * "${params.output}/intermediate" as host.fa.gz and host.fa.fai.
 */
process concat_contamination {
    label 'seqkit'

    publishDir (
        path: "${params.output}/intermediate",
        mode: params.publish_dir_mode,
        pattern: "db.fa.gz",
        enabled: !params.no_intermediate,
        saveAs: { "host.fa.gz" }
    )
    publishDir (
        path: "${params.output}/intermediate",
        mode: params.publish_dir_mode,
        pattern: "db.fa.fai",
        enabled: !params.no_intermediate,
        saveAs: { "host.fa.fai" }
    )

    input:
    path fastas

    output:
    path 'db.fa.gz', emit: fa
    path 'db.fa.fai', emit: fai

    script:
    """
    # Without pipefail only bgzip's exit status counts, so a seqkit failure
    # would leave a truncated or empty db.fa.gz that still looks like success.
    set -o pipefail

    # Combine input files, rename duplicate sequences (by id) if found, and compress
    seqkit seq ${fastas} | seqkit rename | bgzip -@ ${task.cpus} -c > db.fa.gz

    # Index the compressed database, writing the index to db.fa.fai.
    samtools faidx db.fa.gz --fai-idx db.fa.fai
    """

    stub:
    """
    touch db.fa.gz db.fa.fai
    """
}