import glob
import sys

configfile: "config.yaml"

rule all:
    input:
        matrix = f"{config['output_dir']}/presence_absence_matrix.txt",
        summary_file = f"{config['output_dir']}/pangenome_summary.tsv"

# BAKTA and GFF annotation modes are mutually exclusive
if config["BAKTA"]["exec"] and config["GFF"]["exec"]:
    sys.exit("cannot have both BAKTA and GFF set to true in the config file")
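
# For reference, a minimal config.yaml sketch covering the keys this Snakefile
# reads; the key names come from the rules below, the values are placeholders:
#
#   output_dir: "results"
#   genome_fasta: "genomes"        # directory of <sample>.fasta files
#   gff_dir: "gffs"                # only used when GFF mode is on
#   training_file: "prodigal.trn"  # Prodigal training file passed to bakta
#   bakta_db: "bakta_db"           # path to the Bakta database
#   core: 0.99                     # presumably the core-gene fraction cutoff for the summary
#   BAKTA:
#     exec: True
#   GFF:
#     exec: False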

# if wanting to annotate MAGs with BAKTA
if config["BAKTA"]["exec"]:

    rule bakta:
        input:
            genome = f"{config['genome_fasta']}/{{sample}}.fasta"
        output:
            ann_dir = directory(f"{config['output_dir']}/annotated/{{sample}}_ann")
        conda:
            "bakta"
        threads: 1
        resources:
            mem_mb=lambda wildcards, attempt: attempt * 15000
        params:
            trn = config['training_file'],
            DB = config['bakta_db']
        log:
            f"{config['output_dir']}/logs/bakta/{{sample}}.log"
        shell:
            """
            bakta {input.genome} --db {params.DB} --prefix {wildcards.sample} --prodigal-tf {params.trn} \
                --translation-table 11 --threads {threads} --output {output.ann_dir} >{log} 2>&1
            """

    def get_samples(genome_dir):
        list_of_samples = glob.glob(genome_dir + "/*.fasta")
        new_list_of_samples = []
        for sam in list_of_samples:
            sam = sam.split("/")[-1]
            sam = sam.replace(".fasta", "")
            new_list_of_samples.append(sam)
        return new_list_of_samples
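
    # e.g. with genome_fasta = "genomes" containing genomes/binA.fasta and
    # genomes/binB.fasta, get_samples() returns ["binA", "binB"] (hypothetical
    # file names, shown only to illustrate the {sample} wildcard values used below)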

    rule fix_ffn_file:
        input:
            annotations = expand(f"{config['output_dir']}/annotated/{{sample}}_ann", sample=get_samples(config['genome_fasta']))
        output:
            fixed_annotations = directory(f"{config['output_dir']}/all_ffn")
        conda: # just needs biopython
            "Snakemake"
        script: "scripts/fix_ffn_files.py"

# if you have existing GFF to use:
elif config["GFF"]["exec"]:

    rule transform_gff:
        input:
            gff_dir = f"{config['gff_dir']}"
        output:
            ffn_files = directory(f"{config['output_dir']}/all_ffn")
        threads: 1
        resources:
            mem_mb=lambda wildcards, attempt: attempt * 15000
        conda:
            # requires BCBio
            "panphlan"
        shell:
            """
            python scripts/panphlan_pangenome_generation.py --i_gff {input}
            mv ffn_from_gff {output.ffn_files}
            """

rule concat:
    input:
        ffn_files = f"{config['output_dir']}/all_ffn"
    output:
        f"{config['output_dir']}/all_samples.concat.ffn"
    shell:
        "cat {input.ffn_files}/* > {output}"

rule mmseqs2:
    input:
        f"{config['output_dir']}/all_samples.concat.ffn"
    output:
        all_seqs = f"{config['output_dir']}/mmseqs/mmseqs_all_seqs.fasta",
        clusters = f"{config['output_dir']}/mmseqs/mmseqs_cluster.tsv",
        rep_seq = f"{config['output_dir']}/mmseqs/mmseqs_rep_seq.fasta"
    threads: 16
    resources:
        mem_mb=lambda wildcards, attempt: 16000 * attempt
    log:
        f"{config['output_dir']}/logs/mmseqs2.log"
    params:
        seq_id = 0.9,   # minimum sequence identity for clustering
        cov_mode = 0,   # coverage computed over both query and target
        c = 0.8,        # minimum coverage fraction
        output_prefix = f"{config['output_dir']}/mmseqs/mmseqs",
        tmp_dir = f"{config['output_dir']}/mmseqs/tmp"
    conda:
        "mmseq2"
    shell:
        "mmseqs easy-cluster {input} {params.output_prefix} {params.tmp_dir} --min-seq-id {params.seq_id} \
        --cov-mode {params.cov_mode} -c {params.c} --threads {threads} >{log} 2>&1"

rule rep_seq_list:
    input:
        f"{config['output_dir']}/mmseqs/mmseqs_rep_seq.fasta"
    output:
        f"{config['output_dir']}/mmseqs/rep_sequences.list"
    shell:
        "grep '>' {input} | cut -f2 -d'>' | cut -f1 -d' ' > {output}"

rule sort_mmseqs2_clusters:
    input:
        clusters = rules.mmseqs2.output.clusters
    output:
        sorted_clusters = f"{config['output_dir']}/mmseqs/mmseqs_cluster.sorted.tsv"
    threads: 1
    resources:
        mem_mb=2000
    shell: "sort {input.clusters} > {output.sorted_clusters}"

rule build_matrix:
    input:
        rep_list = f"{config['output_dir']}/mmseqs/rep_sequences.list",
        clusters = f"{config['output_dir']}/mmseqs/mmseqs_cluster.sorted.tsv"
    output:
        matrix = f"{config['output_dir']}/presence_absence_matrix.txt"
    threads: 1
    resources:
        mem_mb=5000
    conda:
        # env has biopython and pandas
        "poppunk"
    script: "scripts/make_presence_absence_matrix.py"

rule summarise_pangenome:
    input:
        rep_seq = f"{config['output_dir']}/mmseqs/mmseqs_rep_seq.fasta",
        matrix = f"{config['output_dir']}/presence_absence_matrix.txt"
    output:
        gene_descriptions = f"{config['output_dir']}/mmseqs/rep_seq_descriptions.txt",
        summary_file = f"{config['output_dir']}/pangenome_summary.tsv"
    threads: 1
    resources:
        mem_mb=5000
    params:
        core = config['core']
    conda:
        # env has biopython and pandas
        "poppunk"
    shell:
        """
        grep ">" {input.rep_seq} > {output.gene_descriptions}
        python scripts/summarise_pangenome.py {params.core} {output.gene_descriptions} {input.matrix} {output.summary_file}
        """