-
Notifications
You must be signed in to change notification settings - Fork 5
/
StrainScan.py
275 lines (251 loc) · 9.14 KB
/
StrainScan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
import re
import os
import sys
import argparse
sys.path.append('library')
from collections import defaultdict
from library import identify,identify_low_mem,identify_low_depth,Vote_Strain_L2_Lasso_new_sp
import pickle
__author__="Liao Herui, Ji Yongxin - PhD of City University of HongKong"
usage="StrainScan - A kmer-based strain-level identification tool."
def initial_para(para,value):
if not para:
para=value
return para
def build_dir(in_dir):
if not os.path.exists(in_dir):
os.makedirs(in_dir)
def get_overlap_kmr(db_dir,apcls,pcls):
overlap_kmr=defaultdict(lambda:{})
carr=[]
for c in apcls:
carr.append(c)
all_k={}
for c in carr:
#if c not in pcls:continue
all_k[c]=pickle.load(open(db_dir+'/Kmer_Sets_L1/Kmer_Sets/'+c+'/all_kid.pkl','rb'))
overlap_kmr={}
for c in apcls:
if c not in pcls:continue
cud=all_k[c]
overlap_kmr[c]={}
for c2 in apcls:
if c==c2:continue
pov1=cud.keys() & all_k[c2].keys()
pov=dict.fromkeys(pov1,'')
overlap_kmr[c]=dict(overlap_kmr[c],**pov)
return overlap_kmr
def load_db_cls(db_dir,cls_dict,odir,rgenome):
dr={} # pre -> dir
for r in os.listdir(rgenome):
pre=re.split('\.',r)[0]
bre=re.split('\.',r)[-1]
if bre not in ['fna','fa','fasta']:continue
dr[pre]=rgenome+'/'+r
f=open(db_dir+'/Cluster_Result/hclsMap_95_recls.txt','r')
dc={} # cluster_id -> strains
while True:
line=f.readline().strip()
if not line:break
ele=line.split('\t')
st=re.split(',',ele[-1])
dc[int(ele[0])]=st
out_dir=odir+'/ref_plasmids'
if not os.path.exists(out_dir):
os.makedirs(out_dir)
o2=open(odir+'/possible_plasmids.txt','w+')
for c in cls_dict:
if not cls_dict[c]['strain']==0:continue
c=int(c)
for s in dc[c]:
count = 0
#print(s,dr[s])
#exit()
f2=open(dr[s],'r')
dtem={}
while True:
line=f2.readline().strip()
if not line:break
if re.search('>',line):
#dtem[]=''
name=line
dtem[name]=''
else:
dtem[name]+=line
o=open(out_dir+'/'+s+'.fasta','w+')
for r in dtem:
if len(dtem[r])<100000:
o.write(r+'\n'+dtem[r]+'\n')
count+=1
o2.write(s+'\t'+r+'\n')
o.close()
if count==0:
os.system('rm '+out_dir+'/'+s+'.fasta')
return out_dir
def generate_prob_report(prob_dict,out_dir,db_dir):
f=open(db_dir+'/hclsMap_95_recls.txt','r')
d={}
while True:
line=f.readline().strip()
if not line:break
ele=line.split('\t')
d[int(ele[0])]=ele[-1]
op=open(out_dir+'/strain_prob.txt','w+')
op.write('Cluster_ID\tProbability\tNumber_of_strains\tStrains_in_the_cluster\n')
for p in prob_dict:
st=re.split(',',d[p[0]])
op.write('C'+str(p[0])+'\t'+str(p[1])+'\t'+str(len(st))+'\t'+d[p[0]]+'\n')
op.close()
def main():
pwd=os.getcwd()
# Get para
parser=argparse.ArgumentParser(prog='StrainScan.py',description=usage)
parser.add_argument('-i','--input_fastq',dest='input_fq',type=str,required=True,help="The dir of input fastq data --- Required")
parser.add_argument('-j','--input_fastq_2',dest='input_fq2',type=str,help="The dir of input fastq data (for pair-end data).")
parser.add_argument('-d','--database_dir',dest='db_dir',type=str,required=True,help="The dir of your database --- Required")
parser.add_argument('-o','--output_dir',dest='out_dir',type=str,help='Output dir (default: current dir/StrainVote_Result)')
parser.add_argument('-k','--kmer_size',dest='ksize',type=str,help='The size of kmer, should be odd number. (default: k=31)')
parser.add_argument('-l','--low_dep',dest='ldep',type=str,help='This parameter can be set to \"1\" if the sequencing depth of input data is very low (e.g. < 10x). For super low depth ( < 1x ), you can use \"-l 2\" (default: -l 0)')
parser.add_argument('-b', '--strain_prob', dest='sprob', type=str,help='If this parameter is set to 1, then the algorithm will output the probability of detecting a strain (or cluster) in low-depth (e.g. <1x) samples. (default: -b 0)')
parser.add_argument('-p', '--plasmid_mode', dest='pmode', type=str, help='If this parameter is set to 1, the intra-cluster searching process will search possible plasmids using short contigs (<100000 bp) in strain genomes, which are likely to be plasmids. If this parameter is set to 2, the intra-cluster searching process will search possible strains using given reference genomes by \"-r\". Reference genome sequences (-r) are required if this mode is used. (default: -p 0)')
parser.add_argument('-r', '--ref_genome', dest='rgenome', type=str,help='The dir of reference genomes of identified cluster or all strains. If plasmid_mode is used, then this parameter is required.')
parser.add_argument('-e', '--extraRegion_mode', dest='emode', type=str,help='If this parameter is set to 1, the intra-cluster searching process will search possible strains and return strains with extra regions (could be different genes, SNVs or SVs to the possible strains) covered. (default: -e 0)')
parser.add_argument('-s', '--minimum_snv_num', dest='msn', type=str,help='The minimum number of SNV at Layer-2 identification. (default: k=40)')
args=parser.parse_args()
fq_dir=args.input_fq
fq2=args.input_fq2
db_dir=args.db_dir
out_dir=args.out_dir
ksize=args.ksize
ksize=initial_para(ksize,31)
ldep=args.ldep
sprob=args.sprob
pmode=args.pmode
emode=args.emode
rgenome=args.rgenome
msn=args.msn
if not fq2:
fq2=''
if not ldep:
ldep=0
else:
ldep=int(ldep)
if not sprob:
sprob=0
else:
sprob=int(sprob)
if not pmode:
pmode=0
else:
pmode=int(pmode)
if not rgenome:
rgenome=''
if pmode==1 and rgenome=='':
print('Warning: You have to provide the dir of reference genome sequences if you want to use plasmid mode!')
exit()
if not emode:
emode=0
else:
emode=int(emode)
if not msn:
msn=40
else:
msn=int(msn)
out_dir=initial_para(out_dir,pwd+'/StrainScan_Result')
if not re.search('/',out_dir):
out_dir=pwd+'/'+out_dir
build_dir(out_dir)
# Step1 -> Vote possible clusters using Krakenuniq
#pcls,uniq_strain,apcls=Vote_Cls_KK.vote_cls(fq_dir,db_dir,out_dir)
#overlap_kmr=get_overlap_kmr(db_dir,apcls,pcls)
# Step2 -> Vote Cls
in_fq=(fq_dir,fq2)
l2=0
#cls_dict=identify_cluster_u.identify_cluster(in_fq,'/home/yongxinji2/worktemp/Tree_database')
if sprob==1:
prob_dict=identify_low_depth.identify_ranks(in_fq,db_dir+'/Tree_database')
generate_prob_report(prob_dict,out_dir,db_dir+'/Tree_database')
if os.path.exists(db_dir+'/Memory_DB'):
mdb=1
else:
mdb=0
if ldep==0:
if mdb==1:
cls_dict = identify_low_mem.identify_cluster(in_fq, db_dir + '/Tree_database', [0.1, 0.4, 1])
else:
cls_dict=identify.identify_cluster(in_fq,db_dir+'/Tree_database',[0.1,0.4,1])
if len(cls_dict)==0:
if mdb==1:
cls_dict = identify_low_mem.identify_cluster(in_fq, db_dir + '/Tree_database', [0.05,0.05,1])
else:
cls_dict=identify.identify_cluster(in_fq,db_dir+'/Tree_database',[0.05,0.05,1])
l2=1
if len(cls_dict)==0:
print('Warning: No clusters can be detected!')
exit()
elif ldep==1:
if mdb == 1:
cls_dict = identify_low_mem.identify_cluster(in_fq, db_dir + '/Tree_database', [0.01,0.05,1])
else:
cls_dict=identify.identify_cluster(in_fq,db_dir+'/Tree_database',[0.01,0.05,1])
l2=1
elif ldep==2:
if mdb==1:
cls_dict = identify_low_mem.identify_cluster(in_fq, db_dir + '/Tree_database', [0.005,0.01,1])
else:
cls_dict=identify.identify_cluster(in_fq,db_dir+'/Tree_database',[0.005,0.01,1])
l2=1
print(cls_dict)
if len(cls_dict)==0:
print('Warning: No clusters can be detected!')
exit()
# Step3 -> Vote Strains inside Cls
if pmode==1 or pmode==2:
if pmode==1:
plas_ref=load_db_cls(db_dir,dict(cls_dict),out_dir,rgenome)
plas_ref=os.path.abspath(plas_ref)
else:
plas_ref=os.path.abspath(rgenome)
#print(plas_ref)
#exit()
#for c in dict(cls_dict):
d_out_dir=os.path.abspath(out_dir)
os.system('python StrainScan_build.py -i '+plas_ref+' -o '+d_out_dir+'/DB_plasmid -n 500' )
#pdb=out_dir+'/DB_plasmid'
#exit()
tdb=d_out_dir+'/DB_plasmid/Tree_database'
if ldep == 0:
if mdb == 1:
cls_dict = identify_low_mem.identify_cluster(in_fq, tdb, [0.1, 0.4, 1])
else:
cls_dict = identify.identify_cluster(in_fq, tdb, [0.1, 0.4, 1])
if len(cls_dict) == 0:
if mdb==1:
cls_dict = identify_low_mem.identify_cluster(in_fq, tdb, [0.05, 0.05, 1])
else:
cls_dict = identify.identify_cluster(in_fq, tdb, [0.05, 0.05, 1])
l2 = 1
if len(cls_dict) == 0:
print('Warning: No clusters can be detected!')
exit()
elif ldep == 1:
if mdb==1:
cls_dict = identify_low_mem.identify_cluster(in_fq, tdb, [0.01, 0.05, 1])
else:
cls_dict = identify.identify_cluster(in_fq, tdb, [0.01, 0.05, 1])
l2 = 1
elif ldep == 2:
if mdb==1:
cls_dict = identify_low_mem.identify_cluster(in_fq, tdb, [0.005, 0.01, 1])
else:
cls_dict = identify.identify_cluster(in_fq, tdb, [0.005, 0.01, 1])
l2 = 1
kdb=d_out_dir+'/DB_plasmid'
Vote_Strain_L2_Lasso_new_sp.vote_strain_L2_batch(fq_dir,fq2, kdb, out_dir, ksize, dict(cls_dict), l2, msn,pmode,emode)
#exit()
elif emode==1:
Vote_Strain_L2_Lasso_new_sp.vote_strain_L2_batch(fq_dir,fq2, db_dir, out_dir, ksize, dict(cls_dict), l2, msn,pmode,emode)
else:
Vote_Strain_L2_Lasso_new_sp.vote_strain_L2_batch(fq_dir,fq2, db_dir,out_dir,ksize,dict(cls_dict),l2,msn,pmode,emode)
if __name__=='__main__':
sys.exit(main())