-
Notifications
You must be signed in to change notification settings - Fork 0
/
final.py
209 lines (193 loc) · 7.48 KB
/
final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import multiprocessing
from Comb_fastq import combine
from utils import *
import sys
import gzip
import os
def cut(step, length_bin, link, i):
    """Return the i-th sliding-window fragment of *link*.

    Windows start every *step* characters and are *length_bin* characters
    long; the last window may be clipped by the end of the string.  A
    clipped fragment is only kept while it is still longer than
    ``length_bin - step`` (i.e. it covers ground the previous window did
    not already cover).

    Returns (True, fragment) when a usable fragment exists at offset
    ``i * step``, otherwise (False, '').
    """
    lo = i * step
    hi = min(lo + length_bin, len(link))
    # Reject empty windows and clipped windows that add no new coverage.
    usable = lo < hi and hi - lo > length_bin - step
    if not usable:
        return False, ''
    return True, link[lo:hi]
def writefiles(UnmappedReads, step, length_bin, max_length, outputname):
    """Append sliding-window fragments of every buffered read to
    per-offset fastq part files.

    For each window index (up to *max_length*), one fastq record is
    written per read that still yields a usable fragment at that offset
    (see cut()).  Part files are opened in append mode because the
    caller flushes its read buffer in batches.  Iteration stops at the
    first window index that produces no records, since later windows
    cannot produce any either.

    Returns the list of part-file names touched by this call.
    """
    part_names = []
    for idx in range(max_length):
        records = []
        for name, pair in UnmappedReads.items():
            ok, frag = cut(step, length_bin, pair[0], idx)
            if not ok:
                continue
            # Quality string is cut identically, so it always matches.
            _, qual = cut(step, length_bin, pair[1], idx)
            records.append(f"{name}\n{frag}\n+\n{qual}\n")
        if not records:
            break
        fname = f"{outputname}.part{idx + 1}.fastq"
        part_names.append(fname)
        with open(fname, 'a') as out:
            out.writelines(records)
    print(part_names)
    return part_names
def do_process(l):
    """Process one config line: align the fastq file(s) with bsmap, then
    rescue unmapped mates by re-aligning sliding-window fragments of their
    reads and merging everything into <outputname>_combine.bam.

    l: one whitespace-separated line from config.txt -- either
       "reads.fastq" (single-end) or "reads_1.fastq reads_2.fastq"
       (paired-end).  Inputs may be gzipped.

    Side effects: runs bsmap/samtools through Pshell (from utils) and
    writes several intermediate files next to the inputs.  Exits the
    process on a malformed config line.
    """
    #print(l+'in')
    temp = l.strip().split()
    length = len(temp)
    # Accept exactly one (single-end) or two (paired-end) fastq paths.
    if length<=0 or length>2:
        print("Parameter error in "+l)
        sys.exit()
    # NOTE(review): reference genome path is hard-coded; swap with the
    # commented line below for human data.
    refpath='/data/dsun/ref/mouseigenome/mm9.fa'
    #refpath = '/data/dsun/ref/humanigenome/hg19.fa'
    # Derive the output prefix by stripping .gz / .fq / .fastq extensions
    # (lower-cased copy used only to measure how much to strip).
    tempname=temp[0].lower()
    if tempname.endswith(".gz"):
        tempname = tempname[:-3]
    if tempname.endswith(".fq"):
        tempname = tempname[:-3]
    if tempname.endswith(".fastq"):
        tempname = tempname[:-6]
    # Keep the original casing, truncated to the stripped length.
    outputname = temp[0][:len(tempname)]
    #print(outputname)
    phred=33  # quality-score offset passed to bsmap -z
    # First-pass alignment (paired-end when two files were given).
    if length==2 :
        commend='bsmap -a '+temp[0]+' -b '+temp[1]+' -z '+str(phred)+' -d '+refpath+' -o '+outputname+'.bam -n 1 -r 0'
    else:
        commend='bsmap -a '+temp[0]+' -z '+str(phred)+' -d '+refpath+' -o '+outputname+'.bam -n 1 -r 0'
    # Pshell presumably wraps a shell invocation -- defined in utils.
    First_try = Pshell(commend)
    First_try.process()
    # Coordinate-sort the first-pass bam, then rename it back in place.
    command='samtools sort -@ 4 '+outputname+'.bam'+' -o '+outputname+'.sorted.bam'
    First_try.change(command)
    First_try.process()
    command='mv '+outputname+'.sorted.bam '+outputname+'.bam'
    First_try.change(command)
    First_try.process()
    # NOTE(review): the sorted.bam was just renamed away, so this rm targets
    # a file that no longer exists (dead command; harmless if rm -f-like).
    command='rm '+outputname+'.sorted.bam'
    First_try.change(command)
    First_try.process()
    #Test1 done
    inputfileinfo=l.strip().split()
    # Dump the alignments to SAM so mapped read names can be collected.
    commend = 'samtools view '+outputname+'.bam > '+outputname+'.sam'
    BamFileReader = Pshell(commend)
    BamFileReader.process()
    with open(outputname+".sam") as sam:
        #second column in sam file: 64, mate 1; 128, mate 2;
        samlines = sam.readlines()
    # set_sam maps read name -> which mate(s) appeared in the alignment:
    # 0 = unpaired, 1 = mate 1, 2 = mate 2, 3 = name seen more than once.
    set_sam = {}
    for line in samlines:
        temp = line.strip().split()
        m1 = (int(temp[1]) & 64)    # SAM FLAG bit 64: first in pair
        m2 = (int(temp[1]) & 128)   # SAM FLAG bit 128: second in pair
        # print(temp[1],m1,m2)
        if m1>0: mate = 1
        elif m2>0: mate = 2
        else: mate = 0
        if temp[0] in set_sam: set_sam[temp[0]]=3
        else: set_sam[temp[0]]=mate
        # print(mate)
    # for k in set_sam:
    #     print(k)
    #     break
    del samlines
    # Scan the original fastq file(s) for reads whose mate mapped but
    # which did not map themselves; buffer them for fragmentation.
    UnmappedReads = {}
    o=0   # 1-based index of the fastq file currently being read
    step = 5          # window start spacing used by cut()/writefiles()
    length_bin = 30#30    # fragment length
    max_length = 24#50    # maximum number of windows per read
    Part_Fastq_Filename=[]
    for filename in inputfileinfo:
        o+=1
        gzmark=False
        if filename.endswith('.gz'):
            f = gzip.open(filename)
            gzmark=True
        else:
            f = open(filename)
        if f:
            while 1:
                # Read one 4-line fastq record (decoding when gzipped).
                if gzmark:
                    line1 = f.readline().decode()
                else:
                    line1 = f.readline()
                if not line1:
                    break
                if gzmark:
                    line2 = f.readline().decode().strip()
                    line3 = f.readline().decode()
                    line4 = f.readline().decode().strip()
                else:
                    line2 = f.readline().strip()
                    line3 = f.readline()
                    line4 = f.readline().strip()
                line1 = line1.strip().split()
                line1[0] = line1[0]
                # print(line1[0][1:])
                # line1[0][1:] strips the leading '@' of the fastq header.
                if (line1[0][1:] in set_sam):
                    # Which mate this record is: default to file order,
                    # override with the '1'/'2' tag in the second header
                    # field when present.
                    string_mark = o
                    # NOTE(review): assumes the header has a second field
                    # whose first char is the mate number -- raises
                    # IndexError on single-field headers; verify against
                    # the sequencer's read-name format.
                    if line1[1][0]>='1' and line1[1][0]<='2':
                        string_mark = int(line1[1][0])
                    # Skip names that mapped unpaired or where both mates
                    # were seen in the alignment ...
                    if set_sam[line1[0][1:]]==0 or set_sam[line1[0][1:]]==3 : continue
                    # ... and skip the mate that itself mapped; keep only
                    # the unmapped partner of a mapped mate.
                    if set_sam[line1[0][1:]]==string_mark : continue
                    temp = line1[0]
                    if length>1: temp+='_'+line1[1][0]
                    #Maybe the mate search method is buggy. Cuz there are different structures of reads name generated by different sequencing machine.
                    #fastqlines[i] = line1[0]+'_'+line1[1][0]+' '+line1[1]
                    UnmappedReads[temp]=[line2,line4]
                    # Flush in batches to bound memory; writefiles opens
                    # the part files in append mode.
                    if len(UnmappedReads)>10000000:
                        pfn = writefiles(UnmappedReads,step,length_bin,max_length,outputname)
                        UnmappedReads={}
                        if len(pfn)>len(Part_Fastq_Filename):
                            Part_Fastq_Filename=pfn
            #We've got a dictionary named UnmappedReads = {readsname:[line1,line2,line3,line4]}
            #Change cut funtion into cut(setp,length_bin,string,fileorder), return Available(T/F), reads_fraction
            # NOTE(review): UnmappedReads is not cleared after this
            # per-file flush, so with two input files the first file's
            # leftovers may be appended again after the second file --
            # confirm against the intended output.
            if len(UnmappedReads)>0:
                pfn=writefiles(UnmappedReads,step,length_bin,max_length,outputname)
                if len(pfn)>len(Part_Fastq_Filename):
                    Part_Fastq_Filename=pfn
            print('finish')
            f.close()
    del UnmappedReads
    #We've got the splited fastq file, filename is stored in Part_Fastq_Filename
    # p = multiprocessing.Pool(processes=7)
    # Re-align every fragment part file.
    for i in range(len(Part_Fastq_Filename)):
        commend = 'bsmap -a '+Part_Fastq_Filename[i]+' -z '+str(phred)+' -d '+refpath+' -o '+Part_Fastq_Filename[i]+'.bam -n 1 -r 0 -R'
        Batch_try = Pshell(commend)
        Batch_try.process()
    #run bsmap and get bam files named as Part_Fastq_Filename[i].bam
    #import combine to generate the finalfastq
    # Stitch the mapped fragments back into whole reads (Comb_fastq.combine).
    combine(outputname,Part_Fastq_Filename,step,length_bin)
    # Align the reconstructed reads.
    commend = 'bsmap -a '+outputname+'_finalfastq.fastq -d '+refpath+' -z '+str(phred)+' -o '+outputname+'_split.bam -n 1 -r 0'
    Bam = Pshell(commend)
    Bam.process()
    splitfilename = outputname+'_split.bam'
    header = outputname+'.header'
    # Keep only rescued alignments whose sequence (SAM field $10) is longer
    # than split_length: save the header, filter the body with awk, rebuild.
    command='samtools view -H '+splitfilename+' > '+header
    # NOTE(review): 'filter' shadows the builtin filter() for the rest of
    # this function.
    filter = Pshell(command)
    filter.process()
    split_length=40
    command='samtools view '+splitfilename+"| awk '{if (length($10)>"+str(split_length)+") print}' > "+splitfilename+'.sam'
    filter.change(command)
    filter.process()
    command='cat '+header+' '+splitfilename+'.sam | samtools view -bS - > '+splitfilename+'.bam'
    filter.change(command)
    filter.process()
    command='samtools sort -@ 4 '+splitfilename+'.bam'+' -o '+splitfilename+'.sorted.bam'
    filter.change(command)
    filter.process()
    command='mv '+splitfilename+'.sorted.bam '+splitfilename
    filter.change(command)
    filter.process()
    command='rm '+splitfilename+'.bam '+splitfilename+'.sam'
    filter.change(command)
    filter.process()
    # Merge first-pass and rescued alignments into the final bam.
    m=Pshell('samtools merge '+outputname+'_combine.bam '+outputname+'.bam '+outputname+'_split.bam')
    m.process()
    print("Merge done!\nCreated final bam file called "+outputname+'_combine.bam')
if __name__=="__main__":
    # One config line per sample: "reads1.fastq [reads2.fastq]".
    with open("config.txt") as f:
        lines = f.readlines()
    # multiprocessing is already imported at the top of the file; the
    # previous duplicate import here was redundant.
    pool = multiprocessing.Pool(processes=2)
    # Keep the AsyncResult handles: apply_async swallows worker
    # exceptions unless .get() is called on each result.
    results = [pool.apply_async(do_process, (l,)) for l in lines]
    #do_process(l) #pass file name to do_process
    pool.close()
    pool.join()
    for r in results:
        r.get()  # re-raise any exception that occurred in a worker