forked from Kennedy-Lab-UW/Duplex-Sequencing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtag_to_header.py
executable file
·291 lines (229 loc) · 11.3 KB
/
tag_to_header.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
#!/usr/bin/env python
#
# Tag To Header
# Version 2.0.1
# By Joe Hiatt, Scott Kennedy(1), Brendan Kohrn and Mike Schmitt(1)
# (1) Department of Pathology, University of Washington School of Medicine, Seattle, WA 98195
# March 24, 2014
#
# Isolate duplex tags, move them from within the sequenced read to the header region, and remove the spacer region.
#
# usage: tag_to_header.py [-h] [--infile1 INFILE1] [--infile2 INFILE2]
# [--outfile1 OUTFILE1] [--outfile2 OUTFILE2]
# [--taglen BLENGTH] [--spacerlen SLENGTH]
# [--read_out ROUT] [--filt_spacer ADAPTERSEQ] --tagstats
#
# Optional arguments:
# -h, --help show this help message and exit
# --infile1 INFILE1 First input raw fastq file.
# --infile2 INFILE2 Second input raw fastq file.
# --outfile1 OUTFILE1 Output file for first fastq reads.
# --outfile2 OUTFILE2 Output file for second fastq reads.
# --taglen BLENGTH Length of the duplex tag sequence. [12]
# --spacerlen SLENGTH Length of the spacer sequences used. [5]
# --read_out ROUT How often you want to be told what the program is
# doing. [1000000]
# --filt_spacer ADAPTERSEQ Optional: Spacer sequence for filtering on the
# presence of the spacer. This could be thrown off by
# low quality scores.
# --tagstats Optional: Output tagstats file and make distribution plot of tag family sizes.
# Requires matplotlib to be installed
import sys
from argparse import ArgumentParser
from collections import defaultdict
def fastq_general_iterator(read1_fastq, read2_fastq):
read1_readline = read1_fastq.readline
read2_readline = read2_fastq.readline
while True:
read1_line = read1_readline()
read2_line = read2_readline()
if not read1_line and read2_line:
return
if read1_line[0] == '@' and read2_line[0] == '@':
break
if isinstance(read1_line[0], int) or isinstance(read2_line[0], int):
raise ValueError("FASTQ files may contain binary information or are compressed")
while read1_line and read2_line:
if read1_line[0] != '@' or read2_line[0] != '@':
print read1_line, read2_line
raise ValueError("Records in FASTQ files should start with a '@' character. Files may be malformed or out of synch.")
title_read1_line = read1_line[1:].rstrip()
title_read2_line = read2_line[1:].rstrip()
read1_seq_string = read1_readline().rstrip()
read2_seq_string = read2_readline().rstrip()
while True:
read1_line = read1_readline()
read2_line = read2_readline()
if not read1_line and read2_line:
raise ValueError("End of file without quality information. Files may be malformed or out of synch")
if read1_line[0] == '+' and read2_line[0] == '+':
break
read1_seq_string += read1_line.rstrip()
read2_seq_string += read2_line.rstrip()
read1_quality_string = read1_readline().rstrip()
read2_quality_string = read2_readline().rstrip()
while True:
read1_line = read1_readline()
read2_line = read2_readline()
if not read1_line or read2_line:
break # end of file
if read1_line[0] == '@' and read2_line[0] == '@' and read1_line.isalpha() is not True \
and read2_line.isalpha() is not True:
break
read1_quality_string += read1_line.rstrip()
read2_quality_string += read2_line.rstrip()
yield (title_read1_line, title_read2_line, read1_seq_string, read2_seq_string, read1_quality_string, read2_quality_string)
raise StopIteration
def tag_extract_fxn(read_seq, blen):
# This is the function that extracts the UID tags from both the
# forward and reverse read. Assigns read1 the sequence from some
# position to the end, then read2 from some position to the end,
# then assigns tag1 from the 5'-end to length of the UID tag for
# read1 and then read 2.
return read_seq[0][:blen], read_seq[1][:blen]
def hdr_rename_fxn(read_title, read1_tag, read2_tag):
# This function renames the header with the formatting of
# *header coordinates,etc*, *tag from read1*, *tag from read2*,
# *read designation from original header (for paired reads)*
illumina = read_title.split(" ")[0].split(":")
if len(illumina) == 7:
#Illumina CASAVA >=1.8
#e.g. @EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACGcd
readnum = read_title.split(" ")[1].split(":")[0]
return "%s|%s%s/%s" % (read_title.split(" ")[0], read1_tag, read2_tag, readnum)
elif len(illumina) == 5:
#Illumina CASAVA >=1.4?
#e.g. @HWUSI-EAS100R:6:73:941:1973#ATCGAT/1
read_title = read_title.replace(' ', '_')
return "%s|%s%s/%s" % (read_title.split('/')[0], read1_tag, read2_tag, read_title.split('/')[1])
else :
raise ValueError("Unknown read name format: %s" % read_title)
def tag_stats(barcode_counts, outfile):
family_size_dict = defaultdict(lambda:0)
tagstat_file = open(outfile + '.tagstats', 'w')
total_tags = 0
for value in barcode_counts:
family_size_dict[value] += 1
for family_size in family_size_dict.keys():
family_size_dict[family_size] *= int(family_size)
total_tags += int(family_size_dict[family_size])
for family_size in sorted(family_size_dict.keys()):
tagstat_file.write("%s\t%s\n" % (family_size, float(family_size_dict[family_size])/float(total_tags)))
tagstat_file.close()
return family_size_dict, total_tags
def main():
parser = ArgumentParser()
parser.add_argument('--infile1', dest='infile1', help='Path to FASTQ file for Read 1.', required=True)
parser.add_argument('--infile2', dest='infile2', help='Path to FASTQ file for Read 2.', required=True)
parser.add_argument('--outprefix', dest='outfile', help='Prefix for output files. Will prepend onto file name of ".fq.smi"', required=True)
parser.add_argument('--taglen', dest='taglen', type=int, default=12,
help='Length in bases of the duplex tag sequence.[12')
parser.add_argument('--spacerlen', dest='spclen', type=int, default=5,
help='Length in bases of the spacer sequence between duplex tag and the start of target DNA. [5]')
parser.add_argument('--readout', dest='readout', type=int, default=1000000,
help='How many reads are processed before progress is reported. [1000000')
parser.add_argument('--filtspacer', dest='spacer_seq', type=str, default=None,
help='Optional: Filter out sequences lacking the inputed spacer sequence. \
Not recommended due to significant base calling issues with the invariant spacer sequence')
parser.add_argument('--tagstats', dest='tagstats', action="store_true",
help='Optional: Output tagstats file and make distribution plot of tag family sizes. \
Requires matplotlib to be installed.')
parser.add_argument('--reduce', dest='reduce', action="store_true", help='Optional: Only output reads that will make \
a final DCS read. Will only work when the --tagstats option is invoked.')
o = parser.parse_args()
if o.reduce and not o.tagstats:
raise ValueError("--reduce option must be invoked with the --tagstats option.")
read1_fastq = open(o.infile1, 'r')
read2_fastq = open(o.infile2, 'r')
read1_output = open(o.outfile + '.seq1.smi.fq', 'w')
read2_output = open(o.outfile + '.seq2.smi.fq', 'w')
readctr = 0
nospacer = 0
goodreads = 0
badtag = 0
oldBad = 0
barcode_dict = defaultdict(lambda:0)
for read1_title, read2_title, read1_seq, read2_seq, read1_qual, read2_qual in fastq_general_iterator(read1_fastq, read2_fastq):
readctr += 1
if o.spacer_seq != None and (read1_seq[o.taglen:o.taglen + o.spclen] != o.spacer_seq or read2_seq[o.taglen:o.taglen + o.spclen] != o.spacer_seq):
nospacer += 1
else:
tag1, tag2 = tag_extract_fxn((read1_seq, read2_seq), o.taglen)
if (tag1.isalpha() and tag1.count('N') == 0) and (tag2.isalpha() and tag2.count('N') == 0):
renamed_read1_title = hdr_rename_fxn(read1_title, tag1, tag2)
renamed_read2_title = hdr_rename_fxn(read2_title, tag1, tag2)
read1_output.write('@%s\n%s\n+\n%s\n' % (renamed_read1_title, read1_seq[o.taglen+o.spclen:], read1_qual[o.taglen + o.spclen:]))
read2_output.write('@%s\n%s\n+\n%s\n' % (renamed_read2_title, read2_seq[o.taglen+o.spclen:], read2_qual[o.taglen + o.spclen:]))
goodreads += 1
if o.tagstats:
barcode_dict[tag1 + tag2] += 1
else:
badtag += 1
if readctr % o.readout is 0:
sys.stderr.write("Total sequences processed: %s\n" % readctr)
sys.stderr.write("Sequences with passing tags: %s\n" % goodreads)
sys.stderr.write("Missing spacers: %s\n" % nospacer)
sys.stderr.write("Bad tags: %s\n" % badtag)
if badtag == oldBad + o.readout:
sys.stderr.write("Warning! Potential file error between lines %s and %s." % ((readctr - o.readout) * 4, readctr * 4))
oldBad = badtag
read1_fastq.close()
read2_fastq.close()
read1_output.close()
read2_output.close()
sys.stderr.write("Total sequences processed: %s\n" % readctr)
sys.stderr.write("Sequences with passing tags: %s\n" % goodreads)
sys.stderr.write("Missing spacers: %s\n" % nospacer)
sys.stderr.write("Bad tags: %s\n" % badtag)
if o.tagstats:
read_data_file = open(o.outfile + '_data.txt', 'w')
sscs_count = 0
dcs_count = 0
dcs_tags_list = []
family_size_dict, total_tags = tag_stats(barcode_dict.values(), o.outfile)
for tag in barcode_dict.keys():
if barcode_dict[tag] >= 3:
sscs_count += 1
if tag[o.taglen:] + tag[:o.taglen] in barcode_dict and barcode_dict[tag[o.taglen:] + tag[:o.taglen]] >= 3:
dcs_count += 1
if o.reduce and tag not in dcs_tags_list:
dcs_tags_list.append(tag)
dcs_tags_list.append(tag[o.taglen:] + tag[:o.taglen])
read_data_file.write('# Passing Reads\t# SSCS Reads\t# DCS Reads\tSSCS:DCS\n%d\t%d\t%d\t%f\n'
% (goodreads, sscs_count, dcs_count, float(sscs_count)/float(dcs_count)))
read_data_file.close()
try:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
x_value = []
y_value = []
for family_size in sorted(family_size_dict.keys()):
x_value.append(family_size)
y_value.append(float(family_size_dict[family_size]) / float(total_tags))
plt.bar(x_value, y_value)
plt.xlabel('Family Size')
plt.ylabel('Proportion of Total Reads')
plt.savefig(o.outfile + '.png', bbox_inches='tight')
plt.bar(x_value, y_value)
plt.xlabel('Family Size')
plt.ylabel('Proportion of Total Reads')
plt.xlim([0,40])
plt.savefig(o.outfile + '.zoom.png', bbox_inches='tight')
except ImportError:
sys.stderr.write('matplotlib not present. Only tagstats file will be generated.')
if o.reduce:
read1_fastq = open(o.outfile + '.seq1.smi.fq', 'r')
read2_fastq = open(o.outfile + '.seq2.smi.fq', 'r')
read1_output = open(o.outfile + '.seq1.reduced.fq', 'w')
read2_output = open(o.outfile + '.seq2.reduced.fq', 'w')
for read1_title, read2_title, read1_seq, read2_seq, read1_qual, read2_qual in fastq_general_iterator(read1_fastq, read2_fastq):
if read1_title.split('|')[1].split('/')[0] in dcs_tags_list and read2_title.split('|')[1].split('/')[0] in dcs_tags_list:
read1_output.write('@%s\n%s\n+\n%s\n' % (read1_title, read1_seq, read1_qual))
read2_output.write('@%s\n%s\n+\n%s\n' % (read2_title, read2_seq, read2_qual))
read1_fastq.close()
read2_fastq.close()
read1_output.close()
read2_output.close()
if __name__ == "__main__":
main()