'''
The Samtools package.
TO DO: much of this stuff could all be eliminated by using pysam instead, since
pysam (in its current versions) is meant to be a complete Python
implementation of htslib/samtools.
http://pysam.readthedocs.org/en/latest/usage.html#using-samtools-commands-within-python
Known bug with pysam 0.8.1: nosetests does not work unless you use --nocapture
(python -m unittest works); the issue relates to pysam redirecting stdout.
Travis CI still has issues with pysam and stdout even with --nocapture.
pysam 0.9.1 stops redirecting stdout, which makes things much easier,
but it is a pain to pip install.
'''
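# Rough sketch (untested here) of the pysam-based replacement alluded to above, assuming
# pysam >= 0.9 where samtools subcommands no longer redirect stdout:
#   import pysam
#   pysam.sort('-o', 'sorted.bam', 'input.bam')   # equivalent of 'samtools sort'
#   pysam.index('sorted.bam')                     # equivalent of 'samtools index'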
import logging
import shutil
import os
import re
import os.path
import subprocess
import tempfile
import contextlib
from collections import OrderedDict
from decimal import Decimal
import pysam
import tools
import util.file
import util.misc
TOOL_NAME = 'samtools'
log = logging.getLogger(__name__)
class SamtoolsTool(tools.Tool):
def __init__(self, install_methods=None):
if install_methods is None:
install_methods = [tools.PrexistingUnixCommand(shutil.which(TOOL_NAME), require_executability=True)]
super(SamtoolsTool, self).__init__(install_methods=install_methods)
def _get_tool_version(self):
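        # 'samtools --version' prints e.g. "samtools 1.9" on its first line; the second
        # whitespace-delimited token of that line is taken as the version string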
self.tool_version = subprocess.check_output([self.install_and_get_path(), '--version']).decode('UTF-8').split('\n')[0].split()[1]
def execute(self, command, args, stdout=None, stderr=None, background=False): # pylint: disable=W0221
tool_cmd = [self.install_and_get_path(), command] + args
log.debug(' '.join(tool_cmd))
if stdout:
stdout = open(stdout, 'w')
if stderr:
stderr = open(stderr, 'w')
env = os.environ.copy()
env.pop('JAVA_HOME', None)
if background:
subprocess.Popen(tool_cmd, stdout=stdout, stderr=stderr, env=env)
else:
subprocess.check_call(tool_cmd, stdout=stdout, stderr=stderr, env=env)
if stdout:
stdout.close()
if stderr:
stderr.close()
def view(self, args, inFile, outFile, regions=None, threads=None, background=False):
regions = regions or []
args = args or []
# -@ seems to result in segfaults in some cases
# so this is commented out for now...
self.execute('view', args + ['-o', outFile, inFile] + regions, background=background)
#opts = args + ['-o', outFile, inFile] + regions
#pysam.view(*opts)
def bam2fq(self, inBam, outFq1, outFq2=None):
if outFq2 is None:
self.execute('bam2fq', ['-n', inBam], stdout=outFq1)
else:
self.execute('bam2fq', ['-1', outFq1, '-2', outFq2, inBam])
def bam2fq_pipe(self, inBam):
tool_cmd = [self.install_and_get_path(), 'bam2fq', '-n', inBam]
log.debug(' '.join(tool_cmd) + ' |')
p = subprocess.Popen(tool_cmd, stdout=subprocess.PIPE)
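        # note: the caller is expected to consume p.stdout and then call p.wait() or p.communicate()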
return p
def bam2fa(self, inBam, outFa1, outFa2=None, outFa0=None, append_mate_num=False):
if outFa2 is None:
args = ['-N' if append_mate_num else '-n']
else:
args = ['-1', outFa1, '-2', outFa2]
if outFa0:
args += ['-0', outFa0]
args += [inBam]
if outFa2 is None:
self.execute('fasta', args, stdout=outFa1)
else:
self.execute('fasta', args)
def bam2fa_pipe(self, inBam):
tool_cmd = [self.install_and_get_path(), 'fasta', '-n', inBam]
log.debug(' '.join(tool_cmd) + ' |')
p = subprocess.Popen(tool_cmd, stdout=subprocess.PIPE)
return p.stdout
@contextlib.contextmanager
def bam2fq_tmp(self, inBam):
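        # usage sketch (hypothetical caller; the temp files are removed when the block exits):
        #   with samtools.bam2fq_tmp('reads.bam') as (fq1, fq2):
        #       align(fq1, fq2)   # 'align' is a placeholder for downstream work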
with util.file.tempfnames(('.1.fq', '.2.fq')) as (reads1, reads2):
self.bam2fq(inBam, reads1, reads2)
yield reads1, reads2
@contextlib.contextmanager
def bam2fa_tmp(self, inBam):
with util.file.tempfnames(('.1.fa', '.2.fa')) as (reads1, reads2):
self.bam2fa(inBam, reads1, reads2)
yield reads1, reads2
def sort(self, inFile, outFile, args=None, threads=None):
# inFile can be .sam, .bam, .cram
# outFile can be .sam, .bam, .cram
args = args or []
if '-@' not in args:
args.extend(('-@', str(util.misc.sanitize_thread_count(threads))))
        # use the configured temp dir for sort's intermediate files, if one is set
        if '-T' not in args and tempfile.tempdir and os.path.isdir(tempfile.tempdir):
            args.extend(('-T', tempfile.tempdir))
args.extend(('-o', outFile, inFile))
self.execute('sort', args)
def merge(self, inFiles, outFile, options=None):
"Merge a list of inFiles to create outFile."
options = options or ['-f']
# We are using -f for now because mkstempfname actually makes an empty
# file, and merge fails with that as output target without the -f.
# When mkstempfname is fixed, we should remove the -f.
self.execute('merge', options + [outFile] + inFiles)
def index(self, inBam):
# inBam can be .bam or .cram
self.execute('index', [inBam])
def faidx(self, inFasta, overwrite=False):
''' Index reference genome for samtools '''
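        # the resulting .fai index is a five-column TSV: NAME, LENGTH, OFFSET, LINEBASES, LINEWIDTH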
outfname = inFasta + '.fai'
if os.path.isfile(outfname):
if overwrite:
os.unlink(outfname)
else:
return
#pysam.faidx(inFasta)
with util.file.fastas_with_sanitized_ids(inFasta, use_tmp=True) as sanitized_fastas:
sanitized_fasta = sanitized_fastas[0]
self.execute('faidx', [sanitized_fasta])
shutil.copyfile(sanitized_fasta + '.fai', outfname)
def depth(self, inBam, outFile, options=None):
""" Write a TSV file with coverage depth by position """
options = options or ["-aa", "-m", "1000000"]
self.execute('depth', options + [inBam], stdout=outFile)
def idxstats(self, inBam, statsFile):
self.execute('idxstats', [inBam], stdout=statsFile)
def reheader(self, inBam, headerFile, outBam):
self.execute('reheader', [headerFile, inBam], stdout=outBam)
def dumpHeader(self, inBam, outHeader):
opts = []
if inBam.endswith('.bam'):
opts = ['-H']
elif inBam.endswith('.sam'):
opts = ['-H', '-S']
#header = pysam.view(*opts)
self.view(opts, inBam, outHeader)
def removeDoublyMappedReads(self, inBam, outBam):
#opts = ['-b', '-f 2']
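        # flag arithmetic: -F 1028 excludes unmapped (0x4) and duplicate (0x400) reads,
        # -f 2 requires the proper-pair bit, and -@ 3 uses three additional threads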
        opts = ['-b', '-F', '1028', '-f', '2', '-@', '3']
self.view(opts, inBam, outBam)
def filter_to_proper_primary_mapped_reads(self, inBam, outBam, require_pairs_to_be_proper=True, reject_singletons=True, reject_duplicates=True):
        '''
        This function writes a bam file filtered to include only reads that are:
          - not flagged as duplicates (if reject_duplicates=True)
          - not secondary or supplementary (split/chimeric reads)
          - for paired-end reads:
              - marked as a proper pair (if require_pairs_to_be_proper=True), or at
                least not a pair with both mates unmapped (if require_pairs_to_be_proper=False),
                AND
              - not a member of a pair with an unmapped mate ("singleton"), if reject_singletons=True
          - for single-end reads: mapped
        '''
with pysam.AlignmentFile(inBam, 'rb', check_sq=False) as inb:
with pysam.AlignmentFile(outBam, 'wb', header=inb.header) as outf:
# process the lines individually and write them or not, depending on the flags
# For explanation of flags, see:
# https://broadinstitute.github.io/picard/explain-flags.html
# https://pysam.readthedocs.io/en/latest/api.html
# https://samtools.github.io/hts-specs/SAMv1.pdf
# https://github.com/pysam-developers/pysam/blob/31183d7fac52b529b304bdf61ff933818ae4a71f/samtools/stats.c#L72-L81
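                # relevant SAM flag bits: 0x1 paired, 0x2 proper pair, 0x4 unmapped,
                # 0x8 mate unmapped, 0x100 secondary, 0x400 duplicate, 0x800 supplementary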
for read in inb:
# check if a read is paired
                    is_single_end = not read.is_paired
# if a PCR/optical duplicate, do not write
if reject_duplicates and read.is_duplicate:
continue
# if a read is a secondary or supplementary mapping (split/chimeric), do not write
if read.is_secondary or read.is_supplementary:
continue
                    # do not write the read if it is paired-end and fails any of the checks below
                    if (read.is_paired and (
                            # reject anything not marked as proper pair (this bit is not guaranteed)
                            (require_pairs_to_be_proper and not read.is_proper_pair) or
                            # reject pairs where both mates are unmapped
                            (read.mate_is_unmapped and read.is_unmapped) or
                            # reject reads where only one mate is mapped (singletons)
                            (reject_singletons and (read.mate_is_unmapped or read.is_unmapped)))):
                        continue
if is_single_end and read.is_unmapped: # or if this is single-end and unmapped, reject
continue
# otherwise write the read to the output
outf.write(read)
def filterByCigarString(self, inBam, outBam,
regexToMatchForRemoval='^((?:[0-9]+[ID]){1}(?:[0-9]+[MNIDSHPX=])+)|((?:[0-9]+[MNIDSHPX=])+(?:[0-9]+[ID]){1})$',
invertResult=False):
'''
This function applies a regex to the cigar string for each read.
If the regex matches, the read is not written to the output unless
invertResult is True.
The default regex matches cigar strings with trailing or leading indels:
'^((?:[0-9]+[ID]){1}(?:[0-9]+[MNIDSHPX=])+)|((?:[0-9]+[MNIDSHPX=])+(?:[0-9]+[ID]){1})$'
'''
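        # e.g. '2I98M' (leading insertion) and '98M2D' (trailing deletion) match the default
        # regex and are dropped, while '100M' does not match and is written out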
regex = re.compile(regexToMatchForRemoval)
with pysam.AlignmentFile(inBam, 'rb', check_sq=False) as inb:
with pysam.AlignmentFile(outBam, 'wb', header=inb.header) as outf:
# process the lines individually and write them or not, depending on
# whether they match regexToMatchForRemoval
for read in inb:
if read.cigarstring:
# perform a regex match
matches = regex.search(read.cigarstring)
# if the regex was found (or not, if inverted)
if (not invertResult and matches) or (invertResult and not matches):
# continue to the next read (don't write out this one)
continue
else:
if invertResult:
continue
# otherwise write out the line
outf.write(read)
def downsample(self, inBam, outBam, probability):
if not probability:
raise Exception("Probability must be defined")
if float(probability) <= 0 or float(probability) > 1:
raise Exception("Probability must be in range (0,1]. This value was given: %s" % probability)
opts = ['-s', str(1) + '.' + str(probability).split('.')[1], '-@', '3'] # -s subsamples: seed.fraction
self.view(opts, inBam, outBam)
def downsample_to_approx_count(self, inBam, outBam, read_count):
total_read_count = self.count(inBam)
probability = Decimal(int(read_count)) / Decimal(total_read_count)
probability = 1 if probability > 1 else probability
if probability < 1:
self.downsample(inBam, outBam, probability)
else:
            log.info("Requested downsample count exceeds number of reads. Including all reads in output.")
shutil.copyfile(inBam, outBam)
def getHeader(self, inBam):
''' fetch BAM header as a list of tuples (already split on tabs) '''
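        # e.g. [['@HD', 'VN:1.5', 'SO:coordinate'], ['@SQ', 'SN:chr1', 'LN:248956422'], ...]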
tmpf = util.file.mkstempfname('.txt')
self.dumpHeader(inBam, tmpf)
with open(tmpf, 'rb') as inf:
header = list(line.decode("latin-1").rstrip('\n').split('\t') for line in inf)
os.unlink(tmpf)
return header
def getReadGroups(self, inBam):
''' fetch all read groups from the BAM header as an OrderedDict of
RG ID -> RG dict. The RG dict is a mapping of read group keyword
(like ID, DT, PU, LB, SM, CN, PL, etc) to value. ID is included
and not stripped out. ID is required for all read groups.
Resulting keys are in same order as @RG lines in bam file.
'''
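        # e.g. OrderedDict([('A', {'ID': 'A', 'SM': 'sample1', 'PL': 'ILLUMINA'}), ...])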
rgs = [
dict(x.split(':', 1) for x in row[1:]) for row in self.getHeader(inBam) if len(row) > 0 and row[0] == '@RG'
]
return OrderedDict((rg['ID'], rg) for rg in rgs)
def count(self, inBam, opts=None, regions=None):
opts = opts or []
regions = regions or []
cmd = [self.install_and_get_path(), 'view', '-c'] + opts + [inBam] + regions
return int(subprocess.check_output(cmd).strip())
#if inBam.endswith('.sam') and '-S' not in opts:
# opts = ['-S'] + opts
#cmd = ['-c'] + opts + [inBam] + regions
#return int(pysam.view(*cmd)[0].strip())
def mpileup(self, inBam, outPileup, opts=None):
opts = opts or []
self.execute('mpileup', opts + [inBam], stdout=outPileup, stderr='/dev/null') # Suppress info messages
def isEmpty(self, inBam):
if not os.path.isfile(inBam):
return True
else:
tmpf = util.file.mkstempfname('.txt')
self.dumpHeader(inBam, tmpf)
header_size = os.path.getsize(tmpf)
if (os.path.getsize(inBam) > (100 + 5 * header_size)):
# large BAM file: assume it is not empty
# a BAM file definitely has reads in it if its filesize is larger
# than just the header itself
return False
else:
# small BAM file: just count and see if it's non-zero
return (0 == self.count(inBam))