forked from novoalab/EpiNano
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Epinano_Variants.py
153 lines (138 loc) · 6.03 KB
/
Epinano_Variants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python
import sys
import re
import pysam
import numpy as np
from collections import defaultdict
from collections import Counter
import multiprocessing as mp # import Process
import argparse
import os
import glob
import shutil
import fileinput
import time
desc = """convert bam to per reference site variants frequencies.
In contrast to epinano_var1.2 and previous versions, it does not rely on sam2tsv any more.
It is also less memory demanding!
"""
epilog = '''Author: [email protected]
@BCN, 10/03/2021
'''
def _rm (f):
'''
why can't it be defined withing main()
https://stackoverflow.com/questions/50972934/deleting-files-with-multiprocessing-in-python
'''
os.remove(f)
def mkdir (folder):
pool = mp.Pool(4)
if os.path.exists (folder):
if os.path.isdir (folder):
print (folder,'exists; deleting it')
tmp_files = glob.glob ('.csv')
pool.map (_rm, tmp_files)
shutil.rmtree(folder)
os.mkdir (folder)
else:
print (folder, 'exists, but does not loook like a folder, please rename it to avoid being overwritten!', file=sys.stderr)
sys.exit()
else:
os.mkdir (folder)
def decimal_format (f):
return "{0:.{1}f}".format(f,5)
def has_reads_mapped (bam) :
return int (pysam.flagstat(bam).split('\n')[4][0]) > 0
def has_reads_mapped_to_ref (bam, refseqid):
bamfh = pysam.AlignmentFile(bam,'rb')
cnt = bamfh.count (contig=refseqid, start=0, end=bamfh.get_reference_length(refseqid)-1)
return (cnt>1) #no reads mapped
def split_bam (bam):
fwdbam = re.sub(r'bam$','fwd.bam',bam)
revbam = re.sub(r'bam$','rev.bam',bam)
pysam.view ("-F","16","-h","-b", "-o", fwdbam, bam, catch_stdout=False)
pysam.view ("-f","16","-h","-b", "-o", revbam, bam, catch_stdout=False)
pysam.index (fwdbam)
pysam.index (revbam)
return fwdbam ,revbam
def bam_to_var (bamfn, fafn, refname, start, end, strand, outdir):
bam = pysam.AlignmentFile (bamfn)
fafh = pysam.FastaFile (fafn)
if refname in fafh.references:
refseq = fafh.fetch (refname)
outfn = "{}/{}".format (outdir, re.sub (r'bam$',os.path.basename(refname)+'.per.site.csv', bamfn))
out = open (outfn, 'w')
for pileupcolumn in bam.pileup (refname, start, end, flag_filter = 3844, min_mapping_quality=0, min_base_quality=0):
ins = defaultdict(int)
pileupcolumn.set_min_base_quality (0)
cov = pileupcolumn.get_num_aligned() if pileupcolumn.get_num_aligned() == pileupcolumn.nsegments else None
mis, mat, dele = 0, 0 ,0
refpos = pileupcolumn.reference_pos
refbase = refseq[refpos].upper() #if refpos is not None else 'None'
qq = pileupcolumn.get_query_qualities()
mean_q, median_q, std_q = [decimal_format(x) for x in [np.mean(qq), np.median(qq), np.std(qq)]]
print (refname, refpos, strand, refbase, cov, mean_q, median_q, std_q, end = ",", sep=",", file=out)
base_counts = Counter (pileupcolumn.get_query_sequences(mark_matches=False, add_indels = False))
for base, cnt in base_counts.items():
if len (base) == 1 and base in 'ACGTacgt':
if base != refbase:
mis += cnt #wired '' is in 'acgt'
elif base == refbase:
mat += cnt
base_counts = Counter (pileupcolumn.get_query_sequences(mark_matches=True, add_indels = True))
for base, cnt in base_counts.items ():
if '+' in base:
ins[refpos] = ins.get (refpos,0) + cnt
if '*' == base:
dele = cnt
print (decimal_format(mat/cov), decimal_format(mis/cov), decimal_format(ins[refpos]/cov), decimal_format(dele/cov), sep=",", file=out)
if refpos - 1 in ins: del ins[refpos-1]
return outfn
def multi_processes_bam2var(fafn, ref_names, bam, strand, ncpus, outdir):
pool = mp.Pool (processes=ncpus)
fafh = pysam.FastaFile(fafn)
conditions = []
for i, j in enumerate(ref_names):
if has_reads_mapped_to_ref(bam, j) and j in fafh.references:
conditions.append ([bam, fafn, j,0,len(fafh.fetch(j)), strand, outdir])
files = pool.starmap (bam_to_var,conditions)
finaloutfn = re.sub(r'bam$','',os.path.basename(bam)) +'per.site.csv'
finalout = open (finaloutfn,'w')
#finalout2 = open (finaloutfn2, 'w')
print ("#Ref,pos,strand,base,cov,q_mean,q_median,q_std,mat,mis,ins,del",file=finalout)
try:
for l in fileinput.input(files):
print (l.rstrip(), file=finalout)
except:
numberlines = len (list (fileinput.input(files)))
print (l, bam,'has', numberlines, 'lines', file=sys.stderr)
def main ():
usage = "%(prog)s -v"
parser = argparse.ArgumentParser(description=desc, epilog=epilog,
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('-b', '--bam', required=True, type=str, help='bam file with *bai index')
parser.add_argument('-r', '--reference', required=True, type=str, help='reference file with samtools faidx index')
parser.add_argument('-c', '--cpus', type=int, default=4, help='number of CPUs to use [%(default)s]')
args = parser.parse_args()
fafn = args.reference
bam = args.bam
ncpus = args.cpus
start = time.time()
bamfh = pysam.AlignmentFile (bam, 'rb')
ref_names = bamfh.references
fwdbam, revbam = split_bam(bam)
outdir = bam.replace('.bam','') + '.tmp'
mkdir (outdir)
strands = ['+', '-']
for i, bam in enumerate ([fwdbam, revbam]):
if has_reads_mapped (bam):
multi_processes_bam2var(fafn, ref_names, bam, strands[i], ncpus, outdir)
end = time.time()
print ('analysis took {} seconds'.format(end - start ))
files = glob.glob("{}/*.csv".format (outdir))
pool = mp.Pool (ncpus)
pool.map (_rm, files)
shutil.rmtree (outdir)
pool.map (_rm, [fwdbam, revbam, fwdbam+'.bai', revbam+'.bai'])
if __name__ == "__main__":
main ()