-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmotif_finder.py
259 lines (214 loc) · 5.25 KB
/
motif_finder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
import gzip
import math
import itertools
import random
import sys
import re
import pwm
import mm
import tools
#################
# Motif Finding #
#################
def motifembedder(pwm, p, size, choice='ACGT', strand='='):
seq = ''
locs = []
while len(seq) < size:
if len(seq) > 0 and len(seq) < size - pwm.length and random.random() < p:
kmer = pwm.generate()
loc = len(seq)
if strand == '=': s = random.choice('+-')
elif strand == '+': s = '+'
else: s = '-'
if s == '+':
seq += kmer
locs.append(loc)
else:
seq += tools.anti(kmer)
locs.append(-loc)
else:
seq += random.choice(choice)
return seq, locs
"""
A scoring function
requires
seqs: sequences in case it needs lengths
locations: where on each sequence the motif is found
exp: expected probability of motif (given pwm and bkgd model)
returns
score
"""
def zoops(seqs, locs, exp):
n = 0
x = 0
for seq, loc in zip(seqs, locs):
if len(loc) > 0: n += 1
x += exp * len(seq)
if n == 0: return 0 # really?
return math.log2(n / x)
# anr without distance
def dn(loc):
# scoring equation
score = len(loc)
return score
# linear distribution for anr with distance
def dl(loc, dist):
# scoring equation
if (loc >= dist): score = 0
else: score = (dist - loc) / (dist)
return score
# quadratic distribution for anr with distance
def dq(loc, dist):
# scoring equation
if (loc >= dist): score = 0
else: score = (1 / dist)*(loc + dist)*((dist - loc) / dist)
return score
# uniform distribution for anr with distance
def du(loc, dist):
# scoring equation
if (loc >= dist): score = 0
else: score = (1 / dist)
return score
def anr_general(seqs, locs, exp, distr, d):
n = 0
x = 0
# distr is probability distribution for scoring
# default is 'none' meaning anr without distance
if distr == 'none': pd = dn
elif distr == 'linear': pd = dl
elif distr == 'quadratic': pd = dq
elif distr == 'uniform': pd = du
for seq, loc in zip(seqs, locs):
# d is the distance after which scores = 0
if d == 0: dist = len(seq)
else: dist = d
# scoring with anr no distance
if distr == "none":
n += pd(loc)
# scoring anr with distance
else:
for l in loc:
n += pd(l, dist)
x += exp * dist
if (n == 0): return 0
return math.log2(n/x)
"""
A motif-finder
requires
seqs: a list of sequences of arbitrary length
bkgd: an n-th order background model
sfunc: scoring function
k: some length
options:
stranded
filtering heuristics (entropy, expectation)
threshold score?
returns
motifs with some minimum score?
best n motifs?
Given a background model, what is the prob of
kmer: ACG
regex: A[S]G
dpwm: AcG
pwm: complete pwm
gpwm: gappable pwm
"""
def kmer_finder(seqs, bkgd, func, k, n=10, distr="none", d=0):
# get all of the kmers present (rather than all possible)
kmers = {}
for seq in seqs:
for i in range(len(seq) -k +1):
kmers[ seq[i:i+k] ] = True
# calculate scores of all kmers, and keep the good ones
keep = []
for kmer in kmers:
locs = []
for seq in seqs:
pos = []
off = 0
while True:
idx = seq.find(kmer, off)
if idx == -1: break
pos.append(idx)
off = idx + len(kmer)
locs.append(pos)
score = func(seqs, locs, bkgd.seq_prob(kmer), distr, d)
keep.append( (kmer, score) )
# prevent the list from growing too much
if len(keep) > 1000:
keep = sorted(keep, key=lambda t: t[1], reverse=True)
keep = keep[:n]
# final sort-n-trim
keep = sorted(keep, key=lambda t: t[1], reverse=True)
return keep[:n]
XNT = {
'A': 0.25,
'C': 0.25,
'G': 0.25,
'T': 0.25,
'R': 0.50,
'Y': 0.50,
'M': 0.50,
'K': 0.50,
'W': 0.50,
'S': 0.50,
'B': 0.75,
'D': 0.75,
'H': 0.75,
'V': 0.75,
'N': 1.00,
}
def regex_finder(seqs, bkgd, func, k, n=10, x=0.35, alph='ACGTRYMKWSN',
distr="none", d=0):
keep = []
for t in itertools.product(alph, repeat=k):
s = ''.join(t)
p = 1.0
for letter in s: p *= XNT[letter]
if p > x ** len(s): continue
regex = ''
for letter in t: regex += tools.NT2RE[letter]
locs = []
for seq in seqs:
pos = []
for m in re.finditer(regex, seq):
pos.append(m.span()[0])
locs.append(pos)
score = func(seqs, locs, bkgd.re_prob(regex), distr, d)
keep.append( (regex, score) )
# prevent the list from growing too much
if len(keep) > 1000:
keep = sorted(keep, key=lambda t: t[1], reverse=True)
keep = keep[:n]
# final sort-n-trim
keep = sorted(keep, key=lambda t: t[1], reverse=True)
return keep[:n]
def dpwm_finder(seqs, bkgd, func, k, t, n=10, alph='ACGTRYMKWSN', distr="none", d=0):
# t = threshold
keep = []
for seq in seqs:
for i in range(len(seq)-k+1):
kmer = seq[i:i+k]
kmer_pwm = pwm.string2pwm(kmer)
locs = []
for seq1 in seqs:
pos = []
for m in re.finditer(kmer, seq1):
pos.append(m.span()[0])
locs.append(pos)
# Need to get score of kmer based on dsPWM
score = func(seqs, locs, bkgd.pwm_prob(kmer), distr, d)
if score > t and kmer not in keep: keep.append(kmer)
return keep
def motiffinder(seqs, k):
freqs = {}
total = 0
for seq in seqs:
seq = seq.upper()
for i in range(len(seq)-k+1):
kmer = seq[i:i+k]
if kmer not in freqs:
freqs[kmer] = 0
freqs[kmer] += 1
for kmer in freqs:
print(kmer, freqs[kmer])