# BinnedFisher.py
import sys
import time
import numpy as np
from scipy import linalg
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from Fisher import Fisher
__all__ = ['BinnedFisher']
#####################################################################################################################
# NOTE TO SELF:
#   np.inner(A, B) sums over the last axes of both arrays, i.e. result[i, k] = sum_j A[i, j] * B[k, j],
#   so if you want the matrix product A*B, you should do np.inner(A, B.T).
#   Also, np.inner is faster than np.dot.
#####################################################################################################################
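# A tiny illustration of the note above, kept as a comment so nothing runs at import time
# (the shapes are arbitrary, chosen only for the example):
#   A = np.arange(6.0).reshape(2, 3)
#   B = np.arange(12.0).reshape(3, 4)
#   np.allclose(np.inner(A, B.T), np.dot(A, B))   # -> True: inner against B.T reproduces the matrix product A*B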
class BinnedFisher(BaseEstimator, ClassifierMixin, TransformerMixin):
def __init__(self, norm_covariance = True, n_components=None, priors=None, bins = [-float('inf'), float('inf')] ):
self.nbins = len(bins)-1
self.bins = np.sort(bins)
self.bin_trained = [False for i in range(self.nbins)]
self.fish = [ Fisher(norm_covariance, n_components, priors) for i in range(self.nbins) ]
    def fit(self, X, y, tol=[1.0e-4], store_covariance=False, do_smooth_reg=False, cov_class=None, cov_power=1, entries_per_ll_bin=10):
        X = np.asarray(X)
        y = np.asarray(y)
        if len(tol) == 1:
            tol = [tol[0] for i in range(self.nbins)]
        elif len(tol) != self.nbins:
            print("tol must have length 1 or nbins. exiting")
            sys.exit(2)
        self.tol = tol
        self.do_smooth_reg = do_smooth_reg
        self.cov_class = cov_class
        self.cov_power = cov_power
        self.entries_per_ll_bin = entries_per_ll_bin
        self.comp = []
        self.ll_sig = []
        self.ll_bkg = []
        self.ll_bin_edges = []
        for i in range(self.nbins):
            print("Starting fit for bin %d" % i)
            ts = time.time()
            low, high = self.bins[i], self.bins[i+1]
            the_entries = (X[:, 0] >= low) * (X[:, 0] < high)
            if not np.any(the_entries):
                print("Warning: no entries in bin=[%s, %s], not training!" % (low, high))
                # keep the per-bin lists index-aligned with the bin number
                self.comp.append(None)
                self.ll_sig.append(None)
                self.ll_bkg.append(None)
                self.ll_bin_edges.append(None)
                continue
            Xi = X[the_entries, 1:]
            yi = y[the_entries]
            self.fish[i].fit(Xi, yi, tol=self.tol[i],
                             do_smooth_reg=self.do_smooth_reg,
                             cov_class=self.cov_class,
                             cov_power=self.cov_power, store_covariance=True)
            self.comp.append(self.fish[i].w_[0])  # should be normed in fisher.py with linalg.norm(self.fish[i].w_[0])
            # now build the per-bin log-likelihood histograms of the transformed outputs
            sigs = np.sort(self._transform_bin(Xi[yi == 1], bin_number=i, override=True).flatten())
            bkgs = np.sort(self._transform_bin(Xi[yi == 0], bin_number=i, override=True).flatten())
            # bin edges taken at every entries_per_ll_bin-th sorted background value,
            # then stretched so that both signal and background are fully covered
            edges = np.array(bkgs[0::self.entries_per_ll_bin], dtype=float)
            edges[0] = min(bkgs[0], sigs[0])
            edges[-1] = max(bkgs[-1], sigs[-1])
            self.ll_bin_edges.append(edges)
            sig_hist, _ = np.histogram(sigs, bins=edges, density=True)
            bkg_hist, _ = np.histogram(bkgs, bins=edges, density=True)
            # set any zero entries to 1e-4 so that log(sig_hist) is always well defined
            sig_hist[sig_hist == 0] = 1e-4
            self.ll_sig.append(np.log(sig_hist))
            self.ll_bkg.append(np.log(bkg_hist))
            self.bin_trained[i] = True
            print('Fitting bin %d took %d seconds' % (i, time.time() - ts))
        return self
    def update_tol(self, tol):
        if len(tol) == 1:
            tol = [tol[0] for i in range(self.nbins)]
        elif len(tol) != self.nbins:
            print("tol must have length 1 or nbins. exiting")
            sys.exit(2)
        for i in range(self.nbins):
            self.fish[i].update_tol(tol[i])
        return self
def transform(self, X, return_ll=False):
t, l = self._transform(X)
if return_ll:
return l
else:
return t
    def _transform(self, X):
        X = np.asarray(X)
        out = np.zeros(X.shape[0])   # default transformed value is 0
        llout = np.ones(X.shape[0])  # default ll value is 1 (i.e. equal prob)
        for i in range(self.nbins):
            if not self.bin_trained[i]:
                print("bin %d not trained! Can't transform before running fit!" % i)
                sys.exit(2)
            low, high = self.bins[i], self.bins[i+1]
            the_entries = (X[:, 0] >= low) * (X[:, 0] < high)
            Xi = X[the_entries, 1:]
            out[the_entries] = self._transform_bin(Xi, i)  # self.fish[i].transform(Xi)
            llout[the_entries] = self._eval_ll_bin(out[the_entries].flatten(), i)
        return out, llout
    def _transform_bin(self, Xi, bin_number, override=False):
        '''
        only works after the binned variable has been stripped off
        '''
        if bin_number < 0 or bin_number > (self.nbins - 1):
            print("bin number must be between 0 and %d" % (self.nbins - 1))
            sys.exit(2)
        if not override and not self.bin_trained[bin_number]:
            print("bin %d not trained! Can't transform before running fit!" % bin_number)
            sys.exit(2)
        Xi = np.asarray(Xi)
        out = self.fish[bin_number].transform(Xi)
        return out
    def _eval_ll_bin(self, Ti, bin_number):
        '''
        only works on transformed data
        '''
        if bin_number < 0 or bin_number > (self.nbins - 1):
            print("bin number must be between 0 and %d" % (self.nbins - 1))
            sys.exit(2)
        if not self.bin_trained[bin_number]:
            print("bin %d not trained! Can't transform before running fit!" % bin_number)
            sys.exit(2)
        Ti = np.asarray(Ti)
        # anything not falling in any log-likelihood bin keeps the default value of 1
        llout = np.ones(Ti.shape[0])
        for i in range(len(self.ll_bin_edges[bin_number]) - 1):
            the_entries = (Ti >= self.ll_bin_edges[bin_number][i]) * (Ti < self.ll_bin_edges[bin_number][i+1])
            llout[the_entries] = self.ll_sig[bin_number][i] / self.ll_bkg[bin_number][i]
        return llout
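# ---------------------------------------------------------------------------------------------------
# Minimal usage sketch (an addition, not part of the original module): it only runs when the file is
# executed directly, and it assumes that the external Fisher class provides the fit/transform/w_
# interface used above. Column 0 of X is the binning variable; the remaining columns are the features.
# ---------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    n = 2000
    bin_var = rng.uniform(0.0, 2.0, size=n)                           # variable used only for binning
    labels = rng.randint(0, 2, size=n)                                # 1 = signal, 0 = background
    feats = rng.normal(loc=labels[:, None], scale=1.0, size=(n, 3))   # toy features, shifted for signal
    X_demo = np.column_stack([bin_var, feats])
    bf = BinnedFisher(bins=[0.0, 1.0, 2.0])                           # two bins in the binning variable
    bf.fit(X_demo, labels)
    scores = bf.transform(X_demo)                                     # Fisher discriminant per event
    ll = bf.transform(X_demo, return_ll=True)                         # binned likelihood output per event
    print("score range: [%.3f, %.3f]" % (scores.min(), scores.max()))
    print("ll range:    [%.3f, %.3f]" % (ll.min(), ll.max()))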