This repository has been archived by the owner on Sep 10, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 9
/
classify.py
268 lines (220 loc) · 11.3 KB
/
classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
import datetime
import matplotlib.pyplot as plt
import numpy as np
import multiprocessing
import pickle
from sklearn import (cross_validation, ensemble, metrics, svm, tree,
linear_model, neighbors, naive_bayes,
preprocessing)
import evaluation, database
def imputation(df):
    """Replace every missing value in the data with 0.

    This is a stand-in strategy until a better way of handling missing
    values is available.

    Args:
        df [pandas DataFrame]: input data that may contain NaNs

    Returns:
        [pandas DataFrame]: the result of ``df.fillna(0)``
    """
    filled = df.fillna(0)
    return filled
class Experiment:
    """A single cross-validated classification experiment.

    Ties together a model type, its hyperparameters and the world setting,
    trains and evaluates the model over k folds, pickles each fold's
    results and records metrics through ``database.ModelStorage``.
    """

    def __init__(self, model_timestamp, world, model_type,
                 hyperparameters, feature_scaling=True,
                 n_cores=multiprocessing.cpu_count(), k=10):
        """
        Args:
            model_timestamp [string]: identifier of this model run; used in
                                      the pickle filenames and passed to the
                                      database when saving results
            world [dict]: world type (open- or closed- world) under the
                          "type" key and the "observed_fraction" parameter
            model_type [string]: machine learning algorithm to be used
            hyperparameters [dict]: hyperparameter set to be used for the
                                    machine learning algorithm
            feature_scaling [bool]: if True, standardize the features of
                                    every fold before training
            n_cores [int]: passed as n_jobs to the classifiers that accept
                           it; the default is the CPU count evaluated once,
                           when the class is defined
            k [int]: number of k-folds
        """
        self.model_timestamp = model_timestamp
        self.hyperparameters = hyperparameters
        self.model_type = model_type
        self.world_type = world["type"]
        self.frac_obs = world["observed_fraction"]
        self.n_cores = n_cores
        self.k = k
        self.feature_scaling = feature_scaling
        self.db = database.ModelStorage()
        # Placeholder values; they travel with self.__dict__ when the full
        # model is saved in train_eval_all_folds.
        self.train_class_balance = 'DEFAULT'
        self.base_rate = 'DEFAULT'

    def train_single_fold(self, x_train, y_train):
        """Trains a model on one fold and returns it.

        Args:
            x_train [ndarray]: features in training set (no id column, no target)
            y_train [ndarray]: target variable in training set (no id column)

        Returns:
            trained_model: whatever ``fit()`` of the classifier returns
        """
        print("Training {} classifier with {}".format(self.model_type,
                                                      self.hyperparameters))
        modelobj = self._get_model_object(self.model_type,
                                          self.hyperparameters,
                                          self.n_cores)
        trained_model = modelobj.fit(x_train, y_train)
        return trained_model

    def score(self, x_test, trained_model):
        """Generates continuous risk scores for a testing set.

        Args:
            x_test [ndarray]: testing features
            trained_model [sklearn object]: trained classifier object

        Returns:
            score_positive_class [ndarray]: column 1 of the
                ``predict_proba`` output for every test row
        """
        result_y = trained_model.predict_proba(x_test)
        # NOTE(review): column 1 is treated as the positive class; this
        # presumably assumes binary labels ordered negative, positive --
        # confirm against the label encoding used by the caller.
        score_positive_class = result_y[:, 1]
        return score_positive_class

    def train_eval_all_folds(self, x_val, y_val):
        """Trains and evaluates models over all folds.

        For every fold the trained model, true labels and predicted scores
        are pickled to a file in the current working directory and the
        fold metrics are saved to the database. After the last fold the
        ROC curves of all folds are plotted and the metrics averaged over
        the folds are saved to the database.

        Args:
            x_val [ndarray]: feature matrix
            y_val [ndarray]: label vector

        Returns:
            None. Results are persisted as pickles and database records.

        Raises:
            NotImplementedError: if the world type is "open", which has no
                                 cross-validation scheme yet
            ValueError: if the world type is neither "closed" nor "open"
        """
        if self.world_type == "closed":
            # Why we use stratified k-fold here:
            # http://stats.stackexchange.com/questions/49540/understanding-stratified-cross-validation
            cv = cross_validation.StratifiedKFold(y_val, n_folds=self.k,
                                                  shuffle=True)
        elif self.world_type == "open":
            # Previously this branch fell through and crashed later with an
            # unbound `cv`; fail early with a meaningful error instead.
            raise NotImplementedError(
                "Open-world cross-validation is not implemented yet")
        else:
            raise ValueError(
                "Unsupported world type {}".format(self.world_type))

        fpr_arr, tpr_arr, metrics_all_folds = [], [], []

        for i, (train, test) in enumerate(cv):
            fold_timestamp = datetime.datetime.now().isoformat()
            y_train, y_test = y_val[train], y_val[test]

            if self.feature_scaling:
                # The scaler is fitted on the training part only and then
                # applied to both parts of the fold.
                scaler = preprocessing.StandardScaler().fit(x_val[train])
                x_train = scaler.transform(x_val[train])
                x_test = scaler.transform(x_val[test])
            else:
                x_train, x_test = x_val[train], x_val[test]

            trained_model = self.train_single_fold(x_train, y_train)
            pred_probs = self.score(x_test, trained_model)

            filename_kfold = '{}_{}_undefended_frontpage_{}_model_{}_fold_{}_world.pkl'.format(
                fold_timestamp, self.model_timestamp, self.model_type, i, self.world_type)
            fold_to_save = {'trained_object': trained_model,
                            'y_true': y_test, 'y_predicted': pred_probs}
            self.pickle_results(filename_kfold, fold_to_save)

            # Metrics computation
            # Compute ROC curve and area under the ROC curve
            eval_metrics = evaluation.get_metrics(y_test, pred_probs)
            metrics_all_folds.append(eval_metrics)
            fpr_arr.append(eval_metrics['fpr'])
            tpr_arr.append(eval_metrics['tpr'])

            # Save results of metrics in database
            self.db.save_fold_of_model(eval_metrics, self.model_timestamp, fold_timestamp)

        auc = evaluation.plot_allkfolds_ROC(self.model_timestamp, cv,
                                            fpr_arr, tpr_arr)

        print("Classifier {} trained! AUC: {}".format(self.model_timestamp,
                                                      auc))

        avg_metrics = evaluation.get_average_metrics(metrics_all_folds)
        # Save results of experiment (model evaluation averaged over all
        # folds) into the database
        self.db.save_full_model(avg_metrics, self.model_timestamp, self.__dict__)

    def pickle_results(self, pkl_file, to_save):
        """Pickles ``to_save`` into the file ``pkl_file``.

        Args:
            pkl_file [string]: path of the file to write (overwritten if
                               it exists)
            to_save [object]: any picklable object
        """
        with open(pkl_file, 'wb') as f:
            pickle.dump(to_save, f, protocol=pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def _random_forest(parameters, n_cores):
        """Builds the random forest shared by the RandomForest* model types."""
        return ensemble.RandomForestClassifier(
            n_estimators=parameters['n_estimators'],
            max_features=parameters['max_features'],
            criterion=parameters['criterion'],
            max_depth=parameters['max_depth'],
            min_samples_split=parameters['min_samples_split'],
            n_jobs=n_cores)

    def _get_model_object(self, model, parameters, n_cores):
        """This method takes the requested model type and
        hyperparameters and produces the relevant classifier object.

        Only the hyperparameters needed by the requested model type are
        looked up, so ``parameters`` may omit keys of other model types.

        Args:
            model [string]: name of the classifier to build
            parameters [dict]: hyperparameters of that classifier
            n_cores [int]: passed as n_jobs to classifiers that accept it

        Returns:
            object with the fit() and predict_proba() methods
            implemented on it

        Raises:
            ValueError: if ``model`` is not a supported classifier name
            KeyError: if ``parameters`` lacks a key the classifier needs
        """
        if model == "RandomForest":
            return self._random_forest(parameters, n_cores)

        elif model == "RandomForestBagging":
            return ensemble.BaggingClassifier(
                self._random_forest(parameters, n_cores),
                # Bagging parameters
                n_estimators=parameters['n_estimators_bag'],
                max_samples=parameters['max_samples'],
                max_features=parameters['max_features_bag'],
                bootstrap=parameters['bootstrap'],
                bootstrap_features=parameters['bootstrap_features'],
                n_jobs=n_cores
                )

        elif model == "RandomForestBoosting":
            return ensemble.AdaBoostClassifier(
                self._random_forest(parameters, n_cores),
                # Boosting parameters
                learning_rate=parameters['learning_rate'],
                algorithm=parameters['algorithm'],
                n_estimators=parameters['n_estimators_boost']
                )

        elif model == 'SVM':
            # probability=True is required for predict_proba()
            return svm.SVC(C=parameters['C_reg'],
                           kernel=parameters['kernel'],
                           probability=True)

        elif model == 'LogisticRegression':
            return linear_model.LogisticRegression(
                C=parameters['C_reg'],
                penalty=parameters['penalty'])

        elif model == 'AdaBoost':
            return ensemble.AdaBoostClassifier(
                learning_rate=parameters['learning_rate'],
                algorithm=parameters['algorithm'],
                n_estimators=parameters['n_estimators'])

        elif model == 'ExtraTrees':
            return ensemble.ExtraTreesClassifier(
                n_estimators=parameters['n_estimators'],
                max_features=parameters['max_features'],
                criterion=parameters['criterion'],
                max_depth=parameters['max_depth'],
                min_samples_split=parameters['min_samples_split'],
                n_jobs=n_cores)

        elif model == 'GradientBoostingClassifier':
            return ensemble.GradientBoostingClassifier(
                n_estimators=parameters['n_estimators'],
                learning_rate=parameters['learning_rate'],
                subsample=parameters['subsample'],
                max_depth=parameters['max_depth'])

        elif model == 'GaussianNB':
            return naive_bayes.GaussianNB()

        elif model == 'DecisionTreeClassifier':
            return tree.DecisionTreeClassifier(
                max_features=parameters['max_features'],
                criterion=parameters['criterion'],
                max_depth=parameters['max_depth'],
                min_samples_split=parameters['min_samples_split'])

        elif model == 'SGDClassifier':
            return linear_model.SGDClassifier(
                loss=parameters['loss'],
                penalty=parameters['penalty'],
                n_jobs=n_cores)

        elif model == 'KNeighborsClassifier':
            return neighbors.KNeighborsClassifier(
                n_neighbors=parameters['n_neighbors'],
                weights=parameters['weights'],
                algorithm=parameters['algorithm'],
                n_jobs=n_cores)

        else:
            raise ValueError("Unsupported classifier {}".format(model))