# -*- coding: utf-8 -*-
u"""Make a json config file with the experiment configuration.

Usage:
    make_config.py <x_train.csv> <y_train.csv> <x_test.csv> <y_test.csv> <class_names.csv> <config.json> [--classifier=<classifier> --classifier-filename=<classifier-filename> -n --random-state=<random_state> --grid-scoring=<grid_scoring> --scaling=<scaling>]

Arguments:
    <x_train.csv>      Name of csv file with feature values for the training set. No index; the first row contains feature names.
    <y_train.csv>      Name of csv file with target class values for the training set. No index, no header.
    <x_test.csv>       Name of csv file with feature values for the test set. No index; the first row contains feature names.
    <y_test.csv>       Name of csv file with target class values for the test set. No index, no header.
    <class_names.csv>  Name of csv file with target class names (index=class index, value=class name).
    <config.json>      Name of json file to write the configuration to.

Options:
    -h --help                                    Show this screen.
    --version                                    Show version.
    --classifier=<classifier>                    Classifier name to use.
    --classifier-filename=<classifier-filename>  Filename to store the trained classifier (optional).
    -n                                           Disable grid hyperparameter search.
    --random-state=<random_state>                Use a preselected random state.
    --grid-scoring=<grid_scoring>                Grid hyperparameter set evaluation method. Examples: f1_weighted, accuracy, cohen_kappa.
    --scaling=<scaling>                          Use a specific value for feature scaling (instead of trying both True and False).
"""
import json
import random
import sys
from collections import OrderedDict
from hashlib import sha256
from itertools import product
from numpy import linspace
import pandas
from docopt import docopt


def classifiers_config(random_state, classifier_name=None, skip_grid=False):
    """Get classifiers config dict with parameter grid.

    Classifier config dict values can be iterables, meaning
    that a grid search should be performed.
    If they are not iterables (e.g. random_state is an int),
    the values are passed directly to the classifier constructor.

    Args:
        random_state: random_state config option to include for some classifiers.
        classifier_name: if specified, use only this classifier.
        skip_grid: if True, do not add grid parameters to search.

    Returns:
        dict {<classifier_name>: {'init': classifier init options,
                                  'grid': dict of classifier parameters grid}}.
    """
classifiers = {
'tree.DecisionTreeClassifier': { # attributes: feature_importances_
'init': {
'random_state': random_state
},
'grid': {
'criterion': ('gini', 'entropy'),
'splitter': ('best', 'random'),
'max_features': ('auto', 'sqrt', 'log2', None),
'class_weight': ('balanced', None)
}
},
'neighbors.KNeighborsClassifier': { # no attributes
'grid': {
                'n_neighbors': list(range(1, 101, 3)),  # list() keeps the grid JSON-serializable on Python 3
                'weights': ('uniform', 'distance'),
                'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
                'p': list(range(1, 11))
},
'scaling': False, # it hangs otherwise
},
'svm.LinearSVC': { # attributes: coef_
'init': {
'random_state': random_state
},
'grid': {
                'C': list(range(1, 11)),  # C must be positive (C=0 is invalid)
'loss': ('hinge', 'squared_hinge'),
'penalty': ('l1', 'l2'),
'dual': (True, False),
'multi_class': ('ovr', 'crammer_singer'),
'class_weight': ('balanced', None)
},
'scaling': False, # it hangs otherwise
},
'svm.SVC': { # no attributes
'init': {
'random_state': random_state
},
'grid': {
                'C': list(range(1, 11)),  # C must be positive (C=0 is invalid)
# exclude linear kernel, because we have LinearSVC, and
# for SVC linear kernel is very slow
'kernel': ('poly', 'rbf', 'sigmoid'),
'shrinking': (True, False),
'class_weight': ('balanced', None),
'decision_function_shape': ('ovo', 'ovr')
}
},
'ensemble.RandomForestClassifier': { # attributes: feature_importances_
'init': {
'random_state': random_state
},
'grid': {
                'n_estimators': list(range(2, 50, 4)),
'criterion': ('gini', 'entropy'),
'max_features': ('auto', 'sqrt', 'log2', None),
'bootstrap': (True, False),
'oob_score': (True, False),
'class_weight': ('balanced', 'balanced_subsample', None),
},
'scaling': False, # it hangs otherwise
},
'ensemble.ExtraTreesClassifier': { # attributes: feature_importances_
'init': {
'random_state': random_state
},
'grid': {
                'n_estimators': list(range(2, 50, 4)),
'criterion': ('gini', 'entropy'),
'max_features': ('auto', 'sqrt', 'log2', None),
'bootstrap': (True, False),
'oob_score': (True, False),
'class_weight': ('balanced', 'balanced_subsample', None),
},
'scaling': False, # it hangs otherwise
},
'ensemble.AdaBoostClassifier': { # attributes: feature_importances_
'init': {
'random_state': random_state
},
'grid': {
                'n_estimators': list(range(10, 101, 10)),
'learning_rate': [1] + list(linspace(0.1, 10, 21)),
'algorithm': ('SAMME', 'SAMME.R'),
},
'scaling': False, # it hangs otherwise
},
'ensemble.GradientBoostingClassifier': { # attributes: feature_importances_
'init': {
'random_state': random_state
},
'grid': {
'loss': ('deviance', 'exponential'),
'learning_rate': [0.01, 0.1, 0.2, 0.5],
# 'learning_rate': [0.1] + list(linspace(0.01, 0.5, 5)),
'n_estimators': [100, 300, 500],
# 'n_estimators': range(50, 501, 50),
'max_depth': [1, 3, 5, 7, 10],
# 'max_depth': range(1, 11),
'subsample': [0.1, 0.5, 1],
# 'subsample': list(linspace(0.1, 1, 6)),
'max_features': ('auto', 'sqrt', 'log2', None),
},
'scaling': False, # it hangs otherwise
},
'naive_bayes.GaussianNB': {}, # no attributes
'discriminant_analysis.LinearDiscriminantAnalysis': { # attributes: coef_
'grid': {
'solver': ('svd', 'lsqr', 'eigen'),
'shrinkage': (None, 'auto'),
}
},
'discriminant_analysis.QuadraticDiscriminantAnalysis': {}, # no attributes
}
if skip_grid:
for name, config in classifiers.items():
if 'grid' in config:
config['grid'] = {}
if classifier_name:
classifiers = {name: config for name, config in classifiers.items()
if name == classifier_name}
return classifiers
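

# Illustrative helper, not called anywhere in this script: a minimal sketch of
# how a downstream runner might turn one classifiers_config() entry into a
# scikit-learn grid search. The function name and flow here are hypothetical;
# GridSearchCV and importlib are standard scikit-learn/stdlib APIs.
def _example_grid_search(name='tree.DecisionTreeClassifier', random_state=42):
    from importlib import import_module
    from sklearn.model_selection import GridSearchCV
    cfg = classifiers_config(random_state)[name]
    # 'tree.DecisionTreeClassifier' -> module sklearn.tree, class DecisionTreeClassifier
    module_name, class_name = name.rsplit('.', 1)
    estimator_class = getattr(import_module('sklearn.' + module_name), class_name)
    # 'init' holds constructor kwargs; 'grid' maps parameter names to candidate values
    estimator = estimator_class(**cfg.get('init', {}))
    return GridSearchCV(estimator, cfg.get('grid', {}), scoring='f1_weighted')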


def get_file_hash(filename):
    """Calculate the sha256 hash of a file given its filename."""
    with open(filename, 'rb') as fin:  # close the file handle explicitly
        return sha256(fin.read()).hexdigest()


def make_config(options):
    """Make a json configuration file for classification.

    Most parameters are taken directly from the command-line arguments,
    e.g. the filenames of the features, classes and configuration files.
    Some parameters are hard-coded (e.g. verbose), and some are hard-coded
    and iterated over (scaling, classifier names, grid scoring), so that
    every combination of parameters is written into the config file.

    Args:
        options: CLI options dict.
    """
random_state = int(options["--random-state"] or random.randint(0, 1000000))
skip_grid = options["-n"]
grid_scoring = options["--grid-scoring"]
if grid_scoring:
grid_scoring_options = (grid_scoring,)
elif skip_grid:
grid_scoring_options = ("f1_weighted",)
else:
grid_scoring_options = ("f1_weighted", "cohen_kappa", "accuracy")
    scaling = options["--scaling"]
    # the option value arrives as a string, so bool() alone would treat "False" as True
    feature_scaling_options = ((scaling.lower() in ('1', 'true', 'yes'),)
                               if scaling is not None else (False, True))
class_names = pandas.read_csv(options['<class_names.csv>']).to_dict()['name']
x_train = pandas.read_csv(options["<x_train.csv>"])
    y_train = pandas.read_csv(options["<y_train.csv>"], header=None).iloc[:, 0]
    y_test = pandas.read_csv(options["<y_test.csv>"], header=None).iloc[:, 0]
features = x_train.columns
# class_code: number of examples per class, e.g. {0: 10, 1: 20, 2: 35, 3: 12}
train_classes_counts_raw = y_train.value_counts().to_dict()
# class name: number of examples
train_classes_counts = [(name, train_classes_counts_raw[i])
for i, name in class_names.items()]
    test_classes_counts_raw = y_test.value_counts().to_dict()
    # class name: number of examples (same mapping as for the training set)
    test_classes_counts = [(name, test_classes_counts_raw[i])
                           for i, name in class_names.items()]
classifiers = classifiers_config(random_state, options["--classifier"], skip_grid)
config = []
for classifier, scaling, grid_scoring in product(
classifiers, feature_scaling_options, grid_scoring_options):
        if not classifiers[classifier].get('grid') \
                and grid_scoring != grid_scoring_options[0]:
            # without a grid, the scoring option makes no difference, so
            # configs for the non-first grid_scoring_options would duplicate
            # the first one; skip them
            continue
classifier_scaling = classifiers[classifier].get('scaling')
if classifier_scaling is not None and scaling != classifier_scaling:
# if there is specific classifier scaling setting, use it always
# and skip other options
continue
dct = OrderedDict()
dct["classifier"] = OrderedDict((
("name", classifier),
("grid_scoring", grid_scoring),
("config", classifiers[classifier])))
if options["--classifier-filename"]:
dct["classifier"]["filename"] = options["--classifier-filename"]
dct["classes"] = OrderedDict((
("train", OrderedDict((
("names", train_classes_counts),
("total", sum(dict(train_classes_counts).values())),
("filename", options[u"<y_train.csv>"]),
("filehash", get_file_hash(options["<y_train.csv>"]))))),
("test", OrderedDict((
("names", test_classes_counts),
("total", sum(dict(test_classes_counts).values())),
("filename", options[u"<y_test.csv>"]),
("filehash", get_file_hash(options["<y_test.csv>"])))))))
dct["features"] = OrderedDict((
("scaling", scaling),
("count", len(features)),
("train_filename", options["<x_train.csv>"]),
("train_filehash", get_file_hash(options["<x_train.csv>"])),
("test_filename", options["<x_test.csv>"]),
("test_filehash", get_file_hash(options["<x_test.csv>"])),
("names", sorted(features))))
dct["verbose"] = False
dct["random_state"] = random_state
config.append(dct)
with open(options["<config.json>"], "w") as fout:
json.dump(config, fout, indent=4)
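

# For reference, one emitted config entry has roughly this shape (illustrative
# values; the actual content depends on the CLI arguments and input files):
#
#     {"classifier": {"name": "svm.SVC",
#                     "grid_scoring": "f1_weighted",
#                     "config": {"init": {"random_state": 42}, "grid": {...}}},
#      "classes": {"train": {"names": [["classA", 10], ["classB", 20]], "total": 30,
#                            "filename": "y_train.csv", "filehash": "..."},
#                  "test": {...}},
#      "features": {"scaling": false, "count": 5, ...},
#      "verbose": false,
#      "random_state": 42}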


if __name__ == "__main__":
try:
make_config(docopt(__doc__))
except KeyboardInterrupt:
pass
sys.exit(0)
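

# A consumer can load the generated configuration back with the standard json
# module (illustrative; 'config.json' stands for whatever name was passed on
# the command line):
#
#     with open('config.json') as fin:
#         experiments = json.load(fin)
#     for experiment in experiments:
#         print(experiment['classifier']['name'], experiment['features']['scaling'])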