-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathlearn.py
89 lines (68 loc) · 2.71 KB
/
learn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import scipy as sp
import os, signals
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
#Check if the module is executed as main, needed for parallel processing
if __name__ == '__main__':
#List of parameters
SHOW_CONFUSION_MATRIX = False
x_data = []
y_data = []
classes = {}
root="data" #Default directory containing the dataset
print "Loading the dataset from '{directory}'...".format(directory=root),
#Fetch all the data files from the root directory of the dataset
for path, subdirs, files in os.walk(root):
for name in files:
#Get the filename
filename = os.path.join(path, name)
#Load the sample from file
sample = signals.Sample.load_from_file(filename)
#Linearize the sample and then add it to the x_data list
x_data.append(sample.get_linearized())
#Extract the category from the file name
#For example, the file "a_sample_0.txt" will be considered as "a"
category = name.split("_")[0]
#Get a number for the category, as an offset from the category
#to the a char in Ascii
number = ord(category) - ord("a")
#Add the category to the y_data list
y_data.append(number)
#Include the category and the corresponding number into a dictionary
#for easy access and referencing
classes[number] = category
print "DONE"
#Parameters used in the cross-validated training process
#The library automatically tries every possible combination to
#find the best scoring one.
params = {'C':[0.001,0.01,0.1,1], 'kernel':['linear']}
#Inizialize the model
svc = svm.SVC(probability = True)
#Inizialize the GridSearchCV with 8 processing cores and maximum verbosity
clf = GridSearchCV(svc, params,verbose =10, n_jobs=8)
#Split the dataset into two subset, one used for training and one for testing
X_train, X_test, Y_train, Y_test = train_test_split(x_data,
y_data, test_size=0.35, random_state=0)
print "Starting the training process..."
#Start the training process
clf.fit(X_train, Y_train)
#If SHOW_CONFUSION_MATRIX is true, prints the confusion matrix
if SHOW_CONFUSION_MATRIX:
print "Confusion Matrix:"
Y_predicted = clf.predict(X_test)
print confusion_matrix(Y_test, Y_predicted)
print "\nBest estimator parameters: "
print clf.best_estimator_
#Calculates the score of the best estimator found.
score = clf.score(X_test, Y_test)
print "\nSCORE: {score}\n".format(score = score)
print "Saving the model...",
#Saves the model to the "model.pkl" file
joblib.dump(clf, 'model.pkl')
#Saves the classes to the "classes.pkl" file
joblib.dump(classes, 'classes.pkl')
print "DONE"