# VarNNEmbedVecClassification.py
import numpy as np

import shorttext.utils.kerasmodel_io as kerasio
import shorttext.utils.classification_exceptions as e
import shorttext.utils.compactmodel_io as cio
from shorttext.utils import tokenize


@cio.compactio({'classifier': 'nnlibvec'}, 'nnlibvec', ['_classlabels.txt', '.json', '.h5'])
class VarNNEmbeddedVecClassifier:
"""
This is a wrapper for various neural network algorithms
for supervised short text categorization.
Each class label has a few short sentences, where each token is converted
to an embedded vector, given by a pre-trained word-embedding model (e.g., Google Word2Vec model).
The sentences are represented by a matrix, or rank-2 array.
The type of neural network has to be passed when training, and it has to be of
type :class:`keras.models.Sequential`. The number of outputs of the models has to match
the number of class labels in the training data.
    To perform prediction, the input short sentence is converted to a matrix of embedded
    word vectors in the same way. The score is calculated according to the trained neural network model.
Examples of the models can be found in `frameworks`.
A pre-trained Google Word2Vec model can be downloaded `here
<https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit>`_.
Examples
>>> import shorttext
>>> # load the Word2Vec model
>>> wvmodel = shorttext.utils.load_word2vec_model('GoogleNews-vectors-negative300.bin.gz', binary=True)
>>>
>>> # load the training data
>>> trainclassdict = shorttext.data.subjectkeywords()
>>>
>>> # initialize the classifier and train
    >>> kmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(trainclassdict))    # using a convolutional neural network model
>>> classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel)
>>> classifier.train(trainclassdict, kmodel)
Epoch 1/10
45/45 [==============================] - 0s - loss: 1.0578
Epoch 2/10
45/45 [==============================] - 0s - loss: 0.5536
Epoch 3/10
45/45 [==============================] - 0s - loss: 0.3437
Epoch 4/10
45/45 [==============================] - 0s - loss: 0.2282
Epoch 5/10
45/45 [==============================] - 0s - loss: 0.1658
Epoch 6/10
45/45 [==============================] - 0s - loss: 0.1273
Epoch 7/10
45/45 [==============================] - 0s - loss: 0.1052
Epoch 8/10
45/45 [==============================] - 0s - loss: 0.0961
Epoch 9/10
45/45 [==============================] - 0s - loss: 0.0839
Epoch 10/10
45/45 [==============================] - 0s - loss: 0.0743
>>> classifier.score('artificial intelligence')
{'mathematics': 0.57749695, 'physics': 0.33749574, 'theology': 0.085007325}
"""
def __init__(self, wvmodel, vecsize=300, maxlen=15):
""" Initialize the classifier.
:param wvmodel: Word2Vec model
:param vecsize: length of the embedded vectors in the model (Default: 300)
:param maxlen: maximum number of words in a sentence (Default: 15)
:type wvmodel: gensim.models.word2vec.Word2Vec
:type vecsize: int
:type maxlen: int
"""
self.wvmodel = wvmodel
self.vecsize = vecsize
self.maxlen = maxlen
self.trained = False
def convert_trainingdata_matrix(self, classdict):
""" Convert the training data into format put into the neural networks.
Convert the training data into format put into the neural networks.
This is called by :func:`~train`.
:param classdict: training data
:return: a tuple of three, containing a list of class labels, matrix of embedded word vectors, and corresponding outputs
:type classdict: dict
:rtype: (list, numpy.ndarray, list)
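
        A minimal sketch of the expected shapes, assuming a classifier instance with the
        default ``maxlen=15`` and ``vecsize=300``, and a hypothetical two-label training dict:

        >>> classdict = {'cs': ['neural network', 'deep learning'], 'bio': ['gene expression']}
        >>> classlabels, embedvec, indices = classifier.convert_trainingdata_matrix(classdict)
        >>> embedvec.shape    # (number of short texts, maxlen, vecsize)
        (3, 15, 300)
        >>> indices.shape     # one-hot encoded class labels, one row per short text
        (3, 2)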
"""
        classlabels = list(classdict.keys())   # materialize as a list (dict views are lazy in Python 3)
lblidx_dict = dict(zip(classlabels, range(len(classlabels))))
# tokenize the words, and determine the word length
phrases = []
indices = []
for label in classlabels:
for shorttext in classdict[label]:
                shorttext = shorttext if isinstance(shorttext, str) else ''   # guard against non-string entries
category_bucket = [0]*len(classlabels)
category_bucket[lblidx_dict[label]] = 1
indices.append(category_bucket)
phrases.append(tokenize(shorttext))
# store embedded vectors
train_embedvec = np.zeros(shape=(len(phrases), self.maxlen, self.vecsize))
for i in range(len(phrases)):
for j in range(min(self.maxlen, len(phrases[i]))):
train_embedvec[i, j] = self.word_to_embedvec(phrases[i][j])
        indices = np.array(indices, dtype=int)   # np.int is deprecated/removed in recent NumPy
return classlabels, train_embedvec, indices
def train(self, classdict, kerasmodel, nb_epoch=10):
""" Train the classifier.
The training data and the corresponding keras model have to be given.
        If this method has not been run, and no model was loaded by :func:`~loadmodel`,
        a `ModelNotTrainedException` will be raised while performing prediction or saving the model.
:param classdict: training data
:param kerasmodel: keras sequential model
:param nb_epoch: number of steps / epochs in training
:return: None
:type classdict: dict
:type kerasmodel: keras.models.Sequential
:type nb_epoch: int
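
        A minimal sketch (see the class docstring above for a full run; ``trainclassdict``
        and ``kmodel`` are as defined there):

        >>> classifier.train(trainclassdict, kmodel, nb_epoch=10)
        >>> classifier.trained
        True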
"""
# convert classdict to training input vectors
self.classlabels, train_embedvec, indices = self.convert_trainingdata_matrix(classdict)
# train the model
kerasmodel.fit(train_embedvec, indices, epochs=nb_epoch)
# flag switch
self.model = kerasmodel
self.trained = True
def savemodel(self, nameprefix):
""" Save the trained model into files.
Given the prefix of the file paths, save the model into files, with name given by the prefix.
There will be three files produced, one name ending with "_classlabels.txt", one name
ending with ".json", and one name ending with ".h5".
If there is no trained model, a `ModelNotTrainedException` will be thrown.
:param nameprefix: prefix of the file path
:return: None
:type nameprefix: str
:raise: ModelNotTrainedException
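
        A sketch of saving a trained classifier (the prefix here is hypothetical):

        >>> classifier.savemodel('subject_cnn')
        >>> # writes subject_cnn_classlabels.txt, subject_cnn.json, and subject_cnn.h5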
"""
if not self.trained:
raise e.ModelNotTrainedException()
kerasio.save_model(nameprefix, self.model)
        with open(nameprefix + '_classlabels.txt', 'w') as labelfile:
            labelfile.write('\n'.join(self.classlabels))
def loadmodel(self, nameprefix):
""" Load a trained model from files.
Given the prefix of the file paths, load the model from files with name given by the prefix
followed by "_classlabels.txt", ".json", and ".h5".
If this has not been run, or a model was not trained by :func:`~train`,
a `ModelNotTrainedException` will be raised while performing prediction or saving the model.
:param nameprefix: prefix of the file path
:return: None
:type nameprefix: str
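
        A sketch of restoring a previously saved model (the prefix is hypothetical and must
        match the one used in :func:`~savemodel`):

        >>> classifier = VarNNEmbeddedVecClassifier(wvmodel)
        >>> classifier.loadmodel('subject_cnn')
        >>> classifier.trained
        True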
"""
        self.model = kerasio.load_model(nameprefix)
        with open(nameprefix + '_classlabels.txt', 'r') as labelfile:
            # keep a concrete list (not a lazy map) so len() and indexing work in score()
            self.classlabels = [s.strip() for s in labelfile.readlines()]
        self.trained = True
def word_to_embedvec(self, word):
""" Convert the given word into an embedded vector.
        Given a word, return the corresponding embedded vector according to
        the word-embedding model. If the word is not in the model,
        a vector of zeros is returned.
:param word: a word
:return: the corresponding embedded vector
:type word: str
:rtype: numpy.ndarray
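
        For example, assuming the default ``vecsize=300`` and that the second token below
        is out of the model's vocabulary:

        >>> classifier.word_to_embedvec('computer').shape
        (300,)
        >>> np.all(classifier.word_to_embedvec('zzqqxx') == 0)   # OOV words map to zeros
        True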
"""
return self.wvmodel[word] if word in self.wvmodel else np.zeros(self.vecsize)
def shorttext_to_matrix(self, shorttext):
""" Convert the short text into a matrix with word-embedding representation.
Given a short sentence, it converts all the tokens into embedded vectors according to
the given word-embedding model, and put them into a matrix. If a word is not in the model,
that row will be filled with zero.
:param shorttext: a short sentence
:return: a matrix of embedded vectors that represent all the tokens in the sentence
:type shorttext: str
:rtype: numpy.ndarray
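
        A sketch of the resulting shape, assuming the default ``maxlen=15`` and ``vecsize=300``
        (rows beyond the sentence length stay zero):

        >>> classifier.shorttext_to_matrix('artificial intelligence').shape
        (15, 300)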
"""
tokens = tokenize(shorttext)
matrix = np.zeros((self.maxlen, self.vecsize))
for i in range(min(self.maxlen, len(tokens))):
matrix[i] = self.word_to_embedvec(tokens[i])
return matrix
def score(self, shorttext):
""" Calculate the scores for all the class labels for the given short sentence.
Given a short sentence, calculate the classification scores for all class labels,
returned as a dictionary with key being the class labels, and values being the scores.
If the short sentence is empty, or if other numerical errors occur, the score will be `numpy.nan`.
If neither :func:`~train` nor :func:`~loadmodel` was run, it will raise `ModelNotTrainedException`.
:param shorttext: a short sentence
:return: a dictionary with keys being the class labels, and values being the corresponding classification scores
:type shorttext: str
:rtype: dict
:raise: ModelNotTrainedException
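
        A sketch of picking the top label from the returned dictionary, assuming a trained
        classifier and the scores shown in the class docstring above:

        >>> scores = classifier.score('artificial intelligence')
        >>> max(scores, key=scores.get)
        'mathematics'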
"""
if not self.trained:
raise e.ModelNotTrainedException()
# retrieve vector
matrix = np.array([self.shorttext_to_matrix(shorttext)])
# classification using the neural network
predictions = self.model.predict(matrix)
# wrangle output result
scoredict = {}
        for idx, classlabel in enumerate(self.classlabels):
            scoredict[classlabel] = predictions[0][idx]
return scoredict


def load_varnnlibvec_classifier(wvmodel, name, compact=True):
""" Load a :class:`shorttext.classifiers.VarNNEmbeddedVecClassifier` instance from file, given the pre-trained Word2Vec model.
:param wvmodel: Word2Vec model
:param name: name (if compact=True) or prefix (if compact=False) of the file path
    :param compact: whether the model file is compact (Default: True)
:return: the classifier
:type wvmodel: gensim.models.keyedvectors.KeyedVectors
:type name: str
:type compact: bool
:rtype: VarNNEmbeddedVecClassifier
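
    A sketch of loading a compact model file (the file name here is hypothetical):

    >>> import shorttext
    >>> wvmodel = shorttext.utils.load_word2vec_model('GoogleNews-vectors-negative300.bin.gz', binary=True)
    >>> classifier = load_varnnlibvec_classifier(wvmodel, 'subject_nnlibvec.bin')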
"""
classifier = VarNNEmbeddedVecClassifier(wvmodel)
if compact:
classifier.load_compact_model(name)
else:
classifier.loadmodel(name)
return classifier