from matplotlib import pyplot as plt
import tensorflow as tf
from keras import backend as K
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
from IPython.display import SVG, display
from keras.utils.vis_utils import model_to_dot
import logging
import numpy as np
import dill as dpickle
from annoy import AnnoyIndex
from tqdm import tqdm, tqdm_notebook
from random import random
from nltk.translate.bleu_score import corpus_bleu


def build_seq2seq_model(word_emb_dim,
                        hidden_state_dim,
                        encoder_seq_len,
                        num_encoder_tokens,
                        num_decoder_tokens):
    """
    Builds the architecture for a sequence to sequence model.

    The encoder and decoder each consist of one GRU layer. The user can
    specify the dimensionality of the word embeddings and the hidden state.

    Parameters
    ----------
    word_emb_dim : int
        dimensionality of the word embeddings
    hidden_state_dim : int
        dimensionality of the hidden state in the encoder and decoder
    encoder_seq_len : int
        the length of the sequences that are input into the encoder. The
        sequences are expected to all be padded to the same size.
    num_encoder_tokens : int
        the vocabulary size of the corpus relevant to the encoder
    num_decoder_tokens : int
        the vocabulary size of the corpus relevant to the decoder

    Returns
    -------
    Keras.models.Model
    """
    #### Encoder Model ####
    encoder_inputs = Input(shape=(encoder_seq_len,), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Bodies, Code)
    x = Embedding(num_encoder_tokens, word_emb_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = GRU(hidden_state_dim, return_state=True, name='Encoder-Last-GRU', dropout=.5)(x)

    # Encapsulate the encoder as a separate entity so we can just
    # encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')
    seq2seq_encoder_out = encoder_model(encoder_inputs)

    #### Decoder Model ####
    decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

    # Word embedding for decoder (ex: Issue Titles, Docstrings)
    dec_emb = Embedding(num_decoder_tokens, word_emb_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder, using the encoder's output as the initial state.
    decoder_gru = GRU(hidden_state_dim, return_state=True, return_sequences=True, name='Decoder-GRU', dropout=.5)
    decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    #### Seq2Seq Model ####
    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return seq2seq_Model
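

# A minimal usage sketch for building and training the model. The dimensions,
# sequence length, and compile/fit settings below are illustrative assumptions,
# not values prescribed by this module:
#
#   model = build_seq2seq_model(word_emb_dim=300,
#                               hidden_state_dim=768,
#                               encoder_seq_len=70,
#                               num_encoder_tokens=8000,
#                               num_decoder_tokens=4500)
#   model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
#   model.fit([encoder_input_data, decoder_input_data],
#             np.expand_dims(decoder_target_data, -1),
#             batch_size=1200, epochs=7, validation_split=0.12)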


def load_text_processor(fname='title_pp.dpkl'):
    """
    Load preprocessors from disk.

    Parameters
    ----------
    fname : str
        file name of the ktext.processor object

    Returns
    -------
    num_tokens : int
        size of the vocabulary loaded into the ktext.processor
    pp : ktext.processor
        the processor you are trying to load

    Typical Usage:
    -------------
    num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
    num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')
    """
    # Load files from disk
    with open(fname, 'rb') as f:
        pp = dpickle.load(f)

    num_tokens = max(pp.id2token.keys()) + 1
    print(f'Size of vocabulary for {fname}: {num_tokens:,}')
    return num_tokens, pp


def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
    """
    Load decoder inputs.

    Parameters
    ----------
    decoder_np_vecs : str
        filename of the serialized numpy.array of decoder input (issue titles)

    Returns
    -------
    decoder_input_data : numpy.array
        The data fed to the decoder as input during training for teacher forcing.
        This is the same as `decoder_np_vecs` except the last position is dropped.
    decoder_target_data : numpy.array
        The data the decoder is trained to generate (issue titles).
        Calculated by sliding `decoder_np_vecs` one position forward.
    """
    vectorized_title = np.load(decoder_np_vecs)

    # The decoder input does not need the last token, which serves only as a
    # target when training with teacher forcing.
    decoder_input_data = vectorized_title[:, :-1]

    # The decoder target data is ahead of the decoder input data by one time step (teacher forcing).
    decoder_target_data = vectorized_title[:, 1:]

    print(f'Shape of decoder input: {decoder_input_data.shape}')
    print(f'Shape of decoder target: {decoder_target_data.shape}')
    return decoder_input_data, decoder_target_data
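

# Illustration of the teacher-forcing shift above, using hypothetical token
# ids (suppose 2 = _start_ and 3 = _end_ in the title preprocessor):
#
#   row of decoder_np_vecs:  [2, 14, 9, 3]
#   decoder_input_data:      [2, 14, 9]      # everything but the last token
#   decoder_target_data:     [14, 9, 3]      # everything but the first token
#
# At each time step the decoder sees the true previous token and is trained
# to predict the token one position ahead.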


def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
    """
    Load variables & data that are inputs to the encoder.

    Parameters
    ----------
    encoder_np_vecs : str
        filename of the serialized numpy.array of encoder input (issue bodies)

    Returns
    -------
    encoder_input_data : numpy.array
        The issue bodies. The shape of this array is (num_examples, doc_length).
    doc_length : int
        The standard document length of the encoder input after padding.
    """
    vectorized_body = np.load(encoder_np_vecs)
    # Encoder input is simply the body of the issue text
    encoder_input_data = vectorized_body
    doc_length = encoder_input_data.shape[1]
    print(f'Shape of encoder input: {encoder_input_data.shape}')
    return encoder_input_data, doc_length


def viz_model_architecture(model):
    """Visualize model architecture in a Jupyter notebook."""
    display(SVG(model_to_dot(model).create(prog='dot', format='svg')))


def free_gpu_mem():
    """Attempt to free GPU memory."""
    K.get_session().close()
    cfg = K.tf.ConfigProto()
    cfg.gpu_options.allow_growth = True
    K.set_session(K.tf.Session(config=cfg))


def test_gpu():
    """Run a toy computation task in TensorFlow to test the GPU."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    hello = tf.constant('Hello, TensorFlow!')
    print(session.run(hello))


def plot_model_training_history(history_object):
    """Plot model train vs. validation loss."""
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.plot(history_object.history['loss'])
    plt.plot(history_object.history['val_loss'])
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


def extract_encoder_model(model):
    """
    Extract the encoder from the original sequence to sequence model.

    Returns a Keras model object that has one input (body of issue) and one
    output (encoding of issue, which is the last hidden state).

    Input:
    -----
    model: keras model object

    Returns:
    -----
    keras model object
    """
    encoder_model = model.get_layer('Encoder-Model')
    return encoder_model


def extract_decoder_model(model):
    """
    Extract the decoder from the original model.

    Inputs:
    ------
    model: keras model object

    Returns:
    -------
    A Keras model object with the following inputs and outputs:

    Inputs of the Keras model that is returned:
    1: the embedding index for the last predicted word, or the <Start> indicator
    2: the last hidden state, or, for the first word, the hidden state from the encoder

    Outputs of the Keras model that is returned:
    1: prediction (class probabilities) for the next word
    2: the hidden state of the decoder, to be fed back into the decoder at the next time step

    Implementation Notes:
    ----------------------
    We must extract the relevant layers and reconstruct part of the computation
    graph to allow for different inputs, as we are not going to use teacher
    forcing at inference time.
    """
    # The latent dimension is the dimension of the hidden state passed from the encoder to the decoder.
    latent_dim = model.get_layer('Encoder-Model').output_shape[-1]

    # Reconstruct the input into the decoder
    decoder_inputs = model.get_layer('Decoder-Input').input
    dec_emb = model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
    dec_bn = model.get_layer('Decoder-Batchnorm-1')(dec_emb)

    # During training the decoder's initial state came from the encoder and was then
    # forgotten. At inference time we are not doing teacher forcing, so we need a
    # feedback loop from predictions back into the GRU; this input layer carries the
    # state so we can add that capability.
    gru_inference_state_input = Input(shape=(latent_dim,), name='hidden_state_input')

    # We fetch the trained decoder GRU because we need to reuse its weights.
    # If you inspect the decoder GRU that we created for training, it takes
    # 2 tensors as input:
    # (1) the embedding layer output used for teacher forcing
    #     (at inference this is the last step's prediction, and _start_ on the first time step)
    # (2) the state, which we initialize from the encoder on the first time step, then
    #     grab the state returned after each prediction and feed it back in again.
    gru_out, gru_state_out = model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])

    # Reconstruct dense layers
    dec_bn2 = model.get_layer('Decoder-Batchnorm-2')(gru_out)
    dense_out = model.get_layer('Final-Output-Dense')(dec_bn2)
    decoder_model = Model([decoder_inputs, gru_inference_state_input],
                          [dense_out, gru_state_out])
    return decoder_model
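

# A single greedy decoding step with the extracted models might look like the
# sketch below. `raw_text`, `body_pp`, and `title_pp` are hypothetical names
# for an input string and the loaded encoder/decoder preprocessors; `_start_`
# and the "+ 2" index offset mirror the conventions used in
# Seq2Seq_Inference.predict further down:
#
#   encoder = extract_encoder_model(seq2seq_Model)
#   decoder = extract_decoder_model(seq2seq_Model)
#   state = encoder.predict(body_pp.transform([raw_text]))
#   start_id = np.array(title_pp.token2id['_start_']).reshape(1, 1)
#   probs, state = decoder.predict([start_id, state])
#   next_id = np.argmax(probs[:, :, 2:]) + 2   # skip the padding / unknown ids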


class Seq2Seq_Inference(object):
    def __init__(self,
                 encoder_preprocessor,
                 decoder_preprocessor,
                 seq2seq_model):
        self.enc_pp = encoder_preprocessor
        self.dec_pp = decoder_preprocessor
        self.seq2seq_model = seq2seq_model
        self.encoder_model = extract_encoder_model(seq2seq_model)
        self.decoder_model = extract_decoder_model(seq2seq_model)
        self.default_max_len = self.dec_pp.padding_maxlen
        self.nn = None
        self.rec_df = None
    def predict(self,
                raw_input_text,
                max_len=None):
        """
        Use the seq2seq model to generate an output given the input.

        Inputs
        ------
        raw_input_text : str
            The body of what is to be summarized or translated.
        max_len : int (optional)
            The maximum length of the output

        Returns
        -------
        original_encoding : numpy.array
            The encoder's hidden state for the input text.
        str
            The decoded output sequence.
        """
        if max_len is None:
            max_len = self.default_max_len

        # Get the encoder's features for the decoder
        raw_tokenized = self.enc_pp.transform([raw_input_text])
        encoding = self.encoder_model.predict(raw_tokenized)

        # Save the encoder's embedding before it is updated by the decoder,
        # because we can use it as an embedding for other tasks.
        original_encoding = encoding

        state_value = np.array(self.dec_pp.token2id['_start_']).reshape(1, 1)
        decoded_sentence = []
        stop_condition = False
        while not stop_condition:
            preds, st = self.decoder_model.predict([state_value, encoding])

            # Ignore index 0 (padding) and index 1 (unknown). Argmax over the
            # remaining indices returns the integer index of the prediction
            # minus 2, so we add 2 back because we chopped off the first two.
            pred_idx = np.argmax(preds[:, :, 2:]) + 2

            # Retrieve the word from the index prediction
            pred_word_str = self.dec_pp.id2token[pred_idx]

            if pred_word_str == '_end_' or len(decoded_sentence) >= max_len:
                stop_condition = True
                break
            decoded_sentence.append(pred_word_str)

            # Update the decoder for the next word
            encoding = st
            state_value = np.array(pred_idx).reshape(1, 1)

        return original_encoding, ' '.join(decoded_sentence)
    def print_example(self,
                      i,
                      input_text,
                      output_text,
                      url,
                      threshold):
        """
        Prints an example of the model's prediction for manual inspection.
        """
        if i:
            print('\n\n==============================================')
            print(f'============== Example # {i} =================\n')

        if url:
            print(url)

        print(f"Original Input:\n {input_text} \n")

        if output_text:
            print(f"Original Output:\n {output_text}")

        emb, gen_title = self.predict(input_text)
        print(f"\n****** Predicted Output ******:\n {gen_title}")
    def demo_model_predictions(self,
                               n,
                               df,
                               threshold=1,
                               input_col='code',
                               output_col='comment',
                               ref_col='ref'):
        """
        Pick n random examples and display predictions.

        Input:
        ------
        n : int
            Number of examples to display from the dataframe
        df : pandas DataFrame
        threshold : float
            distance threshold for recommendation of similar issues

        Returns:
        --------
        None
            Prints the original input and the model's prediction.
        """
        # Extract input and output from the dataframe
        input_text = df[input_col].tolist()
        output_text = df[output_col].tolist()
        url = df[ref_col].tolist()

        demo_list = np.random.randint(low=1, high=len(input_text), size=n)
        for i in demo_list:
            self.print_example(i,
                               input_text=input_text[i],
                               output_text=output_text[i],
                               url=url[i],
                               threshold=threshold)
    def evaluate_model(self, input_strings, output_strings, max_len):
        """
        Method for calculating the BLEU score.

        Parameters
        ----------
        input_strings : List[str]
            These are the issue bodies that we want to summarize
        output_strings : List[str]
            This is the ground truth we are trying to predict --> issue titles
        max_len : int
            The maximum length of the generated output

        Returns
        -------
        bleu : float
            The BLEU score
        """
        self.actual, self.predicted = list(), list()
        assert len(input_strings) == len(output_strings)
        num_examples = len(input_strings)

        logging.warning('Generating predictions.')
        # Step over the whole set. TODO: parallelize this.
        for i in tqdm_notebook(range(num_examples)):
            _, yhat = self.predict(input_strings[i], max_len)

            self.actual.append(self.dec_pp.process_text([output_strings[i]])[0])
            self.predicted.append(self.dec_pp.process_text([yhat])[0])

        # Calculate BLEU score
        logging.warning('Calculating BLEU.')
        bleu = corpus_bleu([[a] for a in self.actual], self.predicted)
        return bleu
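

# End-to-end usage sketch. The preprocessor file names follow the defaults in
# load_text_processor above; the trained model file name and the test lists
# are assumptions for illustration:
#
#   from keras.models import load_model
#
#   num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')
#   num_decoder_tokens, title_pp = load_text_processor('title_pp.dpkl')
#   seq2seq_Model = load_model('seq2seq_model.h5')
#
#   seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
#                                   decoder_preprocessor=title_pp,
#                                   seq2seq_model=seq2seq_Model)
#   emb, title = seq2seq_inf.predict('some issue body text')
#   bleu = seq2seq_inf.evaluate_model(test_bodies, test_titles, max_len=12)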