"""
Dataset class
"""
import math
import random
import numpy as np
from os import path
class Dataset(object):
"""Class representing a chatbot dataset with questions, answers, and vocabulary.
"""
def __init__(self, questions, answers, input_vocabulary, output_vocabulary):
"""Initializes a Dataset instance with a list of questions, answers, and input/output vocabularies.
Args:
questions: Can be a list of questions as space delimited sentence(s) of words
or a list of lists of integer encoded words
answers: Can be a list of answers as space delimited sentence(s) of words
or a list of lists of integer encoded words
input_vocabulary: The Vocabulary instance to use for encoding questions
output_vocabulary: The Vocabulary instance to use for encoding answers
"""
if len(questions) != len (answers):
raise RuntimeError("questions and answers lists must be the same length, as they are lists of input-output pairs.")
self.input_vocabulary = input_vocabulary
self.output_vocabulary = output_vocabulary
#If the questions and answers are already integer encoded, accept them as is.
#Otherwise use the Vocabulary instances to encode the question and answer sequences.
if len(questions) > 0 and isinstance(questions[0], str):
self.questions_into_int = [self.input_vocabulary.words2ints(q) for q in questions]
self.answers_into_int = [self.output_vocabulary.words2ints(a) for a in answers]
else:
self.questions_into_int = questions
self.answers_into_int = answers
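
    # Construction sketch (illustrative only; assumes two Vocabulary instances exposing
    # the words2ints/ints2words/pad_int methods referenced in this file):
    #
    #   questions = ["how are you ?", "what is your name ?"]
    #   answers = ["i am fine .", "my name is bot ."]
    #   dataset = Dataset(questions, answers, input_vocab, output_vocab)
    #
    # Pre-encoded integer sequences are accepted as well:
    #
    #   dataset = Dataset([[5, 8, 2]], [[7, 1, 3, 4]], input_vocab, output_vocab)
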
    def size(self):
        """The size (number of samples) of the Dataset.
        """
        return len(self.questions_into_int)

    def train_val_split(self, val_percent=20, random_split=True, move_samples=True):
        """Splits the dataset into training and validation sets.

        Args:
            val_percent: the percentage of the dataset to use as validation data.

            random_split: True to split the dataset randomly.
                False to split the dataset sequentially
                (validation samples are the last N samples, where N = samples * (val_percent / 100))

            move_samples: True to physically move the samples into the returned training and validation dataset objects (saves memory).
                False to copy the samples into the returned training and validation dataset objects, and preserve this dataset instance.
        """
        if move_samples:
            questions = self.questions_into_int
            answers = self.answers_into_int
        else:
            questions = self.questions_into_int[:]
            answers = self.answers_into_int[:]

        num_validation_samples = int(len(questions) * (val_percent / 100))
        num_training_samples = len(questions) - num_validation_samples

        training_questions = []
        training_answers = []
        validation_questions = []
        validation_answers = []

        if random_split:
            # Draw validation samples at random; whatever remains becomes training data.
            for _ in range(num_validation_samples):
                random_index = random.randint(0, len(questions) - 1)
                validation_questions.append(questions.pop(random_index))
                validation_answers.append(answers.pop(random_index))
            for _ in range(num_training_samples):
                training_questions.append(questions.pop(0))
                training_answers.append(answers.pop(0))
        else:
            # Sequential split: the first samples become training data, the last N validation data.
            for _ in range(num_training_samples):
                training_questions.append(questions.pop(0))
                training_answers.append(answers.pop(0))
            for _ in range(num_validation_samples):
                validation_questions.append(questions.pop(0))
                validation_answers.append(answers.pop(0))

        training_dataset = Dataset(training_questions, training_answers, self.input_vocabulary, self.output_vocabulary)
        validation_dataset = Dataset(validation_questions, validation_answers, self.input_vocabulary, self.output_vocabulary)
        return training_dataset, validation_dataset
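
    # Split sketch (illustrative only): reserve 10% of the samples for validation,
    # keep the original sample order, and leave this dataset instance untouched.
    #
    #   train_set, val_set = dataset.train_val_split(val_percent=10,
    #                                                random_split=False,
    #                                                move_samples=False)
    #   print(train_set.size(), val_set.size())
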
    def sort(self):
        """Sorts the dataset by the lengths of the questions. This can speed up training by reducing the
        amount of padding the input sequences need.
        """
        if self.size() > 0:
            # Sort question-answer pairs together by question length, and keep the results
            # as lists so the dataset can still be split or otherwise modified afterward.
            sorted_pairs = sorted(zip(self.questions_into_int, self.answers_into_int),
                                  key=lambda qa_pair: len(qa_pair[0]))
            self.questions_into_int, self.answers_into_int = (list(seq) for seq in zip(*sorted_pairs))

    def save(self, filepath):
        """Saves the dataset questions & answers exactly as represented by input_vocabulary and output_vocabulary.
        """
        filename, ext = path.splitext(filepath)
        questions_filepath = "{0}_questions{1}".format(filename, ext)
        answers_filepath = "{0}_answers{1}".format(filename, ext)

        with open(questions_filepath, mode="w", encoding="utf-8") as file:
            for question_into_int in self.questions_into_int:
                question = self.input_vocabulary.ints2words(question_into_int, is_punct_discrete_word=True, capitalize_i=False)
                file.write(question)
                file.write('\n')

        with open(answers_filepath, mode="w", encoding="utf-8") as file:
            for answer_into_int in self.answers_into_int:
                answer = self.output_vocabulary.ints2words(answer_into_int, is_punct_discrete_word=True, capitalize_i=False)
                file.write(answer)
                file.write('\n')
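
    # Save sketch (illustrative only): calling save("data/cleaned.txt") writes two files,
    # "data/cleaned_questions.txt" and "data/cleaned_answers.txt", with one decoded
    # sentence per line.
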
    def batches(self, batch_size):
        """Provide the dataset as an enumerable collection of batches of size batch_size.

        Each batch will be a matrix of a fixed shape (batch_size, max_seq_length_in_batch).
        Sequences that are shorter than the largest one are padded at the end with the PAD token.
        Padding is largely just used as a placeholder since the dynamic encoder and decoder RNNs
        will never see the padded timesteps.

        Args:
            batch_size: size of each batch.
                If the total number of samples is not evenly divisible by batch_size, the last batch
                will contain the remainder, which will be less than batch_size.

        Yields:
            padded_questions_in_batch: A matrix (numpy array) of padded, integer-encoded question sequences.

            padded_answers_in_batch: A matrix (numpy array) of padded, integer-encoded answer sequences.

            seqlen_questions_in_batch: A numpy array of actual sequence lengths for each question in the batch.

            seqlen_answers_in_batch: A numpy array of actual sequence lengths for each answer in the batch.
        """
        for batch_index in range(0, math.ceil(len(self.questions_into_int) / batch_size)):
            start_index = batch_index * batch_size

            questions_in_batch = self.questions_into_int[start_index : start_index + batch_size]
            answers_in_batch = self.answers_into_int[start_index : start_index + batch_size]

            seqlen_questions_in_batch = np.array([len(q) for q in questions_in_batch])
            seqlen_answers_in_batch = np.array([len(a) for a in answers_in_batch])

            padded_questions_in_batch = np.array(self._apply_padding(questions_in_batch, self.input_vocabulary))
            padded_answers_in_batch = np.array(self._apply_padding(answers_in_batch, self.output_vocabulary))

            yield padded_questions_in_batch, padded_answers_in_batch, seqlen_questions_in_batch, seqlen_answers_in_batch
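
    # Iteration sketch (illustrative only): with two questions of lengths 3 and 5 in a
    # batch, padded_q has shape (2, 5) and seqlen_q is np.array([3, 5]).
    #
    #   for padded_q, padded_a, seqlen_q, seqlen_a in dataset.batches(batch_size=64):
    #       ...  # feed the padded batch and its true lengths to a training step
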
    def _apply_padding(self, batch_of_sequences, vocabulary):
        """Pads the sequences with the <PAD> token to ensure all sequences in the batch are the same physical size.

        Input and target sequences can be any length, but each batch that is fed into the model
        must be defined as a fixed size matrix of shape (batch_size, max_seq_length_in_batch).
        Padding allows for this dynamic sequence length within a fixed matrix.

        However, the actual padded sequence timesteps are never seen by the encoder or decoder, nor are they
        counted toward the softmax loss. This is possible since we provide the actual sequence lengths to the model
        as a separate vector of shape (batch_size), where the RNNs are instructed to only unroll for the number of
        timesteps that have real (unpadded) values. The sequence loss also accepts a masking weight matrix where
        we can specify that loss values for padded timesteps should be ignored.

        Args:
            batch_of_sequences: list of integer-encoded sequences to pad

            vocabulary: Vocabulary instance to use to look up the integer encoding of the PAD token
        """
        max_sequence_length = max(len(sequence) for sequence in batch_of_sequences)
        return [sequence + ([vocabulary.pad_int()] * (max_sequence_length - len(sequence))) for sequence in batch_of_sequences]
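

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the training pipeline). The real
    # Vocabulary class lives elsewhere in this project, so a tiny stand-in providing
    # the three methods used above (words2ints, ints2words, pad_int) is assumed here.
    class _StubVocabulary(object):
        def __init__(self):
            self._word_to_int = {}

        def words2ints(self, sentence):
            # Assign each new word the next free integer, starting at 1 (0 is reserved for PAD).
            return [self._word_to_int.setdefault(word, len(self._word_to_int) + 1)
                    for word in sentence.split()]

        def ints2words(self, ints, is_punct_discrete_word=True, capitalize_i=False):
            int_to_word = {i: w for w, i in self._word_to_int.items()}
            return " ".join(int_to_word.get(i, "<UNK>") for i in ints)

        def pad_int(self):
            return 0

    vocab = _StubVocabulary()
    dataset = Dataset(["how are you today", "what is your name"],
                      ["i am fine", "my name is bot thanks for asking"],
                      vocab, vocab)

    train_set, val_set = dataset.train_val_split(val_percent=50, random_split=False)
    print("train size:", train_set.size(), "val size:", val_set.size())

    for padded_q, padded_a, seqlen_q, seqlen_a in train_set.batches(batch_size=2):
        print("padded questions:\n", padded_q)
        print("question lengths:", seqlen_q)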