implements baseline
icoxfog417 committed Jun 8, 2017
1 parent 30fc80d commit 7b7525d
Showing 11 changed files with 302 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitignore
@@ -99,3 +99,6 @@ ENV/
 
 # mypy
 .mypy_cache/
+
+data/
+.DS_Store
4 changes: 3 additions & 1 deletion README.md
@@ -1,2 +1,4 @@
 # tying-wv-and-wc
-Implementation for "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling"
+
+Implementation for "[Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling](https://arxiv.org/abs/1611.01462)"
+
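A side note on where this baseline is headed: the paper's core technique is to reuse the input embedding matrix, transposed, as the output classifier's weights. This commit only wires up the untied baseline; the following is a rough, hypothetical Keras sketch of the tying idea. The TiedProjection name and its implementation are illustrative assumptions, not code from this repository:

from keras import backend as K
from keras.engine.topology import Layer

class TiedProjection(Layer):
    """Hypothetical output layer that reuses an Embedding layer's weights, transposed."""

    def __init__(self, embedding, **kwargs):
        super(TiedProjection, self).__init__(**kwargs)
        self.embedding = embedding  # the Embedding layer whose matrix is shared

    def call(self, x):
        # project hidden states onto the transposed embedding matrix, so the
        # input and output word representations share a single weight matrix
        logits = K.dot(x, K.transpose(self.embedding.embeddings))
        return K.softmax(logits)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.embedding.input_dim)

A layer like this would stand in for the softmax Dense projection in the baseline model below. Tying requires the LSTM output size to equal the embedding size, which the baseline's configuration already satisfies.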
Empty file added data/.gitkeep
Empty file.
Empty file added model/__init__.py
Empty file.
29 changes: 29 additions & 0 deletions model/data_processor.py
@@ -0,0 +1,29 @@
import chazutsu
import numpy as np


class DataProcessor():

def __init__(self):
pass

    def get_ptb(self, data_root, vocab_size=10000, force=False):
        # download Penn Treebank via chazutsu and index it with a vocab_size-word vocabulary
        r = chazutsu.datasets.PTB().download(directory=data_root)
        r_idx = r.to_indexed().make_vocab(vocab_size=vocab_size, force=force)
        return r_idx

    def format(self, word_seq, vocab_size, sentence_size=35, skip=3):
        # slide a window of sentence_size word ids over the sequence (stride = skip);
        # each window is paired with the word that immediately follows it
        sentences = []
        next_words = []
        for i in range(0, len(word_seq) - sentence_size, skip):
            sentences.append(word_seq[i:i + sentence_size])
            next_words.append(word_seq[i + sentence_size])

        sentences = np.array(sentences)
        # one-hot encode the next words for categorical_crossentropy
        one_hots = np.zeros((len(next_words), vocab_size))
        for i, nw in enumerate(next_words):
            one_hots[i][nw] = 1

        return sentences, one_hots
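To make the windowing concrete, a small usage sketch (word ids 0 to 9, windows of 3 with a stride of 3):

import numpy as np
from model.data_processor import DataProcessor

dp = DataProcessor()
x, y = dp.format(np.arange(10), vocab_size=10, sentence_size=3, skip=3)
# windows start at 0, 3 and 6, each paired with the word that follows it:
# x == [[0, 1, 2], [3, 4, 5], [6, 7, 8]]      -> shape (3, 3)
# y one-hot encodes the next words 3, 6, 9    -> shape (3, 10)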
68 changes: 68 additions & 0 deletions model/lang_model_sgd.py
@@ -0,0 +1,68 @@
import copy
from keras import backend as K
from keras.optimizers import Optimizer
import numpy as np
import tensorflow as tf
from model.settings import SizeSetting, DatasetSetting


class LangModelSGD(Optimizer):

def __init__(self, size_kind="small", dataset_kind="ptb"):
size_setting = SizeSetting.get(size_kind)
dset_setting = DatasetSetting.get(dataset_kind)
super(LangModelSGD, self).__init__()

self.iterations = K.variable(0.)
self.epoch_interval = K.variable(size_setting["epoch_interval"])
self.lr = K.variable(1.0)
self.decay = K.variable(size_setting["decay"])
self._clipnorm = size_setting["norm_clipping"]

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        # clip the gradients by their global norm (see clip_norm below)
        norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads]))
        grads = [clip_norm(g, self._clipnorm, norm) for g in grads]

        self.updates = [(self.iterations, self.iterations + 1.)]
        # `iterations` and `epoch_interval` are symbolic variables, so a plain
        # Python `if` never fires when the graph runs; express the periodic
        # decay symbolically instead
        decay_now = tf.logical_and(
            K.greater(self.iterations, 0),
            K.equal(self.iterations % self.epoch_interval, 0))
        new_lr = K.switch(decay_now, self.lr * self.decay, self.lr)
        self.updates.append((self.lr, new_lr))
        for p, g in zip(params, grads):
            self.updates.append((p, p - self.lr * g))
        return self.updates

def get_config(self):
config = {"lr": float(K.get_value(self.lr)),
"decay": float(K.get_value(self.decay)),
"epoch_interval": float(K.get_value(self.epoch_interval))
}
base_config = super(LangModelSGD, self).get_config()
return dict(list(base_config.items()) + list(config.items()))

    def get_lr(self):
        # K.get_value works across backends (self.lr.eval() would need a session)
        return float(K.get_value(self.lr))


# local copy of clip_norm, needed because of https://github.com/fchollet/keras/pull/6859

def clip_norm(g, c, n):
    if c > 0:
        condition = n >= c
        then_expression = tf.scalar_mul(c / n, g)
        else_expression = g

        # save the shape so tf.cond does not lose it (and to keep
        # IndexedSlices sparse instead of converting them to dense)
        if isinstance(then_expression, tf.Tensor):
            g_shape = copy.copy(then_expression.get_shape())
        elif isinstance(then_expression, tf.IndexedSlices):
            g_shape = copy.copy(then_expression.dense_shape)
        if condition.dtype != tf.bool:
            condition = tf.cast(condition, "bool")
        g = tf.cond(condition,
                    lambda: then_expression,
                    lambda: else_expression)
        if isinstance(then_expression, tf.Tensor):
            g.set_shape(g_shape)
        elif isinstance(then_expression, tf.IndexedSlices):
            g._dense_shape = g_shape

    return g
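The schedule this optimizer encodes, roughly: the learning rate starts at 1.0, gradients are clipped to norm_clipping by global norm, and the rate is multiplied by decay once per epoch_interval update steps. A plain-Python sketch of the resulting curve for the small setting (illustrative only, assuming steps are counted per update):

def lr_after(steps, decay=0.9, epoch_interval=5):
    # one multiplicative decay per completed interval
    return 1.0 * decay ** (steps // epoch_interval)

print([round(lr_after(s), 3) for s in (0, 5, 10, 15)])  # [1.0, 0.9, 0.81, 0.729]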
65 changes: 65 additions & 0 deletions model/one_hot_model.py
@@ -0,0 +1,65 @@
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM, Activation, Dropout
from model.lang_model_sgd import LangModelSGD
from model.settings import DatasetSetting


class OneHotModel():

def __init__(self,
vocab_size,
sentence_size,
network_size="small",
dataset_kind="ptb"):

self.network_size = network_size
self.dataset_kind = dataset_kind
self.vocab_size = vocab_size
self.sentence_size = sentence_size
self.vector_length = self.get_vector_length(network_size)

dset_setting = DatasetSetting.get(dataset_kind)
dropout = dset_setting["dropout"][network_size]

        # the embedding size matches the LSTM size, which the paper's weight tying later requires
        embedding = Embedding(self.vocab_size, self.vector_length, input_length=sentence_size)
        layer1 = LSTM(self.vector_length, return_sequences=True, dropout=dropout, recurrent_dropout=dropout)
        layer2 = LSTM(self.vector_length, return_sequences=False, dropout=dropout, recurrent_dropout=dropout)
        projection = Dense(self.vocab_size, activation="softmax")
        self.model = Sequential()
        self.model.add(embedding)
        self.model.add(layer1)
        self.model.add(layer2)
        self.model.add(projection)

    def get_vector_length(self, network_size):
        if network_size == "small":
            return 200
        elif network_size == "medium":
            return 650
        elif network_size == "large":
            return 1500
        else:
            return 200  # fall back to the small configuration

def compile(self):
sgd = LangModelSGD(self.network_size, self.dataset_kind)
self.model.compile(
loss="categorical_crossentropy",
optimizer=sgd
)

    def fit(self, x_train, y_train, x_test, y_test, batch_size=32, epochs=20):
        # track held-out loss each epoch; the test set was previously accepted but ignored
        self.model.fit(
            x_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(x_test, y_test)
        )

def predict(self, words):
x = np.zeros((1, self.sentence_size))
for i, w in enumerate(words):
x[0][i] = w
pred = self.model.predict(x)[0]
return pred

51 changes: 51 additions & 0 deletions model/settings.py
@@ -0,0 +1,51 @@
class SizeSetting():

@classmethod
def get(cls, kind):
if kind == "small":
return {
"epoch_interval": 5,
"decay": 0.9,
"norm_clipping": 5
}
elif kind == "medium":
return {
"epoch_interval": 10,
"decay": 0.9,
"norm_clipping": 5
}
elif kind == "large":
return {
"epoch_interval": 1,
"decay": 0.97,
"norm_clipping": 6
}
else:
raise Exception("You have to choose size from small, medium, large")


class DatasetSetting():

@classmethod
def get(cls, kind):
if kind == "ptb":
return {
"dropout": {
"small": 0.7,
"medium": 0.5,
"large": 0.35
},
"gamma": 0.65
}
elif kind == "wiki2":
return {
"dropout": {
"small": 0.8,
"medium": 0.6,
"large": 0.6
},
"gamma": 1.25
}
else:
raise Exception("You have to choose dataset from ptb, wiki2")

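These settings are plain dictionary lookups. For example (note that gamma is carried here for the paper's augmented loss but is not yet consumed by this baseline commit):

from model.settings import SizeSetting, DatasetSetting

size = SizeSetting.get("medium")
print(size["epoch_interval"], size["decay"], size["norm_clipping"])  # 10 0.9 5

dropout = DatasetSetting.get("ptb")["dropout"]["medium"]
print(dropout)  # 0.5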
33 changes: 33 additions & 0 deletions run.py
@@ -0,0 +1,33 @@
import os
import numpy as np
from model.one_hot_model import OneHotModel
from model.data_processor import DataProcessor


DATA_ROOT = os.path.join(os.path.dirname(__file__), "data")


def flatten(data):
    # concatenate chazutsu's per-sentence word-id lists into one flat sequence
    flatted = []
    for a in data.values.flatten():
        flatted += a
    return np.array(flatted)

def run_ptb(network_size="small"):
# prepare the data
dataset_kind = "ptb"
dp = DataProcessor()
ptb = dp.get_ptb(DATA_ROOT, vocab_size=10000)
vocab_size = len(ptb.vocab_data())
sentence_size = 35
x_train, y_train = dp.format(flatten(ptb.train_data()), vocab_size, sentence_size)
x_valid, y_valid = dp.format(flatten(ptb.valid_data()), vocab_size, sentence_size)

    # build and train the one-hot baseline model (train set for fitting, valid set for monitoring)
    model = OneHotModel(vocab_size, sentence_size, network_size, dataset_kind)
    model.compile()
    model.fit(x_train, y_train, x_valid, y_valid, epochs=1)


if __name__ == "__main__":
run_ptb()
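Running python run.py downloads the Penn Treebank via chazutsu into data/ (which the updated .gitignore above excludes from version control) and fits the small baseline for a single epoch as a smoke test.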
20 changes: 20 additions & 0 deletions tests/test_data_processor.py
@@ -0,0 +1,20 @@
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
import unittest
import numpy as np
from model.data_processor import DataProcessor


class TestDataProcessor(unittest.TestCase):

    def test_format(self):
        dp = DataProcessor()
        samples = np.arange(10) % 5  # word ids must stay below vocab_size=5
        x, y = dp.format(samples, 5, 3, skip=1)  # 7 windows of 3 ids each
        self.assertEqual(x.shape, (7, 3))
        self.assertEqual(y.shape, (7, 5))


if __name__ == "__main__":
unittest.main()
30 changes: 30 additions & 0 deletions tests/test_model.py
@@ -0,0 +1,30 @@
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
import unittest
import numpy as np
from model.data_processor import DataProcessor
from model.one_hot_model import OneHotModel


class TestModel(unittest.TestCase):

def test_one_hot_forward(self):
vocab_size = 10
sentence_size = 20

dp = DataProcessor()
samples = np.array(np.random.randint(vocab_size, size=100))
x, y = dp.format(samples, vocab_size, sentence_size)
samples = np.array(np.random.randint(vocab_size, size=100))
x_t, y_t = dp.format(samples, vocab_size, sentence_size)

model = OneHotModel(vocab_size, sentence_size)
model.compile()
model.fit(x, y, x_t, y_t, epochs=1)
        pred = model.predict(np.array([0, 1, 2]))
        # the prediction is a probability distribution over the vocabulary
        self.assertEqual(pred.shape, (vocab_size,))


if __name__ == "__main__":
unittest.main()
