port to tensorflow v1+ and sst binary demo
openai-sys-okta-integration committed Jul 12, 2017
1 parent 8ce0a73 commit 83d940d
Showing 6 changed files with 9,679 additions and 7 deletions.
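For reference, these are the TensorFlow 1.0 API changes the encoder.py diff below applies, collected into a minimal sketch; the tensor shapes are illustrative only, and it assumes a tensorflow 1.x release:

import tensorflow as tf  # assumes tensorflow>=1.0,<2.0

z = tf.zeros([2, 8])                        # illustrative tensor
i, f, o, u = tf.split(z, 4, 1)              # was tf.split(1, 4, z): value first, axis last
cs = tf.stack([z, z])                       # was tf.pack
a, b = tf.unstack(cs, num=2)                # was tf.unpack
h = tf.concat([a, b], 1)                    # was tf.concat(1, [a, b]): values first, axis second
init = tf.global_variables_initializer()   # was tf.initialize_all_variables()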
873 changes: 873 additions & 0 deletions data/dev_binary_sent.csv

Large diffs are not rendered by default.

1,822 changes: 1,822 additions & 0 deletions data/test_binary_sent.csv

Large diffs are not rendered by default.

6,921 changes: 6,921 additions & 0 deletions data/train_binary_sent.csv

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions encoder.py
@@ -65,7 +65,7 @@ def mlstm(inputs, c, h, M, ndim, scope='lstm', wn=False):
     for idx, x in enumerate(inputs):
         m = tf.matmul(x, wmx)*tf.matmul(h, wmh)
         z = tf.matmul(x, wx) + tf.matmul(m, wh) + b
-        i, f, o, u = tf.split(1, 4, z)
+        i, f, o, u = tf.split(z, 4, 1)
         i = tf.nn.sigmoid(i)
         f = tf.nn.sigmoid(f)
         o = tf.nn.sigmoid(o)
@@ -81,22 +81,22 @@ def mlstm(inputs, c, h, M, ndim, scope='lstm', wn=False):
             h = o*tf.tanh(c)
         inputs[idx] = h
         cs.append(c)
-    cs = tf.pack(cs)
+    cs = tf.stack(cs)
     return inputs, cs, c, h


 def model(X, S, M=None, reuse=False):
     nsteps = X.get_shape()[1]
-    cstart, hstart = tf.unpack(S, num=hps.nstates)
+    cstart, hstart = tf.unstack(S, num=hps.nstates)
     with tf.variable_scope('model', reuse=reuse):
         words = embd(X, hps.nembd)
-        inputs = [tf.squeeze(v, [1]) for v in tf.split(1, nsteps, words)]
+        inputs = tf.unstack(words, nsteps, 1)
         hs, cells, cfinal, hfinal = mlstm(
             inputs, cstart, hstart, M, hps.nhidden, scope='rnn', wn=hps.rnn_wn)
-        hs = tf.reshape(tf.concat(1, hs), [-1, hps.nhidden])
+        hs = tf.reshape(tf.concat(hs, 1), [-1, hps.nhidden])
         logits = fc(
             hs, hps.nvocab, act=lambda x: x, wn=hps.out_wn, scope='out')
-        states = tf.pack([cfinal, hfinal], 0)
+        states = tf.stack([cfinal, hfinal], 0)
     return cells, states, logits


@@ -143,7 +143,7 @@ def __init__(self, nbatch=128, nsteps=64):
         cells, states, logits = model(X, S, M, reuse=False)

         sess = tf.Session()
-        tf.initialize_all_variables().run(session=sess)
+        tf.global_variables_initializer().run(session=sess)

         def seq_rep(xmb, mmb, smb):
             return sess.run(states, {X: xmb, M: mmb, S: smb})
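Beyond the renames, the one structural change above replaces the split-and-squeeze list comprehension with a single tf.unstack along the step axis. A minimal sketch of the equivalence, with invented shapes standing in for [nbatch, nsteps, nembd]:

import tensorflow as tf  # assumes a 1.x release

words = tf.zeros([4, 6, 8])  # illustrative [nbatch, nsteps, nembd]
# old pattern: split into 6 slices of shape [4, 1, 8], then squeeze the step axis
old = [tf.squeeze(v, [1]) for v in tf.split(words, 6, 1)]
# new call: unstack directly into 6 tensors of shape [4, 8]
new = tf.unstack(words, 6, 1)
assert [v.get_shape().as_list() for v in new] == [[4, 8]] * 6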
14 changes: 14 additions & 0 deletions sst_binary_demo.py
@@ -0,0 +1,14 @@
from encoder import Model
from utils import sst_binary, train_with_reg_cv

model = Model()

trX, vaX, teX, trY, vaY, teY = sst_binary()
trXt = model.transform(trX)
vaXt = model.transform(vaX)
teXt = model.transform(teX)

full_rep_acc, c, nnotzero = train_with_reg_cv(trXt, trY, vaXt, vaY, teXt, teY)
print('%05.2f test accuracy'%full_rep_acc)
print('%05.2f regularization coef'%c)
print('%05d features used'%nnotzero)
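The same pipeline works on ad-hoc text. A minimal usage sketch, assuming the pretrained model parameters are available as in the demo above and that Model.transform accepts any list of raw strings the way it accepts the SST sentences (the sentences here are invented):

from encoder import Model

model = Model()
# one feature vector per input string; the sentences are placeholders
feats = model.transform(['this movie was a delight', 'a dull, lifeless slog'])
print(feats.shape)  # (2, nhidden) for the encoder's hidden size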
42 changes: 42 additions & 0 deletions utils.py
@@ -1,5 +1,47 @@
import os
import html
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.linear_model import LogisticRegression

def train_with_reg_cv(trX, trY, vaX, vaY, teX=None, teY=None, penalty='l1',
                      C=2**np.arange(-8, 1).astype(np.float), seed=42):
    # sweep the regularization grid, scoring each fit on the validation split
    scores = []
    for i, c in enumerate(C):
        model = LogisticRegression(C=c, penalty=penalty, random_state=seed+i)
        model.fit(trX, trY)
        score = model.score(vaX, vaY)
        scores.append(score)
    # refit at the best validation C and count the surviving (nonzero) weights
    c = C[np.argmax(scores)]
    model = LogisticRegression(C=c, penalty=penalty, random_state=seed+len(C))
    model.fit(trX, trY)
    nnotzero = np.sum(model.coef_ != 0)
    if teX is not None and teY is not None:
        score = model.score(teX, teY)*100.
    else:
        score = model.score(vaX, vaY)*100.
    return score, c, nnotzero
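train_with_reg_cv sweeps nine logarithmically spaced regularization strengths, 2^-8 through 2^0, refits at the best validation score, and reports accuracy as a percentage. A self-contained sketch of the call, with synthetic random features standing in for the real mLSTM features:

import numpy as np
from utils import train_with_reg_cv

print(2.0 ** np.arange(-8, 1))  # the default C grid: 0.00390625 ... 0.5 1.0

rng = np.random.RandomState(0)
trX, trY = rng.randn(200, 16), rng.randint(0, 2, 200)
vaX, vaY = rng.randn(50, 16), rng.randint(0, 2, 50)
# no test split given, so the returned score is validation accuracy
score, c, nnotzero = train_with_reg_cv(trX, trY, vaX, vaY)
print(score, c, nnotzero)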


def load_sst(path):
    data = pd.read_csv(path)
    X = data['sentence'].values.tolist()
    Y = data['label'].values
    return X, Y
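load_sst expects a CSV with a header row and 'sentence' and 'label' columns, matching the files in data/. A tiny round-trip sketch with invented rows and a hypothetical filename:

import pandas as pd
from utils import load_sst

pd.DataFrame({'sentence': ['a gorgeous, witty film', 'flat and lifeless'],
              'label': [1, 0]}).to_csv('toy_binary_sent.csv', index=False)
X, Y = load_sst('toy_binary_sent.csv')  # X: list of str, Y: numpy array of ints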


def sst_binary(data_dir='data/'):
    """
    Most standard models use a preprocessed/tokenized/lowercased version of the
    Stanford Sentiment Treebank. Our model instead extracts features from a
    version of the dataset that keeps the raw text, which we've included in
    the data folder.
    """
    trX, trY = load_sst(os.path.join(data_dir, 'train_binary_sent.csv'))
    vaX, vaY = load_sst(os.path.join(data_dir, 'dev_binary_sent.csv'))
    teX, teY = load_sst(os.path.join(data_dir, 'test_binary_sent.csv'))
    return trX, vaX, teX, trY, vaY, teY


def find_trainable_variables(key):
