Skip to content

Commit

Permalink
Merge pull request #14 to fix inference issue
Browse files Browse the repository at this point in the history
Small hacks to fix #13, which is caused by the lack of tags during the inference stage
  • Loading branch information
kylase authored Jan 4, 2019
2 parents 188721d + 9f59121 commit dca0646
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 4 deletions.
8 changes: 7 additions & 1 deletion loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,13 @@ def f(x):
chars = [[char_to_id[c] for c in w if c in char_to_id]
for w in str_words]
caps = [cap_feature(w) for w in str_words]
tags = [tag_to_id[w[-1]] for w in s]

# Hack: This is for an inference stage where tag_to_id is not necessary
if tag_to_id:
tags = [tag_to_id[w[-1]] for w in s]
else:
tags = tag_to_id

data.append({
'str_words': str_words,
'words': words,
Expand Down
2 changes: 1 addition & 1 deletion run.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
file.write('\n'.join(string.split()) + '\n')
file.close()
test_sentences = load_sentences(test_file, lower, zeros)
data = prepare_dataset(test_sentences, word_to_id, char_to_id, lower, True)
data = prepare_dataset(test_sentences, word_to_id, char_to_id, {}, lower, True)

for citation in data:
inputs = create_input(citation, model.parameters, False)
Expand Down
2 changes: 1 addition & 1 deletion train.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@
# Train network
#
singletons = set([word_to_id[k] for k, v in dico_words_train.items() if v == 1])
n_epochs = 10 # number of epochs over the training set
n_epochs = 1 # number of epochs over the training set
freq_eval = 1000 # evaluate on dev every freq_eval steps
best_dev = -np.inf
best_test = -np.inf
Expand Down
3 changes: 2 additions & 1 deletion utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import codecs
import numpy as np
import theano
from sklearn import metrics

models_path = "./models"
eval_path = "./evaluation"
Expand Down Expand Up @@ -223,6 +222,8 @@ def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,
"""
Evaluate current model using CoNLL script.
"""
# Make sklearn import at runtime only
from sklearn import metrics
results = {'real': [], 'predicted': []}

for _, data in zip(raw_sentences, parsed_sentences):
Expand Down

0 comments on commit dca0646

Please sign in to comment.