From a815c8473ee3ddc68a253131067121a273e35488 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 8 Aug 2017 13:39:53 +0530 Subject: [PATCH 01/32] added initial code for CBOW --- gensim/models/fasttext.py | 244 +++++++++++++++++++++++++++++++++++ gensim/test/test_fasttext.py | 111 ++++++++++++++++ 2 files changed, 355 insertions(+) create mode 100644 gensim/models/fasttext.py create mode 100644 gensim/test/test_fasttext.py diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py new file mode 100644 index 0000000000..4afa4e87e9 --- /dev/null +++ b/gensim/models/fasttext.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import logging + +from types import GeneratorType +from copy import deepcopy +from six import string_types +import numpy as np +from numpy import dot, zeros, ones, vstack, outer, random, sum as np_sum, empty, float32 as REAL +from scipy.special import expit + +from gensim.utils import call_on_class_only +from gensim.models.word2vec import Word2Vec +from gensim.models.wrappers.fasttext import FastTextKeyedVectors +from gensim.models.wrappers.fasttext import FastText as Ft_Wrapper + +logger = logging.getLogger(__name__) + +MAX_WORDS_IN_BATCH = 10000 + + +def train_batch_cbow(model, sentences, alpha, work=None, neu1=None): + result = 0 + for sentence in sentences: + word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and + model.wv.vocab[w].sample_int > model.random.rand() * 2**32] + for pos, word in enumerate(word_vocabs): + reduced_window = model.random.randint(model.window) # `b` in the original word2vec code + start = max(0, pos - model.window + reduced_window) + window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) + word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] + + word2_subwords = [] + + for indices in word2_indices: + word2_subwords += ['<' + model.wv.index2word[indices] + '>'] + word2_subwords += Ft_Wrapper.compute_ngrams(model.wv.index2word[indices], model.min_n, model.max_n) + word2_subwords = list(set(word2_subwords)) + + subwords_indices = [] + for subword in word2_subwords: + subwords_indices.append(model.wv.ngrams[subword]) + + l1 = np_sum(model.wv.syn0_all[subwords_indices], axis=0) # 1 x vector_size + if subwords_indices and model.cbow_mean: + l1 /= len(subwords_indices) + + train_cbow_pair(model, word, subwords_indices, l1, alpha) # train on the sliding window for target word + result += len(word_vocabs) + return result + +def train_cbow_pair(model, word, input_subword_indices, l1, alpha, learn_vectors=True, learn_hidden=True): + neu1e = zeros(l1.shape) + + if model.hs: + l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size + fa = expit(dot(l1, l2a.T)) # propagate hidden -> output + ga = (1. 
- word.code - fa) * alpha # vector of error gradients multiplied by the learning rate + if learn_hidden: + model.syn1[word.point] += outer(ga, l1) # learn hidden -> output + neu1e += dot(ga, l2a) # save error + + if model.negative: + # use this word (label = 1) + `negative` other random words not from this sentence (label = 0) + word_indices = [word.index] # through word index get all subwords indices (need to make the changes in code) + while len(word_indices) < model.negative + 1: + w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1])) + if w != word.index: + word_indices.append(w) + l2b = model.syn1neg[word_indices] # 2d matrix, k+1 x layer1_size + fb = expit(dot(l1, l2b.T)) # propagate hidden -> output + gb = (model.neg_labels - fb) * alpha # vector of error gradients multiplied by the learning rate + if learn_hidden: + model.syn1neg[word_indices] += outer(gb, l1) # learn hidden -> output + neu1e += dot(gb, l2b) # save error + + if learn_vectors: + # learn input -> hidden, here for all words in the window separately + if not model.cbow_mean and input_subword_indices: + neu1e /= len(input_subword_indices) + for i in input_subword_indices: + model.wv.syn0_all[i] += neu1e * model.syn0_all_lockf[i] + + return neu1e + + +class FastText(Word2Vec): + def __init__( + self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, word_ngrams=1, loss='ns', sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000, + trim_rule=None, batch_words=MAX_WORDS_IN_BATCH): + + self.load = call_on_class_only + + self.initialize_ngram_vectors() + + self.sg = int(sg) + self.cum_table = None # for negative sampling + self.vector_size = int(size) + self.layer1_size = int(size) + if size % 4 != 0: + logger.warning("consider setting layer size to a multiple of 4 for greater performance") + self.alpha = float(alpha) + self.min_alpha_yet_reached = float(alpha) # To warn user if alpha increases + self.window = int(window) + self.max_vocab_size = max_vocab_size + self.seed = seed + self.random = random.RandomState(seed) + self.min_count = min_count + self.sample = sample + self.workers = int(workers) + self.min_alpha = float(min_alpha) + self.hs = hs + self.negative = negative + self.cbow_mean = int(cbow_mean) + self.hashfxn = hashfxn + self.iter = iter + self.null_word = null_word + self.train_count = 0 + self.total_train_time = 0 + self.sorted_vocab = sorted_vocab + self.batch_words = batch_words + self.model_trimmed_post_training = False + + self.bucket = bucket + self.loss = loss # should we keep this? -> we already have `hs`, `negative` -> although we don't have a mode for only `softmax` + self.word_ngrams = word_ngrams + self.min_n = min_n + self.max_n = max_n + if self.word_ngrams <= 1 and self.max_n == 0: + self.bucket = 0 + + self.wv.min_n = min_n + self.wv.max_n = max_n + + if sentences is not None: + if isinstance(sentences, GeneratorType): + raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") + self.build_vocab(sentences, trim_rule=trim_rule) + self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, + start_alpha=self.alpha, end_alpha=self.min_alpha) + else: + if trim_rule is not None: + logger.warning("The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. ") + logger.warning("Model initialized without sentences. 
trim_rule provided, if any, will be ignored.") + + def train(self, sentences, total_examples=None, total_words=None, + epochs=None, start_alpha=None, end_alpha=None, + word_count=0, queue_factor=2, report_delay=1.0): + self.neg_labels = [] + if self.negative > 0: + # precompute negative labels optimization for pure-python training + self.neg_labels = zeros(self.negative + 1) + self.neg_labels[0] = 1. + + Word2Vec.train(self, sentences, total_examples=self.corpus_count, epochs=self.iter, + start_alpha=self.alpha, end_alpha=self.min_alpha) + self.get_vocab_word_vecs() + + def initialize_ngram_vectors(self): + self.wv = FastTextKeyedVectors() + + def __getitem__(self, word): + return self.word_vec(word) + + def get_vocab_word_vecs(self): + for w, v in self.wv.vocab.items(): + word_vec = np.zeros(self.wv.syn0_all.shape[1]) + ngrams = ['<' + w + '>'] + ngrams += Ft_Wrapper.compute_ngrams(w, self.min_n, self.max_n) + ngrams = list(set(ngrams)) + ngram_weights = self.wv.syn0_all + for ngram in ngrams: + word_vec += ngram_weights[self.wv.ngrams[ngram]] + word_vec /= len(ngrams) + + self.wv.syn0[v.index] = word_vec + + def word_vec(self, word, use_norm=False): + if word in self.wv.vocab: + if use_norm: + return self.wv.syn0norm[self.wv.vocab[word].index] + else: + return self.wv.syn0[self.wv.vocab[word].index] + else: + logger.info("out of vocab") + word_vec = np.zeros(self.wv.syn0_all.shape[1]) + ngrams = Ft_Wrapper.compute_ngrams(word, self.min_n, self.max_n) + ngrams = [ng for ng in ngrams if ng in self.wv.ngrams] + if use_norm: + ngram_weights = self.wv.syn0_all_norm + else: + ngram_weights = self.wv.syn0_all + for ngram in ngrams: + word_vec += ngram_weights[self.wv.ngrams[ngram]] + if word_vec.any(): + return word_vec / len(ngrams) + else: # No ngrams of the word are present in self.ngrams + raise KeyError('all ngrams for word %s absent from model' % word) + + def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): + self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey + self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling + self.finalize_vocab(update=update) # build tables & arrays + # super(build_vocab, self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False) + self.init_ngrams() + + def reset_ngram_weights(self): + for ngram in self.wv.ngrams: + self.wv.syn0_all[self.wv.ngrams[ngram]] = self.seeded_vector(ngram + str(self.seed)) + + def init_ngrams(self): + self.wv.ngrams = {} + self.wv.syn0_all = empty((self.bucket + len(self.wv.vocab), self.vector_size), dtype=REAL) + self.syn0_all_lockf = ones((self.bucket + len(self.wv.vocab), self.vector_size), dtype=REAL) + + all_ngrams = [] + for w, v in self.wv.vocab.items(): + all_ngrams += ['<' + w + '>'] + all_ngrams += Ft_Wrapper.compute_ngrams(w, self.min_n, self.max_n) + all_ngrams = list(set(all_ngrams)) + self.num_ngram_vectors = len(all_ngrams) + logger.info("Total number of ngrams in the vocab is %d", self.num_ngram_vectors) + + ngram_indices = [] + for i, ngram in enumerate(all_ngrams): + ngram_hash = Ft_Wrapper.ft_hash(ngram) + ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket) + self.wv.ngrams[ngram] = i + + self.wv.syn0_all = self.wv.syn0_all.take(ngram_indices, axis=0) + self.reset_ngram_weights() + + def _do_train_job(self, sentences, alpha, inits): + work, neu1 = inits + tally = 0 + # if self.sg: + # tally += 
train_batch_sg(self, sentences, alpha, work) + # else: + tally += train_batch_cbow(self, sentences, alpha, work, neu1) + + return tally, self._raw_word_count(sentences) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py new file mode 100644 index 0000000000..56da1bbdd8 --- /dev/null +++ b/gensim/test/test_fasttext.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import logging +import unittest +import os + +import numpy as np + +from gensim import utils +from gensim.models.fasttext import FastText as FT_gensim +from gensim.models.wrappers.fasttext import FastText as FT_wrapper + +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +datapath = lambda fname: os.path.join(module_path, 'test_data', fname) + +class LeeCorpus(object): + def __iter__(self): + with open(datapath('lee_background.cor')) as f: + for line in f: + yield utils.simple_preprocess(line) + +list_corpus = list(LeeCorpus()) + +sentences = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'] +] + + +class TestFastTextModel(unittest.TestCase): + + def models_equal(self, model, model2): + self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) + self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0)) + self.assertTrue(np.allclose(model.wv.syn0_all, model2.wv.syn0_all)) + if model.hs: + self.assertTrue(np.allclose(model.syn1, model2.syn1)) + if model.negative: + self.assertTrue(np.allclose(model.syn1neg, model2.syn1neg)) + most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0] + self.assertTrue(np.allclose(model[most_common_word], model2[most_common_word])) + + def testTraining(self): + model = FT_gensim(size=2, min_count=1, hs=1, negative=0) + model.build_vocab(sentences) + + self.assertTrue(model.wv.syn0_all.shape == (len(model.wv.ngrams), 2)) + self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) + + model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) + sims = model.most_similar('graph', topn=10) + + # test querying for "most similar" by vector + graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index] + sims2 = model.most_similar(positive=[graph_vector], topn=11) + sims2 = [(w, sim) for w, sim in sims2 if w != 'graph'] # ignore 'graph' itself + self.assertEqual(sims, sims2) + + # build vocab and train in one step; must be the same as above + model2 = FT_gensim(sentences, size=2, min_count=1, hs=1, negative=0) + self.models_equal(model, model2) + + def test_against_fasttext_wrapper(self, model_gensim, model_wrapper): + sims_gensim = model_gensim.most_similar('war', topn=50) + sims_wrapper = model_wrapper.most_similar('war', topn=50) + self.assertEqual(sims_gensim, sims_wrapper) + + def test_cbow_hs(self): + model_wrapper = FT_wrapper.train(ft_path='/home/chinmaya/GSOC/Gensim/fastText/fasttext', + corpus_file=datapath('lee_background.cor'), output_file='/home/chinmaya/GSOC/Gensim/fasttext_out1', model='cbow', size=50, + alpha=0.05, window=8, min_count=5, word_ngrams=1, loss='hs', sample=1e-3, negative=0, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12) + + model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=8, hs=1, negative=0, + min_count=5, iter=5, batch_words=1000, 
word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + sorted_vocab=1, workers=12, min_alpha=0.0001) + + model_gensim.build_vocab(list_corpus) + orig0 = np.copy(model_gensim.wv.syn0[0]) + model_gensim.train(list_corpus, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + self.assertFalse((orig0 == model_gensim.wv.syn0[1]).all()) # vector should vary after training + + self.test_against_fasttext_wrapper(model_gensim, model_wrapper) + + def test_cbow_neg(self): + model_wrapper = FT_wrapper.train(ft_path='/home/chinmaya/GSOC/Gensim/fastText/fasttext', + corpus_file=datapath('lee_background.cor'), output_file='/home/chinmaya/GSOC/Gensim/fasttext_out1', model='cbow', size=50, + alpha=0.05, window=8, min_count=5, word_ngrams=1, loss='ns', sample=1e-3, negative=15, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12) + + model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=8, hs=0, negative=15, + min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + sorted_vocab=1, workers=12, min_alpha=0.0001) + + model_gensim.build_vocab(list_corpus) + orig0 = np.copy(model_gensim.wv.syn0[0]) + model_gensim.train(list_corpus, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + self.assertFalse((orig0 == model_gensim.wv.syn0[1]).all()) # vector should vary after training + + self.test_against_fasttext_wrapper(model_gensim, model_wrapper) + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) + unittest.main() From 102c14a516ccad5eb1c686c218c09c456a6a965c Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Mon, 14 Aug 2017 00:00:30 +0530 Subject: [PATCH 02/32] updated unit tests for fasttext --- gensim/test/test_fasttext.py | 39 +++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 56da1bbdd8..ab79776e59 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -8,6 +8,7 @@ import numpy as np from gensim import utils +from gensim.models.word2vec import LineSentence from gensim.models.fasttext import FastText as FT_gensim from gensim.models.wrappers.fasttext import FastText as FT_wrapper @@ -69,39 +70,45 @@ def testTraining(self): self.models_equal(model, model2) def test_against_fasttext_wrapper(self, model_gensim, model_wrapper): - sims_gensim = model_gensim.most_similar('war', topn=50) - sims_wrapper = model_wrapper.most_similar('war', topn=50) + sims_gensim = model_gensim.most_similar('night', topn=10) + sims_gensim_words = (list(map(lambda x:x[0], sims_gensim))) + + sims_wrapper = model_wrapper.most_similar('night', topn=10) + sims_wrapper_words = (list(map(lambda x:x[0], sims_wrapper))) + self.assertEqual(sims_gensim, sims_wrapper) def test_cbow_hs(self): model_wrapper = FT_wrapper.train(ft_path='/home/chinmaya/GSOC/Gensim/fastText/fasttext', corpus_file=datapath('lee_background.cor'), output_file='/home/chinmaya/GSOC/Gensim/fasttext_out1', model='cbow', size=50, - alpha=0.05, window=8, min_count=5, word_ngrams=1, loss='hs', sample=1e-3, negative=0, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12) + alpha=0.05, window=2, min_count=5, word_ngrams=1, loss='hs', sample=1e-3, negative=0, iter=3, min_n=3, max_n=6, sorted_vocab=1, threads=1) - model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=8, hs=1, negative=0, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - 
sorted_vocab=1, workers=12, min_alpha=0.0001) + model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, + min_count=5, iter=3, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + sorted_vocab=1, workers=12, min_alpha=0.0) - model_gensim.build_vocab(list_corpus) + lee_data = LineSentence(datapath('lee_background.cor')) + model_gensim.build_vocab(lee_data) orig0 = np.copy(model_gensim.wv.syn0[0]) - model_gensim.train(list_corpus, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) - self.assertFalse((orig0 == model_gensim.wv.syn0[1]).all()) # vector should vary after training + model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training self.test_against_fasttext_wrapper(model_gensim, model_wrapper) def test_cbow_neg(self): model_wrapper = FT_wrapper.train(ft_path='/home/chinmaya/GSOC/Gensim/fastText/fasttext', corpus_file=datapath('lee_background.cor'), output_file='/home/chinmaya/GSOC/Gensim/fasttext_out1', model='cbow', size=50, - alpha=0.05, window=8, min_count=5, word_ngrams=1, loss='ns', sample=1e-3, negative=15, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12) + alpha=0.05, window=2, min_count=5, word_ngrams=1, loss='ns', sample=1e-3, negative=15, iter=7, min_n=3, max_n=6, sorted_vocab=1, threads=1) - model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=8, hs=0, negative=15, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - sorted_vocab=1, workers=12, min_alpha=0.0001) + model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=15, + min_count=1, iter=7, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + sorted_vocab=1, workers=1, min_alpha=0.0) - model_gensim.build_vocab(list_corpus) + lee_data = LineSentence(datapath('lee_background.cor')) + model_gensim.build_vocab(lee_data) orig0 = np.copy(model_gensim.wv.syn0[0]) - model_gensim.train(list_corpus, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) - self.assertFalse((orig0 == model_gensim.wv.syn0[1]).all()) # vector should vary after training + model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training self.test_against_fasttext_wrapper(model_gensim, model_wrapper) From 4c449df021123387a9dc0b2454a4adefa2de5043 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Mon, 14 Aug 2017 00:13:34 +0530 Subject: [PATCH 03/32] corrected use of matrix and precomputed ngrams for vocab words --- gensim/models/fasttext.py | 44 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 4afa4e87e9..2fb40a6230 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -26,19 +26,20 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None): word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) # `b` in the original word2vec code - start = max(0, pos - model.window + reduced_window) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) + # 
reduced_window = model.random.randint(model.window) # `b` in the original word2vec code + # start = max(0, pos - model.window + reduced_window) + start = max(0, pos - model.window) + # window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) + window_pos = enumerate(word_vocabs[start:(pos + model.window + 1)], start) word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] word2_subwords = [] + subwords_indices = [] - for indices in word2_indices: - word2_subwords += ['<' + model.wv.index2word[indices] + '>'] - word2_subwords += Ft_Wrapper.compute_ngrams(model.wv.index2word[indices], model.min_n, model.max_n) - word2_subwords = list(set(word2_subwords)) + for index in word2_indices: + subwords_indices += [index] + word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]] - subwords_indices = [] for subword in word2_subwords: subwords_indices.append(model.wv.ngrams[subword]) @@ -134,6 +135,7 @@ def __init__( self.wv.min_n = min_n self.wv.max_n = max_n + self.wv.ngrams_word = {} if sentences is not None: if isinstance(sentences, GeneratorType): @@ -167,15 +169,12 @@ def __getitem__(self, word): def get_vocab_word_vecs(self): for w, v in self.wv.vocab.items(): - word_vec = np.zeros(self.wv.syn0_all.shape[1]) - ngrams = ['<' + w + '>'] - ngrams += Ft_Wrapper.compute_ngrams(w, self.min_n, self.max_n) - ngrams = list(set(ngrams)) + word_vec = self.wv.syn0_all[v.index] + ngrams = self.wv.ngrams_word[w] ngram_weights = self.wv.syn0_all for ngram in ngrams: word_vec += ngram_weights[self.wv.ngrams[ngram]] - word_vec /= len(ngrams) - + word_vec /= (len(ngrams) + 1) self.wv.syn0[v.index] = word_vec def word_vec(self, word, use_norm=False): @@ -204,27 +203,28 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_ self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling self.finalize_vocab(update=update) # build tables & arrays - # super(build_vocab, self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False) self.init_ngrams() def reset_ngram_weights(self): - for ngram in self.wv.ngrams: - self.wv.syn0_all[self.wv.ngrams[ngram]] = self.seeded_vector(ngram + str(self.seed)) + for index in range(len(self.wv.vocab) + len(self.wv.ngrams)): + self.wv.syn0_all[index] = np.random.uniform(-1.0/self.vector_size, 1.0/self.vector_size, self.vector_size) def init_ngrams(self): self.wv.ngrams = {} - self.wv.syn0_all = empty((self.bucket + len(self.wv.vocab), self.vector_size), dtype=REAL) - self.syn0_all_lockf = ones((self.bucket + len(self.wv.vocab), self.vector_size), dtype=REAL) + self.wv.syn0_all = empty((len(self.wv.vocab) + self.bucket, self.vector_size), dtype=REAL) + self.syn0_all_lockf = ones((len(self.wv.vocab) + self.bucket, self.vector_size), dtype=REAL) all_ngrams = [] for w, v in self.wv.vocab.items(): - all_ngrams += ['<' + w + '>'] - all_ngrams += Ft_Wrapper.compute_ngrams(w, self.min_n, self.max_n) + self.wv.ngrams_word[w] = Ft_Wrapper.compute_ngrams(w, self.min_n, self.max_n) + all_ngrams += self.wv.ngrams_word[w] + all_ngrams = list(set(all_ngrams)) self.num_ngram_vectors = len(all_ngrams) logger.info("Total number of ngrams in the vocab is %d", self.num_ngram_vectors) - ngram_indices = [] + ngram_indices = range(len(self.wv.vocab)) # keeping the first `len(self.wv.vocab)` rows intact + for 
i, ngram in enumerate(all_ngrams): ngram_hash = Ft_Wrapper.ft_hash(ngram) ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket) From f49df5443f60d526dd5e275159ddc646010a565f Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 15 Aug 2017 16:47:47 +0530 Subject: [PATCH 04/32] added EOS token in 'LineSentence' class --- gensim/models/word2vec.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index b5b60361b1..ff6aedf563 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1539,6 +1539,7 @@ def __iter__(self): self.source.seek(0) for line in itertools.islice(self.source, self.limit): line = utils.to_unicode(line).split() + line.append('/s') i = 0 while i < len(line): yield line[i : i + self.max_sentence_length] @@ -1548,6 +1549,7 @@ def __iter__(self): with utils.smart_open(self.source) as fin: for line in itertools.islice(fin, self.limit): line = utils.to_unicode(line).split() + line.append('/s') i = 0 while i < len(line): yield line[i:i + self.max_sentence_length] From 1fcb8fad6522787750642e85a42b74476d9ec8e4 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Wed, 16 Aug 2017 17:23:52 +0530 Subject: [PATCH 05/32] added skipgram training code --- gensim/models/fasttext.py | 88 +++++++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 9 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 2fb40a6230..01045d2a51 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -26,11 +26,11 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None): word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): - # reduced_window = model.random.randint(model.window) # `b` in the original word2vec code - # start = max(0, pos - model.window + reduced_window) - start = max(0, pos - model.window) - # window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1)], start) + reduced_window = model.random.randint(model.window) # `b` in the original word2vec code + start = max(0, pos - model.window + reduced_window) + # start = max(0, pos - model.window) + window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) + # window_pos = enumerate(word_vocabs[start:(pos + model.window + 1)], start) word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] word2_subwords = [] @@ -85,6 +85,76 @@ def train_cbow_pair(model, word, input_subword_indices, l1, alpha, learn_vectors return neu1e +def train_batch_sg(model, sentences, alpha, work=None): + result = 0 + for sentence in sentences: + word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and + model.wv.vocab[w].sample_int > model.random.rand() * 2**32] + for pos, word in enumerate(word_vocabs): + reduced_window = model.random.randint(model.window) # `b` in the original word2vec code + # now go over all words from the (reduced) window, predicting each one in turn + start = max(0, pos - model.window + reduced_window) + for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): + if pos2 != pos: # don't train on the `word` itself + subwords_indices = [word2.index] + word2_subwords = model.wv.ngrams_word[model.wv.index2word[word2.index]] + + for 
subword in word2_subwords: + subwords_indices.append(model.wv.ngrams[subword]) + + train_sg_pair(model, model.wv.index2word[word.index], subwords_indices, alpha) + + result += len(word_vocabs) + return result + +def train_sg_pair(model, word, input_subword_indices, alpha, learn_vectors=True, learn_hidden=True, context_vectors=None, context_locks=None): + if context_vectors is None: + context_vectors = model.wv.syn0_all + if context_locks is None: + context_locks = model.syn0_all_lockf + + if word not in model.wv.vocab: + return + predict_word = model.wv.vocab[word] # target word (NN output) + + l1 = np_sum(context_vectors[input_subword_indices], axis=0) + if input_subword_indices: + l1 /= len(input_subword_indices) + + neu1e = zeros(l1.shape) + + if model.hs: + # work on the entire tree at once, to push as much work into numpy's C routines as possible (performance) + l2a = deepcopy(model.syn1[predict_word.point]) # 2d matrix, codelen x layer1_size + fa = expit(dot(l1, l2a.T)) # propagate hidden -> output + ga = (1 - predict_word.code - fa) * alpha # vector of error gradients multiplied by the learning rate + if learn_hidden: + model.syn1[predict_word.point] += outer(ga, l1) # learn hidden -> output + neu1e += dot(ga, l2a) # save error + + if model.negative: + # use this word (label = 1) + `negative` other random words not from this sentence (label = 0) + word_indices = [predict_word.index] + while len(word_indices) < model.negative + 1: + w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1])) + if w != predict_word.index: + word_indices.append(w) + l2b = model.syn1neg[word_indices] # 2d matrix, k+1 x layer1_size + fb = expit(dot(l1, l2b.T)) # propagate hidden -> output + gb = (model.neg_labels - fb) * alpha # vector of error gradients multiplied by the learning rate + if learn_hidden: + model.syn1neg[word_indices] += outer(gb, l1) # learn hidden -> output + neu1e += dot(gb, l2b) # save error + + if learn_vectors: + # l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) + if input_subword_indices: + neu1e /= len(input_subword_indices) + for i in input_subword_indices: + model.wv.syn0_all[i] += neu1e * model.syn0_all_lockf[i] + + return neu1e + class FastText(Word2Vec): def __init__( @@ -236,9 +306,9 @@ def init_ngrams(self): def _do_train_job(self, sentences, alpha, inits): work, neu1 = inits tally = 0 - # if self.sg: - # tally += train_batch_sg(self, sentences, alpha, work) - # else: - tally += train_batch_cbow(self, sentences, alpha, work, neu1) + if self.sg: + tally += train_batch_sg(self, sentences, alpha, work) + else: + tally += train_batch_cbow(self, sentences, alpha, work, neu1) return tally, self._raw_word_count(sentences) From 82fda3c139ef6d1a95e3f51f4fc234822842919a Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Wed, 16 Aug 2017 17:24:19 +0530 Subject: [PATCH 06/32] updated unit tests for fasttext --- gensim/test/test_fasttext.py | 74 ++++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 8 deletions(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index ab79776e59..172a91bb01 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -14,6 +14,7 @@ module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) +logger = logging.getLogger(__name__) class LeeCorpus(object): def __iter__(self): @@ -35,9 +36,17 @@ def 
__iter__(self): ['graph', 'minors', 'survey'] ] +def testfile(): + # temporary data will be stored to this file + return os.path.join(tempfile.gettempdir(), 'gensim_fasttext.tst') + class TestFastTextModel(unittest.TestCase): + def setUp(self): + ft_home = os.environ.get('FT_HOME', None) + self.ft_exec_path = os.path.join(ft_home, 'fasttext') if ft_home else None + def models_equal(self, model, model2): self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0)) @@ -53,7 +62,7 @@ def testTraining(self): model = FT_gensim(size=2, min_count=1, hs=1, negative=0) model.build_vocab(sentences) - self.assertTrue(model.wv.syn0_all.shape == (len(model.wv.ngrams), 2)) + self.assertTrue(model.wv.syn0_all.shape == (len(model.wv.vocab) + len(model.wv.ngrams), 2)) self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) @@ -69,7 +78,7 @@ def testTraining(self): model2 = FT_gensim(sentences, size=2, min_count=1, hs=1, negative=0) self.models_equal(model, model2) - def test_against_fasttext_wrapper(self, model_gensim, model_wrapper): + def test_against_fasttext_wrapper(model_gensim, model_wrapper): sims_gensim = model_gensim.most_similar('night', topn=10) sims_gensim_words = (list(map(lambda x:x[0], sims_gensim))) @@ -79,9 +88,13 @@ def test_against_fasttext_wrapper(self, model_gensim, model_wrapper): self.assertEqual(sims_gensim, sims_wrapper) def test_cbow_hs(self): - model_wrapper = FT_wrapper.train(ft_path='/home/chinmaya/GSOC/Gensim/fastText/fasttext', - corpus_file=datapath('lee_background.cor'), output_file='/home/chinmaya/GSOC/Gensim/fasttext_out1', model='cbow', size=50, - alpha=0.05, window=2, min_count=5, word_ngrams=1, loss='hs', sample=1e-3, negative=0, iter=3, min_n=3, max_n=6, sorted_vocab=1, threads=1) + if self.ft_exec_path is None: + logger.info("FT_HOME env variable not set, skipping test") + return + + model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'), + output_file=testfile(), model='cbow', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, + loss='hs', sample=1e-3, negative=0, iter=3, min_n=3, max_n=6, sorted_vocab=1, threads=1) model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=5, iter=3, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, @@ -96,9 +109,13 @@ def test_cbow_hs(self): self.test_against_fasttext_wrapper(model_gensim, model_wrapper) def test_cbow_neg(self): - model_wrapper = FT_wrapper.train(ft_path='/home/chinmaya/GSOC/Gensim/fastText/fasttext', - corpus_file=datapath('lee_background.cor'), output_file='/home/chinmaya/GSOC/Gensim/fasttext_out1', model='cbow', size=50, - alpha=0.05, window=2, min_count=5, word_ngrams=1, loss='ns', sample=1e-3, negative=15, iter=7, min_n=3, max_n=6, sorted_vocab=1, threads=1) + if self.ft_exec_path is None: + logger.info("FT_HOME env variable not set, skipping test") + return + + model_wrapper = FT_wrapper.train(ft_path=self.ft_exec_path, corpus_file=datapath('lee_background.cor'), + output_file=testfile(), model='cbow', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, loss='ns', + sample=1e-3, negative=15, iter=7, min_n=3, max_n=6, sorted_vocab=1, threads=1) model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=15, min_count=1, iter=7, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, @@ -112,6 +129,47 @@ def 
test_cbow_neg(self): self.test_against_fasttext_wrapper(model_gensim, model_wrapper) + def test_sg_hs(self): + if self.ft_exec_path is None: + logger.info("FT_HOME env variable not set, skipping test") + return + + model_wrapper = FT_wrapper.train(ft_path=self.ft_exec_path, corpus_file=datapath('lee_background.cor'), + output_file=testfile(), model='skipgram', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, + loss='hs', sample=1e-3, negative=0, iter=3, min_n=3, max_n=6, sorted_vocab=1, threads=1) + + model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, + min_count=5, iter=3, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + sorted_vocab=1, workers=12, min_alpha=0.0) + + lee_data = LineSentence(datapath('lee_background.cor')) + model_gensim.build_vocab(lee_data) + orig0 = np.copy(model_gensim.wv.syn0[0]) + model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training + + self.test_against_fasttext_wrapper(model_gensim, model_wrapper) + + def test_sg_neg(self): + if self.ft_exec_path is None: + logger.info("FT_HOME env variable not set, skipping test") + return + + model_wrapper = FT_wrapper.train(ft_path=self.ft_exec_path, corpus_file=datapath('lee_background.cor'), + output_file=testfile(), model='skipgram', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, + loss='ns', sample=1e-3, negative=15, iter=1, min_n=3, max_n=6, sorted_vocab=1, threads=1) + + model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=0, + min_count=5, iter=1, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + sorted_vocab=1, workers=1, min_alpha=0.0) + + lee_data = LineSentence(datapath('lee_background.cor')) + model_gensim.build_vocab(lee_data) + orig0 = np.copy(model_gensim.wv.syn0[0]) + model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training + + self.test_against_fasttext_wrapper(model_gensim, model_wrapper) if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From cd59034485ff87933a961b9dbc980b3af1e620ea Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Wed, 16 Aug 2017 19:08:51 +0530 Subject: [PATCH 07/32] seeded 'np.random' with 'self.seed' --- gensim/models/fasttext.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 01045d2a51..b6f12bbe4e 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -276,6 +276,7 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_ self.init_ngrams() def reset_ngram_weights(self): + np.random.seed(self.seed) for index in range(len(self.wv.vocab) + len(self.wv.ngrams)): self.wv.syn0_all[index] = np.random.uniform(-1.0/self.vector_size, 1.0/self.vector_size, self.vector_size) From 353f7a8066957535be20ebf7b6e8ec1a3df17d89 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 17 Aug 2017 16:51:22 +0530 Subject: [PATCH 08/32] added test for persistence --- gensim/test/test_fasttext.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 172a91bb01..0509681877 100644 --- a/gensim/test/test_fasttext.py +++ 
b/gensim/test/test_fasttext.py @@ -3,6 +3,7 @@ import logging import unittest +import tempfile import os import numpy as np @@ -47,7 +48,7 @@ def setUp(self): ft_home = os.environ.get('FT_HOME', None) self.ft_exec_path = os.path.join(ft_home, 'fasttext') if ft_home else None - def models_equal(self, model, model2): + def modelsEqual(self, model, model2): self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0)) self.assertTrue(np.allclose(model.wv.syn0_all, model2.wv.syn0_all)) @@ -58,12 +59,14 @@ def models_equal(self, model, model2): most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0] self.assertTrue(np.allclose(model[most_common_word], model2[most_common_word])) + def modelSanity(self, model): + self.assertTrue(model.syn1.shape == (len(model.wv.vocab), model.vector_size)) + self.assertTrue(model.wv.syn0_all.shape == (len(model.wv.vocab) + len(model.wv.ngrams), model.vector_size)) + def testTraining(self): model = FT_gensim(size=2, min_count=1, hs=1, negative=0) model.build_vocab(sentences) - - self.assertTrue(model.wv.syn0_all.shape == (len(model.wv.vocab) + len(model.wv.ngrams), 2)) - self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) + self.modelSanity(model) model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) sims = model.most_similar('graph', topn=10) @@ -76,9 +79,9 @@ def testTraining(self): # build vocab and train in one step; must be the same as above model2 = FT_gensim(sentences, size=2, min_count=1, hs=1, negative=0) - self.models_equal(model, model2) + self.modelsEqual(model, model2) - def test_against_fasttext_wrapper(model_gensim, model_wrapper): + def test_against_fasttext_wrapper(self, model_gensim, model_wrapper): sims_gensim = model_gensim.most_similar('night', topn=10) sims_gensim_words = (list(map(lambda x:x[0], sims_gensim))) @@ -171,6 +174,20 @@ def test_sg_neg(self): self.test_against_fasttext_wrapper(model_gensim, model_wrapper) + def testModelPersistence(self): + model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=0, + min_count=5, iter=1, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + sorted_vocab=1, workers=1, min_alpha=0.0) + + lee_data = LineSentence(datapath('lee_background.cor')) + model_gensim.build_vocab(lee_data) + model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + + model_gensim.save(testfile()) + model_gensim_load = FT_gensim.load(testfile()) + most_similar_words = model_gensim_load.most_similar('night', topn=10) + self.assertTrue(len(most_similar_words) == 10) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() From 569a026044bdac061fb12f156d24216cd7dc5de9 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 17 Aug 2017 16:54:04 +0530 Subject: [PATCH 09/32] updated seeding numpy obj --- gensim/models/fasttext.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index b6f12bbe4e..b812d130e2 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -276,9 +276,10 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_ self.init_ngrams() def reset_ngram_weights(self): - np.random.seed(self.seed) + rand_obj = np.random + rand_obj.seed(self.seed) for index in range(len(self.wv.vocab) + len(self.wv.ngrams)): - 
self.wv.syn0_all[index] = np.random.uniform(-1.0/self.vector_size, 1.0/self.vector_size, self.vector_size) + self.wv.syn0_all[index] = rand_obj.uniform(-1.0/self.vector_size, 1.0/self.vector_size, self.vector_size) def init_ngrams(self): self.wv.ngrams = {} From c228b8da7142999fd675a18cf5710ec5bcd372ab Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 24 Aug 2017 03:24:55 +0530 Subject: [PATCH 10/32] updated (unclean) fasttext code for review --- gensim/models/fasttext.py | 45 ++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index b812d130e2..0981e97f16 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -23,14 +23,14 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None): result = 0 for sentence in sentences: - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2**32] + word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab ]#and + # model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) # `b` in the original word2vec code - start = max(0, pos - model.window + reduced_window) - # start = max(0, pos - model.window) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) - # window_pos = enumerate(word_vocabs[start:(pos + model.window + 1)], start) + # reduced_window = model.random.randint(model.window) # `b` in the original word2vec code + # start = max(0, pos - model.window + reduced_window) + start = max(0, pos - model.window) + # window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) + window_pos = enumerate(word_vocabs[start:(pos + model.window + 1)], start) word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] word2_subwords = [] @@ -88,21 +88,28 @@ def train_cbow_pair(model, word, input_subword_indices, l1, alpha, learn_vectors def train_batch_sg(model, sentences, alpha, work=None): result = 0 for sentence in sentences: - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and - model.wv.vocab[w].sample_int > model.random.rand() * 2**32] + word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab] #and + # model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) # `b` in the original word2vec code + # reduced_window = model.random.randint(model.window) # `b` in the original word2vec code # now go over all words from the (reduced) window, predicting each one in turn - start = max(0, pos - model.window + reduced_window) - for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): + # start = max(0, pos - model.window + reduced_window) + start = max(0, pos - model.window) + # for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): + + subwords_indices = [word.index] + word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]] + for subword in word2_subwords: + subwords_indices.append(model.wv.ngrams[subword]) + + for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1)], start): if pos2 != pos: # don't train on the `word` itself - subwords_indices = [word2.index] - 
word2_subwords = model.wv.ngrams_word[model.wv.index2word[word2.index]] + # subwords_indices = [word2.index] + # word2_subwords = model.wv.ngrams_word[model.wv.index2word[word2.index]] - for subword in word2_subwords: - subwords_indices.append(model.wv.ngrams[subword]) - train_sg_pair(model, model.wv.index2word[word.index], subwords_indices, alpha) + # train_sg_pair(model, model.wv.index2word[word.index], subwords_indices, alpha) + train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha) result += len(word_vocabs) return result @@ -300,8 +307,8 @@ def init_ngrams(self): for i, ngram in enumerate(all_ngrams): ngram_hash = Ft_Wrapper.ft_hash(ngram) ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket) - self.wv.ngrams[ngram] = i - + self.wv.ngrams[ngram] = i + len(self.wv.vocab) + self.wv.syn0_all = self.wv.syn0_all.take(ngram_indices, axis=0) self.reset_ngram_weights() From 29c627fd058a24d801b6d46633a5452b3c9151d2 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 24 Aug 2017 13:13:07 +0530 Subject: [PATCH 11/32] updated fasttext tutorial notebook --- docs/notebooks/FastText_Tutorial.ipynb | 527 ++++++++++++++++++------- 1 file changed, 379 insertions(+), 148 deletions(-) diff --git a/docs/notebooks/FastText_Tutorial.ipynb b/docs/notebooks/FastText_Tutorial.ipynb index 7b98dffc97..a55f62d49e 100644 --- a/docs/notebooks/FastText_Tutorial.ipynb +++ b/docs/notebooks/FastText_Tutorial.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This tutorial is about using the Gensim wrapper for the [FastText](https://github.com/facebookresearch/fastText) library for training FastText models, loading them and performing similarity operations and vector lookups analogous to Word2Vec." + "This tutorial is about using [fastText](https://github.com/facebookresearch/fastText) model in Gensim. There are two ways you can use fastText in Gensim - Gensim's native implementation of fastText and Gensim wrapper for fastText's original C++ code. Here, we'll learn to work with fastText library for training word-embedding models, saving & loading them and performing similarity operations & vector lookups analogous to Word2Vec." ] }, { @@ -19,11 +19,11 @@ "metadata": {}, "source": [ "## When to use FastText?\n", - "The main principle behind FastText is that the morphological structure of a word carries important information about the meaning of the word, which is not taken into account by traditional word embeddings, which train a unique word embedding for every individual word. This is especially significant for morphologically rich languages (German, Turkish) in which a single word can have a large number of morphological forms, each of which might occur rarely, thus making it hard to train good word embeddings. \n", - "FastText attempts to solve this by treating each word as the aggregation of its subwords. For the sake of simplicity and language-independence, subwords are taken to the character ngrams of the word. The vector for a word is simply taken to be the sum of all vectors of its component char-ngrams. \n", - "According to a detailed comparison of Word2Vec and FastText in [this notebook](Word2Vec_FastText_Comparison.ipynb), FastText does significantly better on syntactic tasks as compared to the original Word2Vec, especially when the size of the training corpus is small. Word2Vec slightly outperforms FastText on semantic tasks though. The differences grow smaller as the size of training corpus increases. 
\n", - "Training time for FastText is significantly higher than the Gensim version of Word2Vec (`15min 42s` vs `6min 42s` on text8, 17 mil tokens, 5 epochs, and a vector size of 100). \n", - "FastText can be used to obtain vectors for out-of-vocabulary (oov) words, by summing up vectors for its component char-ngrams, provided at least one of the char-ngrams was present in the training data." + "The main principle behind fastText is that the morphological structure of a word carries important information about the meaning of the word, which is not taken into account by traditional word embeddings, which train a unique word embedding for every individual word. This is especially significant for morphologically rich languages (German, Turkish) in which a single word can have a large number of morphological forms, each of which might occur rarely, thus making it hard to train good word embeddings. \n", + "fastText attempts to solve this by treating each word as the aggregation of its subwords. For the sake of simplicity and language-independence, subwords are taken to be the character ngrams of the word. The vector for a word is simply taken to be the sum of all vectors of its component char-ngrams. \n", + "According to a detailed comparison of Word2Vec and FastText in [this notebook](Word2Vec_FastText_Comparison.ipynb), fastText does significantly better on syntactic tasks as compared to the original Word2Vec, especially when the size of the training corpus is small. Word2Vec slightly outperforms FastText on semantic tasks though. The differences grow smaller as the size of training corpus increases.\n", + "Training time for fastText is significantly higher than the Gensim version of Word2Vec (`15min 42s` vs `6min 42s` on text8, 17 mil tokens, 5 epochs, and a vector size of 100). \n", + "fastText can be used to obtain vectors for out-of-vocabulary (OOV) words, by summing up vectors for its component char-ngrams, provided at least one of the char-ngrams was present in the training data." ] }, { @@ -37,9 +37,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For the following examples, we'll use the Lee Corpus (which you already have if you've installed gensim)\n", + "For the following examples, we'll use the Lee Corpus (which you already have if you've installed gensim) for training our model.\n", "\n", - "You need to have FastText setup locally to be able to train models. See [installation instructions for FastText](https://github.com/facebookresearch/fastText/#requirements) if you don't have FastText installed." + "For using the wrapper for fastText, you need to have fastText setup locally to be able to train models. See [installation instructions for fastText](https://github.com/facebookresearch/fastText/#requirements) if you don't have fastText installed already." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using Gensim's implementation of fastText" ] }, { @@ -47,51 +54,47 @@ "execution_count": 1, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "FastText(vocab=1762, size=100, alpha=0.025)\n" + "FastText(vocab=1763, size=100, alpha=0.025)\n" ] } ], "source": [ "import gensim, os\n", - "from gensim.models.wrappers.fasttext import FastText\n", - "\n", - "# Set FastText home to the path to the FastText executable\n", - "ft_home = '/home/jayant/Projects/fastText/fasttext'\n", + "from gensim.models.word2vec import LineSentence\n", + "from gensim.models.fasttext import FastText as FT_gensim\n", "\n", "# Set file names for train and test data\n", "data_dir = '{}'.format(os.sep).join([gensim.__path__[0], 'test', 'test_data']) + os.sep\n", "lee_train_file = data_dir + 'lee_background.cor'\n", + "lee_data = LineSentence(lee_train_file)\n", + "\n", + "model_gensim = FT_gensim(size=100)\n", "\n", - "model = FastText.train(ft_home, lee_train_file)\n", + "# build the vocabulary\n", + "model_gensim.build_vocab(lee_data)\n", "\n", - "print(model)" + "# train the model\n", + "model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)\n", + "\n", + "print(model_gensim)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Hyperparameters for training the model follow the same pattern as Word2Vec. FastText supports the folllowing parameters from the original word2vec - \n", - " - model: Training architecture. Allowed values: `cbow`, `skipgram` (Default `cbow`)\n", - " - size: Size of embeddings to be learnt (Default 100)\n", - " - alpha: Initial learning rate (Default 0.025)\n", - " - window: Context window size (Default 5)\n", - " - min_count: Ignore words with number of occurrences below this (Default 5)\n", - " - loss: Training objective. Allowed values: `ns`, `hs`, `softmax` (Default `ns`)\n", - " - sample: Threshold for downsampling higher-frequency words (Default 0.001)\n", - " - negative: Number of negative words to sample, for `ns` (Default 5)\n", - " - iter: Number of epochs (Default 5)\n", - " - sorted_vocab: Sort vocab by descending frequency (Default 1)\n", - " - threads: Number of threads to use (Default 12)\n", - " \n", - "In addition, FastText has two additional parameters - \n", - " - min_n: min length of char ngrams to be used (Default 3)\n", - " - max_n: max length of char ngrams to be used for (Default 6)\n", - "These control the lengths of character ngrams that each word is broken down into while training and looking up embeddings. If `max_n` is set to 0, or to be lesser than `min_n`, no character ngrams are used, and the model effectively reduces to Word2Vec." 
+ "### Using wrapper for fastText's C++ code" ] }, { @@ -103,20 +106,60 @@ "name": "stdout", "output_type": "stream", "text": [ - "FastText(vocab=816, size=50, alpha=0.025)\n" + "FastText(vocab=1763, size=100, alpha=0.025)\n" ] } ], "source": [ - "model = FastText.train(ft_home, lee_train_file, size=50, alpha=0.05, min_count=10)\n", - "print(model)" + "from gensim.models.wrappers.fasttext import FastText as FT_wrapper\n", + "\n", + "# Set FastText home to the path to the FastText executable\n", + "ft_home = '/home/chinmaya/GSOC/Gensim/fastText/fasttext'\n", + "\n", + "# train the model\n", + "model_wrapper = FT_wrapper.train(ft_home, lee_train_file)\n", + "\n", + "print(model_wrapper)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training hyperparameters" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Continuation of training with FastText models is not supported." + "Hyperparameters for training the model follow the same pattern as Word2Vec. FastText supports the folllowing parameters from the original word2vec - \n", + " - model: Training architecture. Allowed values: `cbow`, `skipgram` (Default `cbow`)\n", + " - size: Size of embeddings to be learnt (Default 100)\n", + " - alpha: Initial learning rate (Default 0.025)\n", + " - window: Context window size (Default 5)\n", + " - min_count: Ignore words with number of occurrences below this (Default 5)\n", + " - loss: Training objective. Allowed values: `ns`, `hs`, `softmax` (Default `ns`)\n", + " - sample: Threshold for downsampling higher-frequency words (Default 0.001)\n", + " - negative: Number of negative words to sample, for `ns` (Default 5)\n", + " - iter: Number of epochs (Default 5)\n", + " - sorted_vocab: Sort vocab by descending frequency (Default 1)\n", + " - threads: Number of threads to use (Default 12)\n", + " \n", + "In addition, FastText has three additional parameters - \n", + " - min_n: min length of char ngrams (Default 3)\n", + " - max_n: max length of char ngrams (Default 6)\n", + " - bucket: number of buckets used for hashing ngrams (Default 2000000)\n", + "Parameters `min_n` and `max_n` control the lengths of character ngrams that each word is broken down into while training and looking up embeddings. If `max_n` is set to 0, or to be lesser than `min_n`, no character ngrams are used, and the model effectively reduces to Word2Vec.\n", + "\n", + "To bound the memory requirements of the model being trained, a hashing function is used that maps ngrams to integers in 1 to K. For hashing these character sequences, the [Fowler-Noll-Vo hashing function](http://www.isthe.com/chongo/tech/comp/fnv) (FNV-1a variant) is employed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note:** As in the case of Word2Vec, you can continue to train your model while using Gensim's native implementation of fastText. However, continuation of training with fastText models while using the wrapper is not supported." 
] }, { @@ -142,13 +185,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "FastText(vocab=816, size=50, alpha=0.025)\n" + "FastText(vocab=1763, size=100, alpha=0.025)\n", + "FastText(vocab=1763, size=100, alpha=0.025)\n" ] } ], "source": [ - "model.save('saved_fasttext_model')\n", - "loaded_model = FastText.load('saved_fasttext_model')\n", + "# saving a model trained via Gensim's fastText implementation\n", + "model_gensim.save('saved_model_gensim')\n", + "loaded_model = FT_gensim.load('saved_model_gensim')\n", + "print(loaded_model)\n", + "\n", + "# saving a model trained via fastText wrapper\n", + "model_wrapper.save('saved_model_wrapper')\n", + "loaded_model = FT_wrapper.load('saved_model_wrapper')\n", "print(loaded_model)" ] }, @@ -164,6 +214,8 @@ "metadata": {}, "source": [ "## Word vector lookup\n", + "**Note:** Operations like word vector lookups and similarity queries can be performed in exactly the same manner for both the implementations of fastText so they have been demonstrated using only the fastText wrapper here.\n", + "\n", "FastText models support vector lookups for out-of-vocabulary words by summing up character ngrams belonging to the word." ] }, @@ -178,32 +230,48 @@ "text": [ "True\n", "False\n", - "[-0.47196999 -0.17528 0.19518 -0.31948 0.42835999 0.083281\n", - " -0.15183 0.43415001 0.41251001 -0.10186 -0.54948997 0.12667\n", - " 0.14816 -0.065804 -0.21105 -0.42304999 0.011494 0.53068\n", - " -0.57410997 -0.53930998 -0.33537999 0.16154 0.12377 -0.23537\n", - " -0.14629 -0.34777001 0.27304 0.20597 0.12581 0.36671999\n", - " 0.32075 0.27351999 -0.13311 -0.04975 -0.52293003 -0.2766\n", - " 0.11863 -0.009231 -0.66074997 0.018031 0.57145 0.35547\n", - " 0.21588001 0.14431 -0.31865999 0.32027 0.55005002 0.19374999\n", - " 0.36609 -0.54184002]\n", - "[-0.4256132 -0.11521876 0.20166218 -0.34812452 0.30932881 0.02802653\n", - " -0.18951961 0.4175721 0.41008326 -0.09026544 -0.50756483 0.07746826\n", - " 0.09458492 0.01440104 -0.17157355 -0.35189211 0.00103696 0.50923289\n", - " -0.49944138 -0.38334864 -0.34287725 0.18023167 0.18014225 -0.22820314\n", - " -0.08267317 -0.31241801 0.26023088 0.20673522 0.07008089 0.31678561\n", - " 0.31590793 0.16198126 -0.09287339 -0.1722331 -0.43232849 -0.26644917\n", - " 0.10019614 0.08444232 -0.57080398 0.07581607 0.50339428 0.28109486\n", - " 0.05507131 0.10023506 -0.17840675 0.18620458 0.42583067 0.00790601\n", - " 0.2036875 -0.4925791 ]\n" + "[ 0.60971916 0.66131264 0.09225323 0.28898761 0.34161603 0.06163925\n", + " -0.10147806 -0.18834428 -0.26355353 0.46417126 0.20428349 0.08414238\n", + " -0.61960417 -0.2977576 -0.22102182 0.14144184 0.13698931 -0.24608244\n", + " -0.58096874 0.3039414 0.18766184 0.38110724 0.11518024 -0.75747257\n", + " -0.275776 -0.42740449 -0.00725944 -0.24556711 0.41061676 0.05050014\n", + " -0.71367824 0.05223881 -0.07810796 0.22933683 0.43850809 0.06360656\n", + " 0.43815458 0.11096461 0.29619065 0.38061273 0.26262566 -0.07368335\n", + " 0.33198604 -0.1431711 -0.04876067 -0.35243919 0.18561274 -0.70321769\n", + " -0.16492438 -0.28362423 0.08294757 0.49758917 -0.17844993 -0.02241638\n", + " 0.18489315 0.01197879 -0.22931916 0.45774016 -0.40240806 -0.16401663\n", + " -0.07500558 0.06775728 0.14273891 0.39902335 0.1906638 0.14533612\n", + " -0.70275193 -0.64343351 -0.18003808 0.45082757 -0.42847934 0.23554228\n", + " 0.03722449 -0.0726353 -0.20106563 -0.85182953 0.16529776 0.2167791\n", + " 0.01655668 -0.45087481 0.44368106 0.94318634 0.3191022 -0.78148538\n", + " 0.06931634 -0.02454508 -0.07709292 
0.00889531 0.41768485 -0.4333123\n", + " 0.57354093 0.40387386 0.50435936 0.15307237 0.41140166 0.09306428\n", + " -0.6406759 -0.00130932 0.01818158 0.05408832]\n", + "[ 0.57120456 0.61710706 0.08425266 0.28013577 0.30789921 0.08454974\n", + " -0.05984595 -0.14644302 -0.23369177 0.42689164 0.18699257 0.09090185\n", + " -0.57885733 -0.28756606 -0.20198511 0.12675938 0.14102744 -0.22880791\n", + " -0.52516965 0.27686313 0.19865591 0.33872125 0.11230565 -0.74198454\n", + " -0.28486362 -0.40490177 -0.00606945 -0.18761727 0.40040097 0.06941447\n", + " -0.70890718 0.03646363 -0.0598574 0.19175974 0.4242314 0.05878129\n", + " 0.41432344 0.10394377 0.2668701 0.38148809 0.2761937 -0.06951485\n", + " 0.34113405 -0.12189032 -0.05861677 -0.33032765 0.16585448 -0.65862278\n", + " -0.18381383 -0.28438907 0.08867586 0.46635329 -0.18801565 -0.01610042\n", + " 0.1940661 0.03761584 -0.21442287 0.41826423 -0.38097134 -0.15111094\n", + " -0.08636253 0.07374192 0.12731727 0.40068088 0.18576843 0.13244282\n", + " -0.64814759 -0.62510144 -0.17045424 0.44949777 -0.39068545 0.19102012\n", + " 0.03177847 -0.06673145 -0.17997442 -0.81052922 0.15459165 0.21476634\n", + " -0.01961387 -0.43806009 0.40781115 0.88663652 0.29360816 -0.74157697\n", + " 0.04686275 -0.0396045 -0.06810026 0.00260469 0.40505417 -0.39977569\n", + " 0.5443192 0.38472273 0.48665705 0.12033045 0.40395209 0.10123577\n", + " -0.6243847 -0.02460667 0.00828873 0.04089492]\n" ] } ], "source": [ - "print('night' in model.wv.vocab)\n", - "print('nights' in model.wv.vocab)\n", - "print(model['night'])\n", - "print(model['nights'])" + "print('night' in model_wrapper.wv.vocab)\n", + "print('nights' in model_wrapper.wv.vocab)\n", + "print(model_wrapper['night'])\n", + "print(model_wrapper['nights'])" ] }, { @@ -225,17 +293,17 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Raises a KeyError since none of the character ngrams of the word `axe` are present in the training data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'axe'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/home/jayant/Projects/gensim/gensim/models/word2vec.pyc\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, words)\u001b[0m\n\u001b[1;32m 1304\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1305\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1306\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1307\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1308\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/home/jayant/Projects/gensim/gensim/models/keyedvectors.pyc\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, words)\u001b[0m\n\u001b[1;32m 363\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstring_types\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;31m# allow calls like trained_model['office'], as a shorthand for trained_model[['office']]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 365\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mword_vec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 366\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mvstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mword_vec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mword\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mword\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mwords\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/home/jayant/Projects/gensim/gensim/models/wrappers/fasttext.pyc\u001b[0m in \u001b[0;36mword_vec\u001b[0;34m(self, word, use_norm)\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mword_vec\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mngrams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# No ngrams of the word are present in self.ngrams\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 91\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'all ngrams for word %s absent from model'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mword\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 92\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0minit_sims\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Raises a KeyError since none of the character ngrams of the word `axe` are present in the training data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mmodel_wrapper\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'axe'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/home/chinmaya/GSOC/Gensim/gensim/gensim/models/word2vec.pyc\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, words)\u001b[0m\n\u001b[1;32m 1280\u001b[0m \u001b[0mRefer\u001b[0m \u001b[0mto\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mdocumentation\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0;34m`\u001b[0m\u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mKeyedVectors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getitem__\u001b[0m\u001b[0;34m`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1281\u001b[0m \"\"\"\n\u001b[0;32m-> 1282\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1283\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1284\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__contains__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mword\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/home/chinmaya/GSOC/Gensim/gensim/gensim/models/keyedvectors.pyc\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, words)\u001b[0m\n\u001b[1;32m 587\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstring_types\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0;31m# allow calls like trained_model['office'], as a shorthand for trained_model[['office']]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 589\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mword_vec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 590\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mvstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mword_vec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mword\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mword\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mwords\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/home/chinmaya/GSOC/Gensim/gensim/gensim/models/wrappers/fasttext.pyc\u001b[0m in \u001b[0;36mword_vec\u001b[0;34m(self, word, use_norm)\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mword_vec\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mngrams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# No ngrams of the word are present in self.ngrams\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 94\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'all ngrams for word %s absent from model'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mword\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 95\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0minit_sims\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyError\u001b[0m: 'all ngrams for word axe absent from model'" ] } ], "source": [ "# Raises a KeyError since none of the character ngrams of the word `axe` are present in the training data\n", - "model['axe']" + "model_wrapper['axe']" ] }, { @@ -261,9 +329,9 @@ ], "source": [ "# Tests if word present in vocab\n", - "print(\"word\" in model.wv.vocab)\n", + "print(\"word\" in model_wrapper.wv.vocab)\n", "# Tests if vector present for word\n", - "print(\"word\" in model)" + "print(\"word\" in 
model_wrapper)" ] }, { @@ -277,7 +345,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Similarity operations work the same way as word2vec. Out-of-vocabulary words can also be used, provided they have atleast one character ngram present in the training data." + "Similarity operations work the same way as word2vec. **Out-of-vocabulary words can also be used, provided they have atleast one character ngram present in the training data.**" ] }, { @@ -296,27 +364,32 @@ { "data": { "text/plain": [ - "0.97944545147919504" + "0.9988949391617723" ] }, "execution_count": 7, - "output_type": "execute_result", - "metadata": {} + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "print(\"nights\" in model.wv.vocab)\n", - "print(\"night\" in model.wv.vocab)\n", - "model.similarity(\"night\", \"nights\")" + "print(\"nights\" in model_wrapper.wv.vocab)\n", + "print(\"night\" in model_wrapper.wv.vocab)\n", + "model_wrapper.similarity(\"night\", \"nights\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Syntactically similar words generally have high similarity in FastText models, since a large number of the component char-ngrams will be the same. As a result, FastText generally does better at syntactic tasks than Word2Vec. A detailed comparison is provided [here](Word2Vec_FastText_Comparison.ipynb).\n", - "\n", - "Other similarity operations -" + "Syntactically similar words generally have high similarity in fastText models, since a large number of the component char-ngrams will be the same. As a result, fastText generally does better at syntactic tasks than Word2Vec. A detailed comparison is provided [here](Word2Vec_FastText_Comparison.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Other similarity operations" ] }, { @@ -327,26 +400,26 @@ { "data": { "text/plain": [ - "[(u'12', 0.9912641048431396),\n", - " (u'across', 0.990070641040802),\n", - " (u'few', 0.9840448498725891),\n", - " (u'deaths', 0.9840392470359802),\n", - " (u'parts', 0.9835165739059448),\n", - " (u'One', 0.9833074808120728),\n", - " (u'running', 0.9832631349563599),\n", - " (u'2', 0.982011079788208),\n", - " (u'victory', 0.9806963801383972),\n", - " (u'each', 0.9789758920669556)]" + "[(u'bowler', 0.9999216198921204),\n", + " (u'flights', 0.999881386756897),\n", + " (u'dozens', 0.9998700618743896),\n", + " (u'each', 0.9998670220375061),\n", + " (u'weather', 0.9998487234115601),\n", + " (u'technology', 0.999805748462677),\n", + " (u'acting', 0.9998006820678711),\n", + " (u'dollars', 0.999785840511322),\n", + " (u'place,', 0.9997731447219849),\n", + " (u'custody', 0.9997485280036926)]" ] }, "execution_count": 8, - "output_type": "execute_result", - "metadata": {} + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "# The example training corpus is a toy corpus, results are not expected to be good, for proof-of-concept only\n", - "model.most_similar(\"nights\")" + "model_wrapper.most_similar(\"nights\")" ] }, { @@ -357,16 +430,16 @@ { "data": { "text/plain": [ - "0.97543218704680112" + "0.99936318443348537" ] }, "execution_count": 9, - "output_type": "execute_result", - "metadata": {} + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])" + "model_wrapper.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])" ] }, { @@ -377,16 +450,16 @@ { "data": { "text/plain": [ - "'lunch'" + "'dinner'" ] }, "execution_count": 10, - "output_type": 
"execute_result", - "metadata": {} + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "model.doesnt_match(\"breakfast cereal dinner lunch\".split())" + "model_wrapper.doesnt_match(\"breakfast cereal dinner lunch\".split())" ] }, { @@ -397,32 +470,46 @@ { "data": { "text/plain": [ - "[(u'against', 0.94775390625),\n", - " (u'after', 0.923099935054779),\n", - " (u'West', 0.910752534866333),\n", - " (u'again', 0.903070867061615),\n", - " (u'arrest', 0.8878517150878906),\n", - " (u'suicide', 0.8750319480895996),\n", - " (u'After', 0.8682445287704468),\n", - " (u'innings', 0.859328031539917),\n", - " (u'Test', 0.8542338609695435),\n", - " (u'during', 0.852535605430603)]" + "[(u'September', 0.9997114539146423),\n", + " (u'Rafter', 0.9996863007545471),\n", + " (u'New', 0.999636709690094),\n", + " (u'after', 0.9996317625045776),\n", + " (u'day', 0.9996190071105957),\n", + " (u'After', 0.9996107816696167),\n", + " (u'against', 0.9996088743209839),\n", + " (u'Robert', 0.9996023178100586),\n", + " (u'attacks', 0.9995726346969604),\n", + " (u'States', 0.9995641112327576)]" ] }, "execution_count": 11, - "output_type": "execute_result", - "metadata": {} + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "model.most_similar(positive=['baghdad', 'england'], negative=['london'])" + "model_wrapper.most_similar(positive=['baghdad', 'england'], negative=['london'])" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "family: 0.0% (0/2)\n", + "gram3-comparative: 0.0% (0/12)\n", + "gram4-superlative: 0.0% (0/12)\n", + "gram5-present-participle: 0.0% (0/20)\n", + "gram6-nationality-adjective: 0.0% (0/20)\n", + "gram7-past-tense: 0.0% (0/20)\n", + "gram8-plural: 0.0% (0/12)\n", + "total: 0.0% (0/98)\n" + ] + }, { "data": { "text/plain": [ @@ -436,107 +523,242 @@ " 'section': u'family'},\n", " {'correct': [], 'incorrect': [], 'section': u'gram1-adjective-to-adverb'},\n", " {'correct': [], 'incorrect': [], 'section': u'gram2-opposite'},\n", - " {'correct': [], 'incorrect': [], 'section': u'gram3-comparative'},\n", - " {'correct': [(u'BIG', u'BIGGEST', u'GOOD', u'BEST')],\n", - " 'incorrect': [(u'GOOD', u'BEST', u'BIG', u'BIGGEST')],\n", + " {'correct': [],\n", + " 'incorrect': [(u'GOOD', u'BETTER', u'GREAT', u'GREATER'),\n", + " (u'GOOD', u'BETTER', u'LONG', u'LONGER'),\n", + " (u'GOOD', u'BETTER', u'LOW', u'LOWER'),\n", + " (u'GREAT', u'GREATER', u'LONG', u'LONGER'),\n", + " (u'GREAT', u'GREATER', u'LOW', u'LOWER'),\n", + " (u'GREAT', u'GREATER', u'GOOD', u'BETTER'),\n", + " (u'LONG', u'LONGER', u'LOW', u'LOWER'),\n", + " (u'LONG', u'LONGER', u'GOOD', u'BETTER'),\n", + " (u'LONG', u'LONGER', u'GREAT', u'GREATER'),\n", + " (u'LOW', u'LOWER', u'GOOD', u'BETTER'),\n", + " (u'LOW', u'LOWER', u'GREAT', u'GREATER'),\n", + " (u'LOW', u'LOWER', u'LONG', u'LONGER')],\n", + " 'section': u'gram3-comparative'},\n", + " {'correct': [],\n", + " 'incorrect': [(u'BIG', u'BIGGEST', u'GOOD', u'BEST'),\n", + " (u'BIG', u'BIGGEST', u'GREAT', u'GREATEST'),\n", + " (u'BIG', u'BIGGEST', u'LARGE', u'LARGEST'),\n", + " (u'GOOD', u'BEST', u'GREAT', u'GREATEST'),\n", + " (u'GOOD', u'BEST', u'LARGE', u'LARGEST'),\n", + " (u'GOOD', u'BEST', u'BIG', u'BIGGEST'),\n", + " (u'GREAT', u'GREATEST', u'LARGE', u'LARGEST'),\n", + " (u'GREAT', u'GREATEST', u'BIG', u'BIGGEST'),\n", + " (u'GREAT', u'GREATEST', u'GOOD', u'BEST'),\n", + " (u'LARGE', u'LARGEST', u'BIG', u'BIGGEST'),\n", + " 
(u'LARGE', u'LARGEST', u'GOOD', u'BEST'),\n", + " (u'LARGE', u'LARGEST', u'GREAT', u'GREATEST')],\n", " 'section': u'gram4-superlative'},\n", - " {'correct': [(u'GO', u'GOING', u'SAY', u'SAYING'),\n", - " (u'LOOK', u'LOOKING', u'SAY', u'SAYING'),\n", - " (u'RUN', u'RUNNING', u'SAY', u'SAYING'),\n", - " (u'SAY', u'SAYING', u'LOOK', u'LOOKING')],\n", + " {'correct': [],\n", " 'incorrect': [(u'GO', u'GOING', u'LOOK', u'LOOKING'),\n", + " (u'GO', u'GOING', u'PLAY', u'PLAYING'),\n", " (u'GO', u'GOING', u'RUN', u'RUNNING'),\n", + " (u'GO', u'GOING', u'SAY', u'SAYING'),\n", + " (u'LOOK', u'LOOKING', u'PLAY', u'PLAYING'),\n", " (u'LOOK', u'LOOKING', u'RUN', u'RUNNING'),\n", + " (u'LOOK', u'LOOKING', u'SAY', u'SAYING'),\n", " (u'LOOK', u'LOOKING', u'GO', u'GOING'),\n", + " (u'PLAY', u'PLAYING', u'RUN', u'RUNNING'),\n", + " (u'PLAY', u'PLAYING', u'SAY', u'SAYING'),\n", + " (u'PLAY', u'PLAYING', u'GO', u'GOING'),\n", + " (u'PLAY', u'PLAYING', u'LOOK', u'LOOKING'),\n", + " (u'RUN', u'RUNNING', u'SAY', u'SAYING'),\n", " (u'RUN', u'RUNNING', u'GO', u'GOING'),\n", " (u'RUN', u'RUNNING', u'LOOK', u'LOOKING'),\n", + " (u'RUN', u'RUNNING', u'PLAY', u'PLAYING'),\n", " (u'SAY', u'SAYING', u'GO', u'GOING'),\n", + " (u'SAY', u'SAYING', u'LOOK', u'LOOKING'),\n", + " (u'SAY', u'SAYING', u'PLAY', u'PLAYING'),\n", " (u'SAY', u'SAYING', u'RUN', u'RUNNING')],\n", " 'section': u'gram5-present-participle'},\n", - " {'correct': [(u'AUSTRALIA', u'AUSTRALIAN', u'ISRAEL', u'ISRAELI'),\n", + " {'correct': [],\n", + " 'incorrect': [(u'AUSTRALIA', u'AUSTRALIAN', u'FRANCE', u'FRENCH'),\n", + " (u'AUSTRALIA', u'AUSTRALIAN', u'INDIA', u'INDIAN'),\n", + " (u'AUSTRALIA', u'AUSTRALIAN', u'ISRAEL', u'ISRAELI'),\n", + " (u'AUSTRALIA', u'AUSTRALIAN', u'SWITZERLAND', u'SWISS'),\n", + " (u'FRANCE', u'FRENCH', u'INDIA', u'INDIAN'),\n", + " (u'FRANCE', u'FRENCH', u'ISRAEL', u'ISRAELI'),\n", + " (u'FRANCE', u'FRENCH', u'SWITZERLAND', u'SWISS'),\n", + " (u'FRANCE', u'FRENCH', u'AUSTRALIA', u'AUSTRALIAN'),\n", " (u'INDIA', u'INDIAN', u'ISRAEL', u'ISRAELI'),\n", - " (u'INDIA', u'INDIAN', u'AUSTRALIA', u'AUSTRALIAN')],\n", - " 'incorrect': [(u'AUSTRALIA', u'AUSTRALIAN', u'INDIA', u'INDIAN'),\n", + " (u'INDIA', u'INDIAN', u'SWITZERLAND', u'SWISS'),\n", + " (u'INDIA', u'INDIAN', u'AUSTRALIA', u'AUSTRALIAN'),\n", + " (u'INDIA', u'INDIAN', u'FRANCE', u'FRENCH'),\n", + " (u'ISRAEL', u'ISRAELI', u'SWITZERLAND', u'SWISS'),\n", " (u'ISRAEL', u'ISRAELI', u'AUSTRALIA', u'AUSTRALIAN'),\n", - " (u'ISRAEL', u'ISRAELI', u'INDIA', u'INDIAN')],\n", + " (u'ISRAEL', u'ISRAELI', u'FRANCE', u'FRENCH'),\n", + " (u'ISRAEL', u'ISRAELI', u'INDIA', u'INDIAN'),\n", + " (u'SWITZERLAND', u'SWISS', u'AUSTRALIA', u'AUSTRALIAN'),\n", + " (u'SWITZERLAND', u'SWISS', u'FRANCE', u'FRENCH'),\n", + " (u'SWITZERLAND', u'SWISS', u'INDIA', u'INDIAN'),\n", + " (u'SWITZERLAND', u'SWISS', u'ISRAEL', u'ISRAELI')],\n", " 'section': u'gram6-nationality-adjective'},\n", " {'correct': [],\n", - " 'incorrect': [(u'GOING', u'WENT', u'SAYING', u'SAID'),\n", + " 'incorrect': [(u'GOING', u'WENT', u'PAYING', u'PAID'),\n", + " (u'GOING', u'WENT', u'PLAYING', u'PLAYED'),\n", + " (u'GOING', u'WENT', u'SAYING', u'SAID'),\n", " (u'GOING', u'WENT', u'TAKING', u'TOOK'),\n", + " (u'PAYING', u'PAID', u'PLAYING', u'PLAYED'),\n", + " (u'PAYING', u'PAID', u'SAYING', u'SAID'),\n", + " (u'PAYING', u'PAID', u'TAKING', u'TOOK'),\n", + " (u'PAYING', u'PAID', u'GOING', u'WENT'),\n", + " (u'PLAYING', u'PLAYED', u'SAYING', u'SAID'),\n", + " (u'PLAYING', u'PLAYED', u'TAKING', u'TOOK'),\n", + " (u'PLAYING', u'PLAYED', 
u'GOING', u'WENT'),\n", + " (u'PLAYING', u'PLAYED', u'PAYING', u'PAID'),\n", " (u'SAYING', u'SAID', u'TAKING', u'TOOK'),\n", " (u'SAYING', u'SAID', u'GOING', u'WENT'),\n", + " (u'SAYING', u'SAID', u'PAYING', u'PAID'),\n", + " (u'SAYING', u'SAID', u'PLAYING', u'PLAYED'),\n", " (u'TAKING', u'TOOK', u'GOING', u'WENT'),\n", + " (u'TAKING', u'TOOK', u'PAYING', u'PAID'),\n", + " (u'TAKING', u'TOOK', u'PLAYING', u'PLAYED'),\n", " (u'TAKING', u'TOOK', u'SAYING', u'SAID')],\n", " 'section': u'gram7-past-tense'},\n", " {'correct': [],\n", - " 'incorrect': [(u'BUILDING', u'BUILDINGS', u'CHILD', u'CHILDREN'),\n", + " 'incorrect': [(u'BUILDING', u'BUILDINGS', u'CAR', u'CARS'),\n", + " (u'BUILDING', u'BUILDINGS', u'CHILD', u'CHILDREN'),\n", " (u'BUILDING', u'BUILDINGS', u'MAN', u'MEN'),\n", + " (u'CAR', u'CARS', u'CHILD', u'CHILDREN'),\n", + " (u'CAR', u'CARS', u'MAN', u'MEN'),\n", + " (u'CAR', u'CARS', u'BUILDING', u'BUILDINGS'),\n", " (u'CHILD', u'CHILDREN', u'MAN', u'MEN'),\n", " (u'CHILD', u'CHILDREN', u'BUILDING', u'BUILDINGS'),\n", + " (u'CHILD', u'CHILDREN', u'CAR', u'CARS'),\n", " (u'MAN', u'MEN', u'BUILDING', u'BUILDINGS'),\n", + " (u'MAN', u'MEN', u'CAR', u'CARS'),\n", " (u'MAN', u'MEN', u'CHILD', u'CHILDREN')],\n", " 'section': u'gram8-plural'},\n", " {'correct': [], 'incorrect': [], 'section': u'gram9-plural-verbs'},\n", - " {'correct': [(u'BIG', u'BIGGEST', u'GOOD', u'BEST'),\n", - " (u'GO', u'GOING', u'SAY', u'SAYING'),\n", - " (u'LOOK', u'LOOKING', u'SAY', u'SAYING'),\n", - " (u'RUN', u'RUNNING', u'SAY', u'SAYING'),\n", - " (u'SAY', u'SAYING', u'LOOK', u'LOOKING'),\n", - " (u'AUSTRALIA', u'AUSTRALIAN', u'ISRAEL', u'ISRAELI'),\n", - " (u'INDIA', u'INDIAN', u'ISRAEL', u'ISRAELI'),\n", - " (u'INDIA', u'INDIAN', u'AUSTRALIA', u'AUSTRALIAN')],\n", + " {'correct': [],\n", " 'incorrect': [(u'HE', u'SHE', u'HIS', u'HER'),\n", " (u'HIS', u'HER', u'HE', u'SHE'),\n", + " (u'GOOD', u'BETTER', u'GREAT', u'GREATER'),\n", + " (u'GOOD', u'BETTER', u'LONG', u'LONGER'),\n", + " (u'GOOD', u'BETTER', u'LOW', u'LOWER'),\n", + " (u'GREAT', u'GREATER', u'LONG', u'LONGER'),\n", + " (u'GREAT', u'GREATER', u'LOW', u'LOWER'),\n", + " (u'GREAT', u'GREATER', u'GOOD', u'BETTER'),\n", + " (u'LONG', u'LONGER', u'LOW', u'LOWER'),\n", + " (u'LONG', u'LONGER', u'GOOD', u'BETTER'),\n", + " (u'LONG', u'LONGER', u'GREAT', u'GREATER'),\n", + " (u'LOW', u'LOWER', u'GOOD', u'BETTER'),\n", + " (u'LOW', u'LOWER', u'GREAT', u'GREATER'),\n", + " (u'LOW', u'LOWER', u'LONG', u'LONGER'),\n", + " (u'BIG', u'BIGGEST', u'GOOD', u'BEST'),\n", + " (u'BIG', u'BIGGEST', u'GREAT', u'GREATEST'),\n", + " (u'BIG', u'BIGGEST', u'LARGE', u'LARGEST'),\n", + " (u'GOOD', u'BEST', u'GREAT', u'GREATEST'),\n", + " (u'GOOD', u'BEST', u'LARGE', u'LARGEST'),\n", " (u'GOOD', u'BEST', u'BIG', u'BIGGEST'),\n", + " (u'GREAT', u'GREATEST', u'LARGE', u'LARGEST'),\n", + " (u'GREAT', u'GREATEST', u'BIG', u'BIGGEST'),\n", + " (u'GREAT', u'GREATEST', u'GOOD', u'BEST'),\n", + " (u'LARGE', u'LARGEST', u'BIG', u'BIGGEST'),\n", + " (u'LARGE', u'LARGEST', u'GOOD', u'BEST'),\n", + " (u'LARGE', u'LARGEST', u'GREAT', u'GREATEST'),\n", " (u'GO', u'GOING', u'LOOK', u'LOOKING'),\n", + " (u'GO', u'GOING', u'PLAY', u'PLAYING'),\n", " (u'GO', u'GOING', u'RUN', u'RUNNING'),\n", + " (u'GO', u'GOING', u'SAY', u'SAYING'),\n", + " (u'LOOK', u'LOOKING', u'PLAY', u'PLAYING'),\n", " (u'LOOK', u'LOOKING', u'RUN', u'RUNNING'),\n", + " (u'LOOK', u'LOOKING', u'SAY', u'SAYING'),\n", " (u'LOOK', u'LOOKING', u'GO', u'GOING'),\n", + " (u'PLAY', u'PLAYING', u'RUN', u'RUNNING'),\n", + " (u'PLAY', 
u'PLAYING', u'SAY', u'SAYING'),\n", + " (u'PLAY', u'PLAYING', u'GO', u'GOING'),\n", + " (u'PLAY', u'PLAYING', u'LOOK', u'LOOKING'),\n", + " (u'RUN', u'RUNNING', u'SAY', u'SAYING'),\n", " (u'RUN', u'RUNNING', u'GO', u'GOING'),\n", " (u'RUN', u'RUNNING', u'LOOK', u'LOOKING'),\n", + " (u'RUN', u'RUNNING', u'PLAY', u'PLAYING'),\n", " (u'SAY', u'SAYING', u'GO', u'GOING'),\n", + " (u'SAY', u'SAYING', u'LOOK', u'LOOKING'),\n", + " (u'SAY', u'SAYING', u'PLAY', u'PLAYING'),\n", " (u'SAY', u'SAYING', u'RUN', u'RUNNING'),\n", + " (u'AUSTRALIA', u'AUSTRALIAN', u'FRANCE', u'FRENCH'),\n", " (u'AUSTRALIA', u'AUSTRALIAN', u'INDIA', u'INDIAN'),\n", + " (u'AUSTRALIA', u'AUSTRALIAN', u'ISRAEL', u'ISRAELI'),\n", + " (u'AUSTRALIA', u'AUSTRALIAN', u'SWITZERLAND', u'SWISS'),\n", + " (u'FRANCE', u'FRENCH', u'INDIA', u'INDIAN'),\n", + " (u'FRANCE', u'FRENCH', u'ISRAEL', u'ISRAELI'),\n", + " (u'FRANCE', u'FRENCH', u'SWITZERLAND', u'SWISS'),\n", + " (u'FRANCE', u'FRENCH', u'AUSTRALIA', u'AUSTRALIAN'),\n", + " (u'INDIA', u'INDIAN', u'ISRAEL', u'ISRAELI'),\n", + " (u'INDIA', u'INDIAN', u'SWITZERLAND', u'SWISS'),\n", + " (u'INDIA', u'INDIAN', u'AUSTRALIA', u'AUSTRALIAN'),\n", + " (u'INDIA', u'INDIAN', u'FRANCE', u'FRENCH'),\n", + " (u'ISRAEL', u'ISRAELI', u'SWITZERLAND', u'SWISS'),\n", " (u'ISRAEL', u'ISRAELI', u'AUSTRALIA', u'AUSTRALIAN'),\n", + " (u'ISRAEL', u'ISRAELI', u'FRANCE', u'FRENCH'),\n", " (u'ISRAEL', u'ISRAELI', u'INDIA', u'INDIAN'),\n", + " (u'SWITZERLAND', u'SWISS', u'AUSTRALIA', u'AUSTRALIAN'),\n", + " (u'SWITZERLAND', u'SWISS', u'FRANCE', u'FRENCH'),\n", + " (u'SWITZERLAND', u'SWISS', u'INDIA', u'INDIAN'),\n", + " (u'SWITZERLAND', u'SWISS', u'ISRAEL', u'ISRAELI'),\n", + " (u'GOING', u'WENT', u'PAYING', u'PAID'),\n", + " (u'GOING', u'WENT', u'PLAYING', u'PLAYED'),\n", " (u'GOING', u'WENT', u'SAYING', u'SAID'),\n", " (u'GOING', u'WENT', u'TAKING', u'TOOK'),\n", + " (u'PAYING', u'PAID', u'PLAYING', u'PLAYED'),\n", + " (u'PAYING', u'PAID', u'SAYING', u'SAID'),\n", + " (u'PAYING', u'PAID', u'TAKING', u'TOOK'),\n", + " (u'PAYING', u'PAID', u'GOING', u'WENT'),\n", + " (u'PLAYING', u'PLAYED', u'SAYING', u'SAID'),\n", + " (u'PLAYING', u'PLAYED', u'TAKING', u'TOOK'),\n", + " (u'PLAYING', u'PLAYED', u'GOING', u'WENT'),\n", + " (u'PLAYING', u'PLAYED', u'PAYING', u'PAID'),\n", " (u'SAYING', u'SAID', u'TAKING', u'TOOK'),\n", " (u'SAYING', u'SAID', u'GOING', u'WENT'),\n", + " (u'SAYING', u'SAID', u'PAYING', u'PAID'),\n", + " (u'SAYING', u'SAID', u'PLAYING', u'PLAYED'),\n", " (u'TAKING', u'TOOK', u'GOING', u'WENT'),\n", + " (u'TAKING', u'TOOK', u'PAYING', u'PAID'),\n", + " (u'TAKING', u'TOOK', u'PLAYING', u'PLAYED'),\n", " (u'TAKING', u'TOOK', u'SAYING', u'SAID'),\n", + " (u'BUILDING', u'BUILDINGS', u'CAR', u'CARS'),\n", " (u'BUILDING', u'BUILDINGS', u'CHILD', u'CHILDREN'),\n", " (u'BUILDING', u'BUILDINGS', u'MAN', u'MEN'),\n", + " (u'CAR', u'CARS', u'CHILD', u'CHILDREN'),\n", + " (u'CAR', u'CARS', u'MAN', u'MEN'),\n", + " (u'CAR', u'CARS', u'BUILDING', u'BUILDINGS'),\n", " (u'CHILD', u'CHILDREN', u'MAN', u'MEN'),\n", " (u'CHILD', u'CHILDREN', u'BUILDING', u'BUILDINGS'),\n", + " (u'CHILD', u'CHILDREN', u'CAR', u'CARS'),\n", " (u'MAN', u'MEN', u'BUILDING', u'BUILDINGS'),\n", + " (u'MAN', u'MEN', u'CAR', u'CARS'),\n", " (u'MAN', u'MEN', u'CHILD', u'CHILDREN')],\n", " 'section': 'total'}]" ] }, - "execution_count": 13, - "output_type": "execute_result", - "metadata": {} + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "model.accuracy(questions='questions-words.txt')" + 
"question_file_path = data_dir + 'questions-words.txt'\n", + "\n", + "model_wrapper.accuracy(questions=question_file_path)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.7756597547632444" + "1.1102867164706653" ] }, - "execution_count": 15, - "output_type": "execute_result", - "metadata": {} + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -551,9 +773,18 @@ "sentence_president = [w for w in sentence_president if w not in stopwords]\n", "\n", "# Compute WMD.\n", - "distance = model.wmdistance(sentence_obama, sentence_president)\n", + "distance = model_wrapper.wmdistance(sentence_obama, sentence_president)\n", "distance" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } ], "metadata": { @@ -566,16 +797,16 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2.0 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.6" + "version": "2.7.13" } }, "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "nbformat_minor": 1 +} From acbfdf2a34a98802b8ab0d439261fa378349490a Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 24 Aug 2017 20:14:15 +0530 Subject: [PATCH 12/32] added 'save' and 'load_fasttext_format' functions --- gensim/models/fasttext.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 0981e97f16..3fb407c794 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -99,11 +99,13 @@ def train_batch_sg(model, sentences, alpha, work=None): subwords_indices = [word.index] word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]] + # print("word2_subwords: ", word2_subwords) for subword in word2_subwords: subwords_indices.append(model.wv.ngrams[subword]) for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1)], start): if pos2 != pos: # don't train on the `word` itself + # print("sending pair : ", model.wv.index2word[word2.index], " , ", model.wv.index2word[word.index]) # subwords_indices = [word2.index] # word2_subwords = model.wv.ngrams_word[model.wv.index2word[word2.index]] @@ -299,7 +301,7 @@ def init_ngrams(self): all_ngrams += self.wv.ngrams_word[w] all_ngrams = list(set(all_ngrams)) - self.num_ngram_vectors = len(all_ngrams) + self.num_ngram_vectors = len(self.wv.vocab) + len(all_ngrams) logger.info("Total number of ngrams in the vocab is %d", self.num_ngram_vectors) ngram_indices = range(len(self.wv.vocab)) # keeping the first `len(self.wv.vocab)` rows intact @@ -321,3 +323,13 @@ def _do_train_job(self, sentences, alpha, inits): tally += train_batch_cbow(self, sentences, alpha, work, neu1) return tally, self._raw_word_count(sentences) + + @classmethod + def load_fasttext_format(cls, *args, **kwargs): + return Ft_Wrapper.load_fasttext_format(*args, **kwargs) + + def save(self, *args, **kwargs): + # don't bother storing the cached normalized vectors, recalculable table + kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_all_norm']) + + super(FastText, self).save(*args, **kwargs) From cb7a2adbc0f86c43198b8691732f2e92bcb650a3 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 24 Aug 2017 20:14:45 +0530 Subject: [PATCH 13/32] updated unit tests for 
fasttext --- gensim/test/test_fasttext.py | 631 ++++++++++++++++++++++++++++------- 1 file changed, 516 insertions(+), 115 deletions(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 0509681877..d1185c793c 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -11,7 +11,8 @@ from gensim import utils from gensim.models.word2vec import LineSentence from gensim.models.fasttext import FastText as FT_gensim -from gensim.models.wrappers.fasttext import FastText as FT_wrapper +from gensim.models.wrappers.fasttext import FastTextKeyedVectors +# from gensim.models.wrappers.fasttext import FastText as FT_wrapper module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -37,6 +38,15 @@ def __iter__(self): ['graph', 'minors', 'survey'] ] +new_sentences = [ + ['computer', 'artificial', 'intelligence'], + ['artificial', 'trees'], + ['human', 'intelligence'], + ['artificial', 'graph'], + ['intelligence'], + ['artificial', 'intelligence', 'system'] +] + def testfile(): # temporary data will be stored to this file return os.path.join(tempfile.gettempdir(), 'gensim_fasttext.tst') @@ -45,32 +55,26 @@ def testfile(): class TestFastTextModel(unittest.TestCase): def setUp(self): - ft_home = os.environ.get('FT_HOME', None) - self.ft_exec_path = os.path.join(ft_home, 'fasttext') if ft_home else None - - def modelsEqual(self, model, model2): - self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) - self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0)) - self.assertTrue(np.allclose(model.wv.syn0_all, model2.wv.syn0_all)) - if model.hs: - self.assertTrue(np.allclose(model.syn1, model2.syn1)) - if model.negative: - self.assertTrue(np.allclose(model.syn1neg, model2.syn1neg)) - most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0] - self.assertTrue(np.allclose(model[most_common_word], model2[most_common_word])) - - def modelSanity(self, model): - self.assertTrue(model.syn1.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.wv.syn0_all.shape == (len(model.wv.vocab) + len(model.wv.ngrams), model.vector_size)) + self.test_model_file = datapath('lee_fasttext') + self.test_model = FT_gensim.load(self.test_model_file) + self.test_new_model_file = datapath('lee_fasttext_new') + # ft_home = os.environ.get('FT_HOME', None) + # self.ft_exec_path = os.path.join(ft_home, 'fasttext') if ft_home else None + # self.ft_exec_path = '/home/chinmaya/GSOC/Gensim/fastText/fasttext' def testTraining(self): - model = FT_gensim(size=2, min_count=1, hs=1, negative=0) + model = FT_gensim(size=10, min_count=1, hs=1, negative=0) model.build_vocab(sentences) self.modelSanity(model) model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) sims = model.most_similar('graph', topn=10) + self.assertEqual(model.wv.syn0.shape, (12, 10)) + self.assertEqual(len(model.wv.vocab), 12) + self.assertEqual(model.wv.syn0_all.shape[1], 10) + self.modelSanity(model) + # test querying for "most similar" by vector graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index] sims2 = model.most_similar(positive=[graph_vector], topn=11) @@ -78,115 +82,512 @@ def testTraining(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = FT_gensim(sentences, size=2, min_count=1, hs=1, negative=0) + model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, 
negative=0) self.modelsEqual(model, model2) - def test_against_fasttext_wrapper(self, model_gensim, model_wrapper): - sims_gensim = model_gensim.most_similar('night', topn=10) - sims_gensim_words = (list(map(lambda x:x[0], sims_gensim))) - - sims_wrapper = model_wrapper.most_similar('night', topn=10) - sims_wrapper_words = (list(map(lambda x:x[0], sims_wrapper))) + def testOnlineLearning(self): + model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0) + model_neg = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=0, negative=5) + self.assertTrue(len(model_hs.wv.vocab), 12) + self.assertTrue(model_hs.wv.vocab['graph'].count, 3) + model_hs.build_vocab(new_sentences, update=True) + model_neg.build_vocab(new_sentences, update=True) + self.assertTrue(model_hs.wv.vocab['graph'].count, 4) + self.assertTrue(model_hs.wv.vocab['artificial'].count, 4) + self.assertEqual(len(model_hs.wv.vocab), 14) + self.assertEqual(len(model_neg.wv.vocab), 14) + + + def testOnlineLearningAfterSave(self): + model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) + model_neg.save(testfile()) + model_neg = FT_gensim.load(testfile()) + self.assertTrue(len(model_neg.wv.vocab), 12) + model_neg.build_vocab(new_sentences, update=True) + model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter) + self.assertEqual(len(model_neg.wv.vocab), 14) + + def onlineSanity(self, model): + terro, others = [], [] + for l in list_corpus: + if 'terrorism' in l: + terro.append(l) + else: + others.append(l) + self.assertTrue(all(['terrorism' not in l for l in others])) + model.build_vocab(others) + model.train(others, total_examples=model.corpus_count, epochs=model.iter) + self.assertFalse('terrorism' in model.wv.vocab) + model.build_vocab(terro, update=True) + self.assertTrue('terrorism' in model.wv.vocab) + orig0_all = np.copy(model.wv.syn0_all) + model.train(terro, total_examples=len(terro), epochs=model.iter) + self.assertFalse(np.allclose(model.wv.syn0_all, orig0_all)) + sim = model.n_similarity(['war'], ['terrorism']) + self.assertLess(0., sim) + + def test_sg_hs_online(self): + model = FT_gensim(sg=1, window=5, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=12) + self.onlineSanity(model) + + def test_sg_neg_online(self): + model = FT_gensim(sg=1, window=4, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=12) + self.onlineSanity(model) + + def test_cbow_hs_online(self): + model = FT_gensim(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=12) + self.onlineSanity(model) + + def test_cbow_neg_online(self): + model = FT_gensim(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5, min_count=5, iter=1, seed=42, workers=12, sample=0) + self.onlineSanity(model) - self.assertEqual(sims_gensim, sims_wrapper) + def modelsEqual(self, model, model2): + self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) + self.assertEqual(model.num_ngram_vectors, model2.num_ngram_vectors) + self.assertTrue(np.allclose(model.wv.syn0_all, model2.wv.syn0_all)) + self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0)) + if model.hs: + self.assertTrue(np.allclose(model.syn1, model2.syn1)) + if model.negative: + self.assertTrue(np.allclose(model.syn1neg, model2.syn1neg)) + most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0] + self.assertTrue(np.allclose(model[most_common_word], model2[most_common_word])) - def test_cbow_hs(self): - if self.ft_exec_path is None: - 
logger.info("FT_HOME env variable not set, skipping test") - return + def testPersistence(self): + model = FT_gensim(sentences, min_count=1) + model.save(testfile()) + self.modelsEqual(model, FT_gensim.load(testfile())) + # test persistence of the KeyedVectors of a model + wv = model.wv + wv.save(testfile()) + loaded_wv = FastTextKeyedVectors.load(testfile()) + self.assertTrue(np.allclose(wv.syn0_all, loaded_wv.syn0_all)) + self.assertEqual(len(wv.vocab), len(loaded_wv.vocab)) + self.assertEqual(len(wv.ngrams), len(loaded_wv.ngrams)) + + def testNormVectorsNotSaved(self): + model = FT_gensim(sentences, min_count=1) + model.init_sims() + model.save(testfile()) + loaded_model = FT_gensim.load(testfile()) + self.assertTrue(loaded_model.wv.syn0norm is None) + self.assertTrue(loaded_model.wv.syn0_all_norm is None) + + wv = model.wv + wv.save(testfile()) + loaded_kv = FastTextKeyedVectors.load(testfile()) + self.assertTrue(loaded_kv.syn0norm is None) + self.assertTrue(loaded_kv.syn0_all_norm is None) - model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'), - output_file=testfile(), model='cbow', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, - loss='hs', sample=1e-3, negative=0, iter=3, min_n=3, max_n=6, sorted_vocab=1, threads=1) + def modelSanity(self, model): + self.assertEqual(model.wv.syn0.shape, (len(model.wv.vocab), model.vector_size)) + self.assertEqual(model.wv.syn0_all.shape, (model.num_ngram_vectors, model.vector_size)) + + def testLoadFastTextFormat(self): + try: + model = FT_gensim.load_fasttext_format(self.test_model_file) + except Exception as exc: + self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc)) + vocab_size, model_size = 1762, 10 + self.assertEqual(model.wv.syn0.shape, (vocab_size, model_size)) + self.assertEqual(len(model.wv.vocab), vocab_size, model_size) + self.assertEqual(model.wv.syn0_all.shape, (model.num_ngram_vectors, model_size)) + + expected_vec = [ + -0.57144, + -0.0085561, + 0.15748, + -0.67855, + -0.25459, + -0.58077, + -0.09913, + 1.1447, + 0.23418, + 0.060007 + ] # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin + self.assertTrue(np.allclose(model["hundred"], expected_vec, atol=1e-4)) + + # vector for oov words are slightly different from original FastText due to discarding unused ngrams + # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin + expected_vec_oov = [ + -0.23825, + -0.58482, + -0.22276, + -0.41215, + 0.91015, + -1.6786, + -0.26724, + 0.58818, + 0.57828, + 0.75801 + ] + self.assertTrue(np.allclose(model["rejection"], expected_vec_oov, atol=1e-4)) + + self.assertEquals(model.min_count, 5) + self.assertEquals(model.window, 5) + self.assertEquals(model.iter, 5) + self.assertEquals(model.negative, 5) + self.assertEquals(model.sample, 0.0001) + self.assertEquals(model.bucket, 1000) + self.assertEquals(model.wv.max_n, 6) + self.assertEquals(model.wv.min_n, 3) + self.modelSanity(model) - model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, - min_count=5, iter=3, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - sorted_vocab=1, workers=12, min_alpha=0.0) + def testLoadFastTextNewFormat(self): + try: + new_model = FT_gensim.load_fasttext_format(self.test_new_model_file) + except Exception as exc: + self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc)) + vocab_size, model_size = 1763, 10 + 
self.assertEqual(new_model.wv.syn0.shape, (vocab_size, model_size)) + self.assertEqual(len(new_model.wv.vocab), vocab_size, model_size) + self.assertEqual(new_model.wv.syn0_all.shape, (new_model.num_ngram_vectors, model_size)) + + expected_vec = [ + -0.025627, + -0.11448, + 0.18116, + -0.96779, + 0.2532, + -0.93224, + 0.3929, + 0.12679, + -0.19685, + -0.13179 + ] # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin + self.assertTrue(np.allclose(new_model["hundred"], expected_vec, atol=1e-4)) + + # vector for oov words are slightly different from original FastText due to discarding unused ngrams + # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin + expected_vec_oov = [ + -0.53378, + -0.19, + 0.013482, + -0.86767, + -0.21684, + -0.89928, + 0.45124, + 0.18025, + -0.14128, + 0.22508 + ] + self.assertTrue(np.allclose(new_model["rejection"], expected_vec_oov, atol=1e-4)) + + self.assertEquals(new_model.min_count, 5) + self.assertEquals(new_model.window, 5) + self.assertEquals(new_model.iter, 5) + self.assertEquals(new_model.negative, 5) + self.assertEquals(new_model.sample, 0.0001) + self.assertEquals(new_model.bucket, 1000) + self.assertEquals(new_model.wv.max_n, 6) + self.assertEquals(new_model.wv.min_n, 3) + self.assertEqual(new_model.wv.syn0.shape, (len(new_model.wv.vocab), new_model.vector_size)) + self.assertEqual(new_model.wv.syn0_all.shape, (new_model.num_ngram_vectors, new_model.vector_size)) + # self.modelSanity(new_model) + + def testLoadModelWithNonAsciiVocab(self): + model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext')) + self.assertTrue(u'který' in model) + try: + vector = model[u'který'] + except UnicodeDecodeError: + self.fail('Unable to access vector for utf8 encoded non-ascii word') + + def testLoadModelNonUtf8Encoding(self): + model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852') + self.assertTrue(u'který' in model) + try: + vector = model[u'který'] + except KeyError: + self.fail('Unable to access vector for cp-852 word') + + def testNSimilarity(self): + # In vocab, sanity check + self.assertTrue(np.allclose(self.test_model.n_similarity(['the', 'and'], ['and', 'the']), 1.0)) + self.assertEqual(self.test_model.n_similarity(['the'], ['and']), self.test_model.n_similarity(['and'], ['the'])) + # Out of vocab check + self.assertTrue(np.allclose(self.test_model.n_similarity(['night', 'nights'], ['nights', 'night']), 1.0)) + self.assertEqual(self.test_model.n_similarity(['night'], ['nights']), self.test_model.n_similarity(['nights'], ['night'])) + + def testSimilarity(self): + # In vocab, sanity check + self.assertTrue(np.allclose(self.test_model.similarity('the', 'the'), 1.0)) + self.assertEqual(self.test_model.similarity('the', 'and'), self.test_model.similarity('and', 'the')) + # Out of vocab check + self.assertTrue(np.allclose(self.test_model.similarity('nights', 'nights'), 1.0)) + self.assertEqual(self.test_model.similarity('night', 'nights'), self.test_model.similarity('nights', 'night')) + + def testMostSimilar(self): + # In vocab, sanity check + self.assertEqual(len(self.test_model.most_similar(positive=['the', 'and'], topn=5)), 5) + self.assertEqual(self.test_model.most_similar('the'), self.test_model.most_similar(positive=['the'])) + # Out of vocab check + self.assertEqual(len(self.test_model.most_similar(['night', 'nights'], topn=5)), 5) + self.assertEqual(self.test_model.most_similar('nights'), self.test_model.most_similar(positive=['nights'])) + + + def 
testMostSimilarCosmul(self): + # In vocab, sanity check + self.assertEqual(len(self.test_model.most_similar_cosmul(positive=['the', 'and'], topn=5)), 5) + self.assertEqual( + self.test_model.most_similar_cosmul('the'), + self.test_model.most_similar_cosmul(positive=['the'])) + # Out of vocab check + self.assertEqual(len(self.test_model.most_similar_cosmul(['night', 'nights'], topn=5)), 5) + self.assertEqual( + self.test_model.most_similar_cosmul('nights'), + self.test_model.most_similar_cosmul(positive=['nights'])) + + def testLookup(self): + # In vocab, sanity check + self.assertTrue('night' in self.test_model.wv.vocab) + self.assertTrue(np.allclose(self.test_model['night'], self.test_model[['night']])) + # Out of vocab check + self.assertFalse('nights' in self.test_model.wv.vocab) + self.assertTrue(np.allclose(self.test_model['nights'], self.test_model[['nights']])) + # Word with no ngrams in model + self.assertRaises(KeyError, lambda: self.test_model['a!@']) + + def testContains(self): + # In vocab, sanity check + self.assertTrue('night' in self.test_model.wv.vocab) + self.assertTrue('night' in self.test_model) + # Out of vocab check + self.assertFalse('nights' in self.test_model.wv.vocab) + self.assertTrue('nights' in self.test_model) + # Word with no ngrams in model + self.assertFalse('a!@' in self.test_model.wv.vocab) + self.assertFalse('a!@' in self.test_model) + + def testWmdistance(self): + doc = ['night', 'payment'] + oov_doc = ['nights', 'forests', 'payments'] + ngrams_absent_doc = ['a!@', 'b#$'] + + dist = self.test_model.wmdistance(doc, oov_doc) + self.assertNotEqual(float('inf'), dist) + dist = self.test_model.wmdistance(doc, ngrams_absent_doc) + self.assertEqual(float('inf'), dist) + + def testDoesntMatch(self): + oov_words = ['nights', 'forests', 'payments'] + # Out of vocab check + for word in oov_words: + self.assertFalse(word in self.test_model.wv.vocab) + try: + self.test_model.doesnt_match(oov_words) + except Exception: + self.fail('model.doesnt_match raises exception for oov words') + + # def test_against_fasttext_wrapper(self, model_gensim, model_wrapper): + # sims_gensim = model_gensim.most_similar('night', topn=10) + # sims_gensim_words = (list(map(lambda x:x[0], sims_gensim))) + + # sims_wrapper = model_wrapper.most_similar('night', topn=10) + # sims_wrapper_words = (list(map(lambda x:x[0], sims_wrapper))) + + # print(sims_gensim) + # print + # print(sims_wrapper) + # print + # print(sims_gensim_words) + # print + # print(sims_wrapper_words) + # print + # print(set(sims_gensim_words).intersection(sims_wrapper_words)) + # print + + # self.assertEqual(sims_gensim, sims_wrapper) + + # def test_cbow_hs(self): + # if self.ft_exec_path is None: + # logger.info("FT_HOME env variable not set, skipping test") + # return + + # model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'), + # output_file=testfile(), model='cbow', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, + # loss='hs', sample=1e-3, negative=0, iter=3, min_n=3, max_n=6, sorted_vocab=1, threads=1) + + # model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, + # min_count=5, iter=3, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + # sorted_vocab=1, workers=12, min_alpha=0.0) - lee_data = LineSentence(datapath('lee_background.cor')) - model_gensim.build_vocab(lee_data) - orig0 = np.copy(model_gensim.wv.syn0[0]) - model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, 
epochs=model_gensim.iter) - self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training + # lee_data = LineSentence(datapath('lee_background.cor')) + # model_gensim.build_vocab(lee_data) + # orig0 = np.copy(model_gensim.wv.syn0[0]) + # model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + # self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training - self.test_against_fasttext_wrapper(model_gensim, model_wrapper) + # self.test_against_fasttext_wrapper(model_gensim, model_wrapper) - def test_cbow_neg(self): - if self.ft_exec_path is None: - logger.info("FT_HOME env variable not set, skipping test") - return + # def test_cbow_neg(self): + # # if self.ft_exec_path is None: + # # logger.info("FT_HOME env variable not set, skipping test") + # # return + # self.ft_path = "/home/chinmaya/GSOC/Gensim/fastText/fasttext" - model_wrapper = FT_wrapper.train(ft_path=self.ft_exec_path, corpus_file=datapath('lee_background.cor'), - output_file=testfile(), model='cbow', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, loss='ns', - sample=1e-3, negative=15, iter=7, min_n=3, max_n=6, sorted_vocab=1, threads=1) + # model_wrapper = FT_wrapper.train(ft_path=self.ft_exec_path, corpus_file=datapath('lee_background.cor'), + # output_file=testfile(), model='cbow', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, loss='ns', + # sample=1e-3, negative=15, iter=3, min_n=3, max_n=6, sorted_vocab=1, threads=1) - model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=15, - min_count=1, iter=7, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - sorted_vocab=1, workers=1, min_alpha=0.0) + # model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=15, + # min_count=5, iter=3, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + # sorted_vocab=1, workers=1, min_alpha=0.0) - lee_data = LineSentence(datapath('lee_background.cor')) - model_gensim.build_vocab(lee_data) - orig0 = np.copy(model_gensim.wv.syn0[0]) - model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) - self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training + # lee_data = LineSentence(datapath('lee_background.cor')) + # model_gensim.build_vocab(lee_data) + # orig0 = np.copy(model_gensim.wv.syn0[0]) + # model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + # self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training + + # self.test_against_fasttext_wrapper(model_gensim, model_wrapper) + + # def test_sg_hs(self): + # if self.ft_exec_path is None: + # logger.info("FT_HOME env variable not set, skipping test") + # return + + # model_wrapper = FT_wrapper.train(ft_path=self.ft_exec_path, corpus_file=datapath('lee_background.cor'), + # output_file=testfile(), model='skipgram', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, + # loss='hs', sample=1e-3, negative=0, iter=3, min_n=3, max_n=6, sorted_vocab=1, threads=1) + + # model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, + # min_count=5, iter=3, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + # sorted_vocab=1, workers=12, min_alpha=0.0) + + # lee_data = LineSentence(datapath('lee_background.cor')) + # model_gensim.build_vocab(lee_data) + # orig0 = 
np.copy(model_gensim.wv.syn0[0]) + # model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + # self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training + + # self.test_against_fasttext_wrapper(model_gensim, model_wrapper) + + # def test_sg_neg(self): + # # if self.ft_exec_path is None: + # # logger.info("FT_HOME env variable not set, skipping test") + # # return + # self.ft_path = "/home/chinmaya/GSOC/Gensim/fastText/fasttext" + + # model_wrapper = FT_wrapper.train(ft_path=self.ft_exec_path, corpus_file=datapath('lee_background.cor'), + # output_file=testfile(), model='skipgram', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, + # loss='ns', sample=1e-3, negative=15, iter=10, min_n=3, max_n=6, sorted_vocab=1, threads=1) + + # model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=0, + # min_count=5, iter=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + # sorted_vocab=1, workers=1, min_alpha=0.0) + + # lee_data = LineSentence(datapath('lee_background.cor')) + # model_gensim.build_vocab(lee_data) + # orig0 = np.copy(model_gensim.wv.syn0[0]) + # model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + # self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training + + # self.test_against_fasttext_wrapper(model_gensim, model_wrapper) + + # def testModelPersistence(self): + # model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=0, + # min_count=5, iter=1, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + # sorted_vocab=1, workers=1, min_alpha=0.0) + + # lee_data = LineSentence(datapath('lee_background.cor')) + # model_gensim.build_vocab(lee_data) + # model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + + # model_gensim.save(testfile()) + # model_gensim_load = FT_gensim.load(testfile()) + # most_similar_words = model_gensim_load.most_similar('night', topn=10) + # self.assertTrue(len(most_similar_words) == 10) + + # def test_sample(self): + # self.ft_path = "/home/chinmaya/GSOC/Gensim/fastText/fasttext" + # # train_file_path = "/home/chinmaya/GSOC/Gensim/text8_100000" + # train_file_path = "/home/chinmaya/GSOC/Gensim/gensim/gensim/test/test_data/lee_background.cor" + # # model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=train_file_path, + # # output_file=testfile(), model='cbow', size=50, alpha=0.05, window=5, min_count=1, word_ngrams=1, + # # loss='ns', sample=1e-3, negative=15, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12) + + # # model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, + # # min_count=1, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + # # sorted_vocab=1, workers=12, min_alpha=0.0) + + # model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=train_file_path, + # output_file=testfile(), model='skipgram', size=100, alpha=0.025, window=1, min_count=1, word_ngrams=1, + # loss='hs', sample=0, negative=0, iter=10, min_n=3, max_n=6, sorted_vocab=1, threads=1) + + # model_gensim = FT_gensim(size=100, sg=1, cbow_mean=1, alpha=0.025, window=1, hs=1, negative=0, + # min_count=1, iter=10, batch_words=1000, word_ngrams=1, sample=0, min_n=3, max_n=6, + # sorted_vocab=1, workers=1, min_alpha=0.0) + + # train_data = LineSentence(train_file_path) + # 
model_gensim.build_vocab(train_data) + # model_gensim.train(train_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + + # self.test_against_fasttext_wrapper(model_gensim, model_wrapper) + + # # model_wrapper.accuracy("/home/chinmaya/GSOC/Gensim/gensim/gensim/test/test_data/test_analogy_questions-words.txt") + # # print + # # model_gensim.accuracy("/home/chinmaya/GSOC/Gensim/gensim/gensim/test/test_data/test_analogy_questions-words.txt") + + # def test_questions_task(self): + # self.ft_path = "/home/chinmaya/GSOC/Gensim/fastText/fasttext" + # train_file_path = "/home/chinmaya/GSOC/Gensim/gensim/gensim/test/test_data/lee_background.cor" + + # questions_file = '/home/chinmaya/GSOC/Gensim/gensim/gensim/test/test_data/questions-words.txt' + + # model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=train_file_path, + # output_file=testfile(), model='skipgram', size=100, alpha=0.025, window=1, min_count=5, word_ngrams=1, + # loss='hs', sample=0, negative=0, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=1) + + # model_gensim = FT_gensim(size=100, sg=1, cbow_mean=1, alpha=0.025, window=1, hs=1, negative=0, + # min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=0, min_n=3, max_n=6, + # sorted_vocab=1, workers=1, min_alpha=0.0) + + # train_data = LineSentence(train_file_path) + # model_gensim.build_vocab(train_data) + # model_gensim.train(train_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + + # acc_w = model_wrapper.accuracy(questions_file) + + # sem_correct_w = sum((len(acc_w[i]['correct']) for i in range(5))) + # sem_total_w = sum((len(acc_w[i]['correct']) + len(acc_w[i]['incorrect'])) for i in range(5)) + # sem_acc_w = 100*float(sem_correct_w)/sem_total_w + # print('\nSemantic: {:d}/{:d}, Accuracy: {:.2f}%'.format(sem_correct_w, sem_total_w, sem_acc_w)) + + # syn_correct_w = sum((len(acc_w[i]['correct']) for i in range(5, len(acc_w)-1))) + # syn_total_w = sum((len(acc_w[i]['correct']) + len(acc_w[i]['incorrect'])) for i in range(5,len(acc_w)-1)) + # syn_acc_w = 100*float(syn_correct_w)/syn_total_w + # print('Syntactic: {:d}/{:d}, Accuracy: {:.2f}%\n'.format(syn_correct_w, syn_total_w, syn_acc_w)) + + + # acc_g = model_gensim.accuracy(questions_file) + + # sem_correct_g = sum((len(acc_g[i]['correct']) for i in range(5))) + # sem_total_g = sum((len(acc_g[i]['correct']) + len(acc_g[i]['incorrect'])) for i in range(5)) + # sem_acc_g = 100*float(sem_correct_g)/sem_total_g + # print('\nSemantic: {:d}/{:d}, Accuracy: {:.2f}%'.format(sem_correct_g, sem_total_g, sem_acc_g)) - self.test_against_fasttext_wrapper(model_gensim, model_wrapper) - - def test_sg_hs(self): - if self.ft_exec_path is None: - logger.info("FT_HOME env variable not set, skipping test") - return - - model_wrapper = FT_wrapper.train(ft_path=self.ft_exec_path, corpus_file=datapath('lee_background.cor'), - output_file=testfile(), model='skipgram', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, - loss='hs', sample=1e-3, negative=0, iter=3, min_n=3, max_n=6, sorted_vocab=1, threads=1) - - model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, - min_count=5, iter=3, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - sorted_vocab=1, workers=12, min_alpha=0.0) - - lee_data = LineSentence(datapath('lee_background.cor')) - model_gensim.build_vocab(lee_data) - orig0 = np.copy(model_gensim.wv.syn0[0]) - model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) - 
self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training - - self.test_against_fasttext_wrapper(model_gensim, model_wrapper) - - def test_sg_neg(self): - if self.ft_exec_path is None: - logger.info("FT_HOME env variable not set, skipping test") - return - - model_wrapper = FT_wrapper.train(ft_path=self.ft_exec_path, corpus_file=datapath('lee_background.cor'), - output_file=testfile(), model='skipgram', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, - loss='ns', sample=1e-3, negative=15, iter=1, min_n=3, max_n=6, sorted_vocab=1, threads=1) - - model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=0, - min_count=5, iter=1, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - sorted_vocab=1, workers=1, min_alpha=0.0) - - lee_data = LineSentence(datapath('lee_background.cor')) - model_gensim.build_vocab(lee_data) - orig0 = np.copy(model_gensim.wv.syn0[0]) - model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) - self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training - - self.test_against_fasttext_wrapper(model_gensim, model_wrapper) - - def testModelPersistence(self): - model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=0, - min_count=5, iter=1, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - sorted_vocab=1, workers=1, min_alpha=0.0) - - lee_data = LineSentence(datapath('lee_background.cor')) - model_gensim.build_vocab(lee_data) - model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) - - model_gensim.save(testfile()) - model_gensim_load = FT_gensim.load(testfile()) - most_similar_words = model_gensim_load.most_similar('night', topn=10) - self.assertTrue(len(most_similar_words) == 10) + # syn_correct_g = sum((len(acc_g[i]['correct']) for i in range(5, len(acc_g)-1))) + # syn_total_g = sum((len(acc_g[i]['correct']) + len(acc_g[i]['incorrect'])) for i in range(5,len(acc_g)-1)) + # syn_acc_g = 100*float(syn_correct_g)/syn_total_g + # print('Syntactic: {:d}/{:d}, Accuracy: {:.2f}%\n'.format(syn_correct_g, syn_total_g, syn_acc_g)) + # # return (sem_acc_g, syn_acc_g) + + # def test_debug(self): + # train_data_file = datapath('lee_background_small.cor') + # model_wrapper = FT_wrapper.train(ft_path=self.ft_exec_path, corpus_file=train_data_file, + # output_file=testfile(), model='skipgram', size=50, alpha=0.025, window=1, min_count=1, word_ngrams=1, + # loss='hs', sample=0, negative=0, iter=1, min_n=3, max_n=6, sorted_vocab=1, threads=1) + + # model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.025, window=1, hs=1, negative=0, + # min_count=1, iter=1, batch_words=1000, word_ngrams=1, sample=0, min_n=3, max_n=6, + # sorted_vocab=1, workers=1, min_alpha=0.0) + # lee_data = LineSentence(train_data_file) + # model_gensim.build_vocab(lee_data) + # model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + + # self.test_against_fasttext_wrapper(model_gensim, model_wrapper) if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From 5a1829736a852f7bcd43f1e6b3f7f524ac6321cf Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Fri, 25 Aug 2017 10:40:42 +0530 Subject: [PATCH 14/32] cleaned main fasttext code --- gensim/models/fasttext.py | 144 +++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 81 
deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 3fb407c794..4e9cea863c 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -19,18 +19,15 @@ MAX_WORDS_IN_BATCH = 10000 - def train_batch_cbow(model, sentences, alpha, work=None, neu1=None): result = 0 for sentence in sentences: - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab ]#and - # model.wv.vocab[w].sample_int > model.random.rand() * 2**32] + word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and + model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): - # reduced_window = model.random.randint(model.window) # `b` in the original word2vec code - # start = max(0, pos - model.window + reduced_window) - start = max(0, pos - model.window) - # window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1)], start) + reduced_window = model.random.randint(model.window) + start = max(0, pos - model.window + reduced_window) + window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] word2_subwords = [] @@ -78,8 +75,6 @@ def train_cbow_pair(model, word, input_subword_indices, l1, alpha, learn_vectors if learn_vectors: # learn input -> hidden, here for all words in the window separately - if not model.cbow_mean and input_subword_indices: - neu1e /= len(input_subword_indices) for i in input_subword_indices: model.wv.syn0_all[i] += neu1e * model.syn0_all_lockf[i] @@ -91,26 +86,18 @@ def train_batch_sg(model, sentences, alpha, work=None): word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab] #and # model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): - # reduced_window = model.random.randint(model.window) # `b` in the original word2vec code + reduced_window = model.random.randint(model.window) # `b` in the original word2vec code # now go over all words from the (reduced) window, predicting each one in turn - # start = max(0, pos - model.window + reduced_window) - start = max(0, pos - model.window) - # for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): + start = max(0, pos - model.window + reduced_window) subwords_indices = [word.index] word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]] - # print("word2_subwords: ", word2_subwords) + for subword in word2_subwords: subwords_indices.append(model.wv.ngrams[subword]) - for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1)], start): + for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): if pos2 != pos: # don't train on the `word` itself - # print("sending pair : ", model.wv.index2word[word2.index], " , ", model.wv.index2word[word.index]) - # subwords_indices = [word2.index] - # word2_subwords = model.wv.ngrams_word[model.wv.index2word[word2.index]] - - - # train_sg_pair(model, model.wv.index2word[word.index], subwords_indices, alpha) train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha) result += len(word_vocabs) @@ -156,9 +143,6 @@ def train_sg_pair(model, word, input_subword_indices, alpha, learn_vectors=True, neu1e += dot(gb, l2b) # save error if learn_vectors: - # l1 += neu1e * lock_factor # learn input 
-> hidden (mutates model.wv.syn0[word2.index], if that is l1) - if input_subword_indices: - neu1e /= len(input_subword_indices) for i in input_subword_indices: model.wv.syn0_all[i] += neu1e * model.syn0_all_lockf[i] @@ -176,6 +160,7 @@ def __init__( self.initialize_ngram_vectors() + # params common between Word2Vec and fastText self.sg = int(sg) self.cum_table = None # for negative sampling self.vector_size = int(size) @@ -204,16 +189,16 @@ def __init__( self.batch_words = batch_words self.model_trimmed_post_training = False + # fastText specific params self.bucket = bucket - self.loss = loss # should we keep this? -> we already have `hs`, `negative` -> although we don't have a mode for only `softmax` self.word_ngrams = word_ngrams self.min_n = min_n self.max_n = max_n if self.word_ngrams <= 1 and self.max_n == 0: self.bucket = 0 - self.wv.min_n = min_n self.wv.max_n = max_n + self.wv.ngrams_word = {} if sentences is not None: @@ -221,12 +206,60 @@ def __init__( raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") self.build_vocab(sentences, trim_rule=trim_rule) self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, - start_alpha=self.alpha, end_alpha=self.min_alpha) + start_alpha=self.alpha, end_alpha=self.min_alpha) else: if trim_rule is not None: logger.warning("The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. ") logger.warning("Model initialized without sentences. trim_rule provided, if any, will be ignored.") + def initialize_ngram_vectors(self): + self.wv = FastTextKeyedVectors() + + def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): + self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey + self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling + self.finalize_vocab(update=update) # build tables & arrays + self.init_ngrams() + + def init_ngrams(self): + self.wv.ngrams = {} + self.wv.syn0_all = empty((len(self.wv.vocab) + self.bucket, self.vector_size), dtype=REAL) + self.syn0_all_lockf = ones((len(self.wv.vocab) + self.bucket, self.vector_size), dtype=REAL) + + all_ngrams = [] + for w, v in self.wv.vocab.items(): + self.wv.ngrams_word[w] = Ft_Wrapper.compute_ngrams(w, self.min_n, self.max_n) + all_ngrams += self.wv.ngrams_word[w] + + all_ngrams = list(set(all_ngrams)) + self.num_ngram_vectors = len(self.wv.vocab) + len(all_ngrams) + logger.info("Total number of ngrams is %d", self.num_ngram_vectors) + + ngram_indices = range(len(self.wv.vocab)) # keeping the first `len(self.wv.vocab)` rows intact + for i, ngram in enumerate(all_ngrams): + ngram_hash = Ft_Wrapper.ft_hash(ngram) + ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket) + self.wv.ngrams[ngram] = i + len(self.wv.vocab) + + self.wv.syn0_all = self.wv.syn0_all.take(ngram_indices, axis=0) + self.reset_ngram_weights() + + def reset_ngram_weights(self): + rand_obj = np.random + rand_obj.seed(self.seed) + for index in range(len(self.wv.vocab) + len(self.wv.ngrams)): + self.wv.syn0_all[index] = rand_obj.uniform(-1.0/self.vector_size, 1.0/self.vector_size, self.vector_size) + + def _do_train_job(self, sentences, alpha, inits): + work, neu1 = inits + tally = 0 + if self.sg: + tally += train_batch_sg(self, sentences, alpha, work) + else: + tally += train_batch_cbow(self, sentences, alpha, work, neu1) + + return tally, 
self._raw_word_count(sentences) + def train(self, sentences, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0): @@ -237,12 +270,9 @@ def train(self, sentences, total_examples=None, total_words=None, self.neg_labels[0] = 1. Word2Vec.train(self, sentences, total_examples=self.corpus_count, epochs=self.iter, - start_alpha=self.alpha, end_alpha=self.min_alpha) + start_alpha=self.alpha, end_alpha=self.min_alpha) self.get_vocab_word_vecs() - def initialize_ngram_vectors(self): - self.wv = FastTextKeyedVectors() - def __getitem__(self, word): return self.word_vec(word) @@ -263,7 +293,7 @@ def word_vec(self, word, use_norm=False): else: return self.wv.syn0[self.wv.vocab[word].index] else: - logger.info("out of vocab") + logger.info("Word is out of vocabulary") word_vec = np.zeros(self.wv.syn0_all.shape[1]) ngrams = Ft_Wrapper.compute_ngrams(word, self.min_n, self.max_n) ngrams = [ng for ng in ngrams if ng in self.wv.ngrams] @@ -278,58 +308,10 @@ def word_vec(self, word, use_norm=False): else: # No ngrams of the word are present in self.ngrams raise KeyError('all ngrams for word %s absent from model' % word) - def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): - self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey - self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling - self.finalize_vocab(update=update) # build tables & arrays - self.init_ngrams() - - def reset_ngram_weights(self): - rand_obj = np.random - rand_obj.seed(self.seed) - for index in range(len(self.wv.vocab) + len(self.wv.ngrams)): - self.wv.syn0_all[index] = rand_obj.uniform(-1.0/self.vector_size, 1.0/self.vector_size, self.vector_size) - - def init_ngrams(self): - self.wv.ngrams = {} - self.wv.syn0_all = empty((len(self.wv.vocab) + self.bucket, self.vector_size), dtype=REAL) - self.syn0_all_lockf = ones((len(self.wv.vocab) + self.bucket, self.vector_size), dtype=REAL) - - all_ngrams = [] - for w, v in self.wv.vocab.items(): - self.wv.ngrams_word[w] = Ft_Wrapper.compute_ngrams(w, self.min_n, self.max_n) - all_ngrams += self.wv.ngrams_word[w] - - all_ngrams = list(set(all_ngrams)) - self.num_ngram_vectors = len(self.wv.vocab) + len(all_ngrams) - logger.info("Total number of ngrams in the vocab is %d", self.num_ngram_vectors) - - ngram_indices = range(len(self.wv.vocab)) # keeping the first `len(self.wv.vocab)` rows intact - - for i, ngram in enumerate(all_ngrams): - ngram_hash = Ft_Wrapper.ft_hash(ngram) - ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket) - self.wv.ngrams[ngram] = i + len(self.wv.vocab) - - self.wv.syn0_all = self.wv.syn0_all.take(ngram_indices, axis=0) - self.reset_ngram_weights() - - def _do_train_job(self, sentences, alpha, inits): - work, neu1 = inits - tally = 0 - if self.sg: - tally += train_batch_sg(self, sentences, alpha, work) - else: - tally += train_batch_cbow(self, sentences, alpha, work, neu1) - - return tally, self._raw_word_count(sentences) - @classmethod def load_fasttext_format(cls, *args, **kwargs): return Ft_Wrapper.load_fasttext_format(*args, **kwargs) def save(self, *args, **kwargs): - # don't bother storing the cached normalized vectors, recalculable table kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_all_norm']) - super(FastText, self).save(*args, **kwargs) From 4b987223ff348501ce28a47614c1599d03944d99 Mon Sep 17 
00:00:00 2001 From: Chinmaya Pancholi Date: Fri, 25 Aug 2017 10:41:30 +0530 Subject: [PATCH 15/32] updated unittests --- gensim/test/test_fasttext.py | 429 +++++++++++------------------------ 1 file changed, 137 insertions(+), 292 deletions(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index d1185c793c..8344e36aff 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -12,7 +12,7 @@ from gensim.models.word2vec import LineSentence from gensim.models.fasttext import FastText as FT_gensim from gensim.models.wrappers.fasttext import FastTextKeyedVectors -# from gensim.models.wrappers.fasttext import FastText as FT_wrapper +from gensim.models.wrappers.fasttext import FastText as FT_wrapper module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -55,17 +55,16 @@ def testfile(): class TestFastTextModel(unittest.TestCase): def setUp(self): + ft_home = os.environ.get('FT_HOME', None) + self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None self.test_model_file = datapath('lee_fasttext') self.test_model = FT_gensim.load(self.test_model_file) self.test_new_model_file = datapath('lee_fasttext_new') - # ft_home = os.environ.get('FT_HOME', None) - # self.ft_exec_path = os.path.join(ft_home, 'fasttext') if ft_home else None - # self.ft_exec_path = '/home/chinmaya/GSOC/Gensim/fastText/fasttext' - def testTraining(self): + def test_training(self): model = FT_gensim(size=10, min_count=1, hs=1, negative=0) model.build_vocab(sentences) - self.modelSanity(model) + self.model_sanity(model) model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) sims = model.most_similar('graph', topn=10) @@ -73,7 +72,7 @@ def testTraining(self): self.assertEqual(model.wv.syn0.shape, (12, 10)) self.assertEqual(len(model.wv.vocab), 12) self.assertEqual(model.wv.syn0_all.shape[1], 10) - self.modelSanity(model) + self.model_sanity(model) # test querying for "most similar" by vector graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index] @@ -83,66 +82,9 @@ def testTraining(self): # build vocab and train in one step; must be the same as above model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0) - self.modelsEqual(model, model2) - - def testOnlineLearning(self): - model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0) - model_neg = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=0, negative=5) - self.assertTrue(len(model_hs.wv.vocab), 12) - self.assertTrue(model_hs.wv.vocab['graph'].count, 3) - model_hs.build_vocab(new_sentences, update=True) - model_neg.build_vocab(new_sentences, update=True) - self.assertTrue(model_hs.wv.vocab['graph'].count, 4) - self.assertTrue(model_hs.wv.vocab['artificial'].count, 4) - self.assertEqual(len(model_hs.wv.vocab), 14) - self.assertEqual(len(model_neg.wv.vocab), 14) - - - def testOnlineLearningAfterSave(self): - model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) - model_neg.save(testfile()) - model_neg = FT_gensim.load(testfile()) - self.assertTrue(len(model_neg.wv.vocab), 12) - model_neg.build_vocab(new_sentences, update=True) - model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter) - self.assertEqual(len(model_neg.wv.vocab), 14) - - def onlineSanity(self, model): - terro, others = [], [] - for l in list_corpus: - if 'terrorism' in l: - terro.append(l) - else: - 
others.append(l) - self.assertTrue(all(['terrorism' not in l for l in others])) - model.build_vocab(others) - model.train(others, total_examples=model.corpus_count, epochs=model.iter) - self.assertFalse('terrorism' in model.wv.vocab) - model.build_vocab(terro, update=True) - self.assertTrue('terrorism' in model.wv.vocab) - orig0_all = np.copy(model.wv.syn0_all) - model.train(terro, total_examples=len(terro), epochs=model.iter) - self.assertFalse(np.allclose(model.wv.syn0_all, orig0_all)) - sim = model.n_similarity(['war'], ['terrorism']) - self.assertLess(0., sim) - - def test_sg_hs_online(self): - model = FT_gensim(sg=1, window=5, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=12) - self.onlineSanity(model) - - def test_sg_neg_online(self): - model = FT_gensim(sg=1, window=4, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=12) - self.onlineSanity(model) - - def test_cbow_hs_online(self): - model = FT_gensim(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=12) - self.onlineSanity(model) - - def test_cbow_neg_online(self): - model = FT_gensim(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5, min_count=5, iter=1, seed=42, workers=12, sample=0) - self.onlineSanity(model) - - def modelsEqual(self, model, model2): + self.models_equal(model, model2) + + def models_equal(self, model, model2): self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) self.assertEqual(model.num_ngram_vectors, model2.num_ngram_vectors) self.assertTrue(np.allclose(model.wv.syn0_all, model2.wv.syn0_all)) @@ -154,10 +96,10 @@ def modelsEqual(self, model, model2): most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0] self.assertTrue(np.allclose(model[most_common_word], model2[most_common_word])) - def testPersistence(self): + def test_persistence(self): model = FT_gensim(sentences, min_count=1) model.save(testfile()) - self.modelsEqual(model, FT_gensim.load(testfile())) + self.models_equal(model, FT_gensim.load(testfile())) # test persistence of the KeyedVectors of a model wv = model.wv wv.save(testfile()) @@ -166,7 +108,7 @@ def testPersistence(self): self.assertEqual(len(wv.vocab), len(loaded_wv.vocab)) self.assertEqual(len(wv.ngrams), len(loaded_wv.ngrams)) - def testNormVectorsNotSaved(self): + def test_norm_vectors_not_saved(self): model = FT_gensim(sentences, min_count=1) model.init_sims() model.save(testfile()) @@ -180,11 +122,11 @@ def testNormVectorsNotSaved(self): self.assertTrue(loaded_kv.syn0norm is None) self.assertTrue(loaded_kv.syn0_all_norm is None) - def modelSanity(self, model): + def model_sanity(self, model): self.assertEqual(model.wv.syn0.shape, (len(model.wv.vocab), model.vector_size)) self.assertEqual(model.wv.syn0_all.shape, (model.num_ngram_vectors, model.vector_size)) - def testLoadFastTextFormat(self): + def test_load_fasttext_format(self): try: model = FT_gensim.load_fasttext_format(self.test_model_file) except Exception as exc: @@ -232,9 +174,9 @@ def testLoadFastTextFormat(self): self.assertEquals(model.bucket, 1000) self.assertEquals(model.wv.max_n, 6) self.assertEquals(model.wv.min_n, 3) - self.modelSanity(model) + self.model_sanity(model) - def testLoadFastTextNewFormat(self): + def test_load_fasttext_new_format(self): try: new_model = FT_gensim.load_fasttext_format(self.test_new_model_file) except Exception as exc: @@ -286,7 +228,7 @@ def testLoadFastTextNewFormat(self): self.assertEqual(new_model.wv.syn0_all.shape, (new_model.num_ngram_vectors, new_model.vector_size)) # 
self.modelSanity(new_model) - def testLoadModelWithNonAsciiVocab(self): + def test_load_model_with_non_ascii_vocab(self): model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext')) self.assertTrue(u'který' in model) try: @@ -294,7 +236,7 @@ def testLoadModelWithNonAsciiVocab(self): except UnicodeDecodeError: self.fail('Unable to access vector for utf8 encoded non-ascii word') - def testLoadModelNonUtf8Encoding(self): + def test_load_model_non_utf8_encoding(self): model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852') self.assertTrue(u'který' in model) try: @@ -302,7 +244,7 @@ def testLoadModelNonUtf8Encoding(self): except KeyError: self.fail('Unable to access vector for cp-852 word') - def testNSimilarity(self): + def test_n_similarity(self): # In vocab, sanity check self.assertTrue(np.allclose(self.test_model.n_similarity(['the', 'and'], ['and', 'the']), 1.0)) self.assertEqual(self.test_model.n_similarity(['the'], ['and']), self.test_model.n_similarity(['and'], ['the'])) @@ -310,7 +252,7 @@ def testNSimilarity(self): self.assertTrue(np.allclose(self.test_model.n_similarity(['night', 'nights'], ['nights', 'night']), 1.0)) self.assertEqual(self.test_model.n_similarity(['night'], ['nights']), self.test_model.n_similarity(['nights'], ['night'])) - def testSimilarity(self): + def test_similarity(self): # In vocab, sanity check self.assertTrue(np.allclose(self.test_model.similarity('the', 'the'), 1.0)) self.assertEqual(self.test_model.similarity('the', 'and'), self.test_model.similarity('and', 'the')) @@ -318,7 +260,7 @@ def testSimilarity(self): self.assertTrue(np.allclose(self.test_model.similarity('nights', 'nights'), 1.0)) self.assertEqual(self.test_model.similarity('night', 'nights'), self.test_model.similarity('nights', 'night')) - def testMostSimilar(self): + def test_most_similar(self): # In vocab, sanity check self.assertEqual(len(self.test_model.most_similar(positive=['the', 'and'], topn=5)), 5) self.assertEqual(self.test_model.most_similar('the'), self.test_model.most_similar(positive=['the'])) @@ -327,7 +269,7 @@ def testMostSimilar(self): self.assertEqual(self.test_model.most_similar('nights'), self.test_model.most_similar(positive=['nights'])) - def testMostSimilarCosmul(self): + def test_most_similar_cosmul(self): # In vocab, sanity check self.assertEqual(len(self.test_model.most_similar_cosmul(positive=['the', 'and'], topn=5)), 5) self.assertEqual( @@ -339,7 +281,7 @@ def testMostSimilarCosmul(self): self.test_model.most_similar_cosmul('nights'), self.test_model.most_similar_cosmul(positive=['nights'])) - def testLookup(self): + def test_lookup(self): # In vocab, sanity check self.assertTrue('night' in self.test_model.wv.vocab) self.assertTrue(np.allclose(self.test_model['night'], self.test_model[['night']])) @@ -349,7 +291,7 @@ def testLookup(self): # Word with no ngrams in model self.assertRaises(KeyError, lambda: self.test_model['a!@']) - def testContains(self): + def test_contains(self): # In vocab, sanity check self.assertTrue('night' in self.test_model.wv.vocab) self.assertTrue('night' in self.test_model) @@ -360,7 +302,7 @@ def testContains(self): self.assertFalse('a!@' in self.test_model.wv.vocab) self.assertFalse('a!@' in self.test_model) - def testWmdistance(self): + def test_wm_distance(self): doc = ['night', 'payment'] oov_doc = ['nights', 'forests', 'payments'] ngrams_absent_doc = ['a!@', 'b#$'] @@ -370,7 +312,7 @@ def testWmdistance(self): dist = self.test_model.wmdistance(doc, ngrams_absent_doc) 
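# NOTE (illustrative aside, not part of the recorded patch): the assertion that follows, like the
# lookup/containment tests above, exercises FastText's subword fallback. A word missing from the
# vocabulary still gets a vector as long as at least one of its character ngrams was seen during
# training; only when none of its ngrams are known does lookup fail. A rough sketch against an
# already trained `model` (hypothetical variable name):
#
#     model['night']                                          # in-vocab: the stored word vector
#     model['nights']                                         # OOV: composed from its known ngram vectors
#     model['a!@']                                            # no known ngrams at all -> raises KeyError
#     model.wmdistance(['night', 'payment'], ['a!@', 'b#$'])  # degenerate case -> float('inf')
#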
self.assertEqual(float('inf'), dist) - def testDoesntMatch(self): + def test_doesnt_match(self): oov_words = ['nights', 'forests', 'payments'] # Out of vocab check for word in oov_words: @@ -380,214 +322,117 @@ def testDoesntMatch(self): except Exception: self.fail('model.doesnt_match raises exception for oov words') - # def test_against_fasttext_wrapper(self, model_gensim, model_wrapper): - # sims_gensim = model_gensim.most_similar('night', topn=10) - # sims_gensim_words = (list(map(lambda x:x[0], sims_gensim))) - - # sims_wrapper = model_wrapper.most_similar('night', topn=10) - # sims_wrapper_words = (list(map(lambda x:x[0], sims_wrapper))) - - # print(sims_gensim) - # print - # print(sims_wrapper) - # print - # print(sims_gensim_words) - # print - # print(sims_wrapper_words) - # print - # print(set(sims_gensim_words).intersection(sims_wrapper_words)) - # print - - # self.assertEqual(sims_gensim, sims_wrapper) - - # def test_cbow_hs(self): - # if self.ft_exec_path is None: - # logger.info("FT_HOME env variable not set, skipping test") - # return - - # model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'), - # output_file=testfile(), model='cbow', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, - # loss='hs', sample=1e-3, negative=0, iter=3, min_n=3, max_n=6, sorted_vocab=1, threads=1) - - # model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, - # min_count=5, iter=3, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - # sorted_vocab=1, workers=12, min_alpha=0.0) - - # lee_data = LineSentence(datapath('lee_background.cor')) - # model_gensim.build_vocab(lee_data) - # orig0 = np.copy(model_gensim.wv.syn0[0]) - # model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) - # self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training - - # self.test_against_fasttext_wrapper(model_gensim, model_wrapper) - - # def test_cbow_neg(self): - # # if self.ft_exec_path is None: - # # logger.info("FT_HOME env variable not set, skipping test") - # # return - # self.ft_path = "/home/chinmaya/GSOC/Gensim/fastText/fasttext" - - # model_wrapper = FT_wrapper.train(ft_path=self.ft_exec_path, corpus_file=datapath('lee_background.cor'), - # output_file=testfile(), model='cbow', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, loss='ns', - # sample=1e-3, negative=15, iter=3, min_n=3, max_n=6, sorted_vocab=1, threads=1) - - # model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=15, - # min_count=5, iter=3, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - # sorted_vocab=1, workers=1, min_alpha=0.0) - - # lee_data = LineSentence(datapath('lee_background.cor')) - # model_gensim.build_vocab(lee_data) - # orig0 = np.copy(model_gensim.wv.syn0[0]) - # model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) - # self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training - - # self.test_against_fasttext_wrapper(model_gensim, model_wrapper) - - # def test_sg_hs(self): - # if self.ft_exec_path is None: - # logger.info("FT_HOME env variable not set, skipping test") - # return - - # model_wrapper = FT_wrapper.train(ft_path=self.ft_exec_path, corpus_file=datapath('lee_background.cor'), - # output_file=testfile(), model='skipgram', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, - # loss='hs', 
sample=1e-3, negative=0, iter=3, min_n=3, max_n=6, sorted_vocab=1, threads=1) - - # model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, - # min_count=5, iter=3, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - # sorted_vocab=1, workers=12, min_alpha=0.0) - - # lee_data = LineSentence(datapath('lee_background.cor')) - # model_gensim.build_vocab(lee_data) - # orig0 = np.copy(model_gensim.wv.syn0[0]) - # model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) - # self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training - - # self.test_against_fasttext_wrapper(model_gensim, model_wrapper) - - # def test_sg_neg(self): - # # if self.ft_exec_path is None: - # # logger.info("FT_HOME env variable not set, skipping test") - # # return - # self.ft_path = "/home/chinmaya/GSOC/Gensim/fastText/fasttext" - - # model_wrapper = FT_wrapper.train(ft_path=self.ft_exec_path, corpus_file=datapath('lee_background.cor'), - # output_file=testfile(), model='skipgram', size=50, alpha=0.05, window=2, min_count=5, word_ngrams=1, - # loss='ns', sample=1e-3, negative=15, iter=10, min_n=3, max_n=6, sorted_vocab=1, threads=1) - - # model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=0, - # min_count=5, iter=10, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - # sorted_vocab=1, workers=1, min_alpha=0.0) - - # lee_data = LineSentence(datapath('lee_background.cor')) - # model_gensim.build_vocab(lee_data) - # orig0 = np.copy(model_gensim.wv.syn0[0]) - # model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) - # self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training - - # self.test_against_fasttext_wrapper(model_gensim, model_wrapper) - - # def testModelPersistence(self): - # model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=0, - # min_count=5, iter=1, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - # sorted_vocab=1, workers=1, min_alpha=0.0) - - # lee_data = LineSentence(datapath('lee_background.cor')) - # model_gensim.build_vocab(lee_data) - # model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) - - # model_gensim.save(testfile()) - # model_gensim_load = FT_gensim.load(testfile()) - # most_similar_words = model_gensim_load.most_similar('night', topn=10) - # self.assertTrue(len(most_similar_words) == 10) - - # def test_sample(self): - # self.ft_path = "/home/chinmaya/GSOC/Gensim/fastText/fasttext" - # # train_file_path = "/home/chinmaya/GSOC/Gensim/text8_100000" - # train_file_path = "/home/chinmaya/GSOC/Gensim/gensim/gensim/test/test_data/lee_background.cor" - # # model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=train_file_path, - # # output_file=testfile(), model='cbow', size=50, alpha=0.05, window=5, min_count=1, word_ngrams=1, - # # loss='ns', sample=1e-3, negative=15, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12) - - # # model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, - # # min_count=1, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, - # # sorted_vocab=1, workers=12, min_alpha=0.0) - - # model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=train_file_path, - # output_file=testfile(), model='skipgram', size=100, alpha=0.025, window=1, 
min_count=1, word_ngrams=1, - # loss='hs', sample=0, negative=0, iter=10, min_n=3, max_n=6, sorted_vocab=1, threads=1) - - # model_gensim = FT_gensim(size=100, sg=1, cbow_mean=1, alpha=0.025, window=1, hs=1, negative=0, - # min_count=1, iter=10, batch_words=1000, word_ngrams=1, sample=0, min_n=3, max_n=6, - # sorted_vocab=1, workers=1, min_alpha=0.0) - - # train_data = LineSentence(train_file_path) - # model_gensim.build_vocab(train_data) - # model_gensim.train(train_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) - - # self.test_against_fasttext_wrapper(model_gensim, model_wrapper) - - # # model_wrapper.accuracy("/home/chinmaya/GSOC/Gensim/gensim/gensim/test/test_data/test_analogy_questions-words.txt") - # # print - # # model_gensim.accuracy("/home/chinmaya/GSOC/Gensim/gensim/gensim/test/test_data/test_analogy_questions-words.txt") - - # def test_questions_task(self): - # self.ft_path = "/home/chinmaya/GSOC/Gensim/fastText/fasttext" - # train_file_path = "/home/chinmaya/GSOC/Gensim/gensim/gensim/test/test_data/lee_background.cor" - - # questions_file = '/home/chinmaya/GSOC/Gensim/gensim/gensim/test/test_data/questions-words.txt' - - # model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=train_file_path, - # output_file=testfile(), model='skipgram', size=100, alpha=0.025, window=1, min_count=5, word_ngrams=1, - # loss='hs', sample=0, negative=0, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=1) - - # model_gensim = FT_gensim(size=100, sg=1, cbow_mean=1, alpha=0.025, window=1, hs=1, negative=0, - # min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=0, min_n=3, max_n=6, - # sorted_vocab=1, workers=1, min_alpha=0.0) - - # train_data = LineSentence(train_file_path) - # model_gensim.build_vocab(train_data) - # model_gensim.train(train_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) - - # acc_w = model_wrapper.accuracy(questions_file) - - # sem_correct_w = sum((len(acc_w[i]['correct']) for i in range(5))) - # sem_total_w = sum((len(acc_w[i]['correct']) + len(acc_w[i]['incorrect'])) for i in range(5)) - # sem_acc_w = 100*float(sem_correct_w)/sem_total_w - # print('\nSemantic: {:d}/{:d}, Accuracy: {:.2f}%'.format(sem_correct_w, sem_total_w, sem_acc_w)) - - # syn_correct_w = sum((len(acc_w[i]['correct']) for i in range(5, len(acc_w)-1))) - # syn_total_w = sum((len(acc_w[i]['correct']) + len(acc_w[i]['incorrect'])) for i in range(5,len(acc_w)-1)) - # syn_acc_w = 100*float(syn_correct_w)/syn_total_w - # print('Syntactic: {:d}/{:d}, Accuracy: {:.2f}%\n'.format(syn_correct_w, syn_total_w, syn_acc_w)) - - - # acc_g = model_gensim.accuracy(questions_file) - - # sem_correct_g = sum((len(acc_g[i]['correct']) for i in range(5))) - # sem_total_g = sum((len(acc_g[i]['correct']) + len(acc_g[i]['incorrect'])) for i in range(5)) - # sem_acc_g = 100*float(sem_correct_g)/sem_total_g - # print('\nSemantic: {:d}/{:d}, Accuracy: {:.2f}%'.format(sem_correct_g, sem_total_g, sem_acc_g)) - - # syn_correct_g = sum((len(acc_g[i]['correct']) for i in range(5, len(acc_g)-1))) - # syn_total_g = sum((len(acc_g[i]['correct']) + len(acc_g[i]['incorrect'])) for i in range(5,len(acc_g)-1)) - # syn_acc_g = 100*float(syn_correct_g)/syn_total_g - # print('Syntactic: {:d}/{:d}, Accuracy: {:.2f}%\n'.format(syn_correct_g, syn_total_g, syn_acc_g)) - # # return (sem_acc_g, syn_acc_g) - - # def test_debug(self): - # train_data_file = datapath('lee_background_small.cor') - # model_wrapper = FT_wrapper.train(ft_path=self.ft_exec_path, corpus_file=train_data_file, 
- # output_file=testfile(), model='skipgram', size=50, alpha=0.025, window=1, min_count=1, word_ngrams=1, - # loss='hs', sample=0, negative=0, iter=1, min_n=3, max_n=6, sorted_vocab=1, threads=1) - - # model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.025, window=1, hs=1, negative=0, - # min_count=1, iter=1, batch_words=1000, word_ngrams=1, sample=0, min_n=3, max_n=6, - # sorted_vocab=1, workers=1, min_alpha=0.0) - # lee_data = LineSentence(train_data_file) - # model_gensim.build_vocab(lee_data) - # model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) - - # self.test_against_fasttext_wrapper(model_gensim, model_wrapper) + def compare_with_wrapper(self, model_gensim, model_wrapper): + # make sure we get >=3 overlapping words for top-10 similar words suggested for `night` + sims_gensim = model_gensim.most_similar('night', topn=10) + sims_gensim_words = (list(map(lambda x:x[0], sims_gensim))) # get similar words + + sims_wrapper = model_wrapper.most_similar('night', topn=10) + sims_wrapper_words = (list(map(lambda x:x[0], sims_wrapper))) # get similar words + + overlap_count = len(set(sims_gensim_words).intersection(sims_wrapper_words)) + + # overlap increases as we increase `iter` value, min overlap set to 2 to avoid unit-tests taking too long + # this limit can be increased when using Cython code + self.assertGreaterEqual(overlap_count, 2) + + def test_cbow_hs_against_wrapper(self): + if self.ft_path is None: + logger.info("FT_HOME env variable not set, skipping test") + return + + model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'), + output_file=testfile(), model='cbow', size=50, alpha=0.05, window=5, min_count=5, word_ngrams=1, + loss='hs', sample=1e-3, negative=0, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12) + + model_gensim = FT_gensim(size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, + min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + sorted_vocab=1, workers=12, min_alpha=0.0) + + lee_data = LineSentence(datapath('lee_background.cor')) + model_gensim.build_vocab(lee_data) + orig0 = np.copy(model_gensim.wv.syn0[0]) + model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training + self.compare_with_wrapper(model_gensim, model_wrapper) + + def test_sg_hs_against_wrapper(self): + if self.ft_path is None: + logger.info("FT_HOME env variable not set, skipping test") + return + + model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'), + output_file=testfile(), model='skipgram', size=50, alpha=0.025, window=5, min_count=5, word_ngrams=1, + loss='hs', sample=1e-3, negative=0, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12) + + model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, + min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + sorted_vocab=1, workers=12, min_alpha=0.0) + + lee_data = LineSentence(datapath('lee_background.cor')) + model_gensim.build_vocab(lee_data) + orig0 = np.copy(model_gensim.wv.syn0[0]) + model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter) + self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training + self.compare_with_wrapper(model_gensim, model_wrapper) + + # def 
test_online_learning(self): + # model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0) + # model_neg = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=0, negative=5) + # self.assertTrue(len(model_hs.wv.vocab), 12) + # self.assertTrue(model_hs.wv.vocab['graph'].count, 3) + # model_hs.build_vocab(new_sentences, update=True) + # model_neg.build_vocab(new_sentences, update=True) + # self.assertTrue(model_hs.wv.vocab['graph'].count, 4) + # self.assertTrue(model_hs.wv.vocab['artificial'].count, 4) + # self.assertEqual(len(model_hs.wv.vocab), 14) + # self.assertEqual(len(model_neg.wv.vocab), 14) + + + # def test_online_learning_after_save(self): + # model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) + # model_neg.save(testfile()) + # model_neg = FT_gensim.load(testfile()) + # self.assertTrue(len(model_neg.wv.vocab), 12) + # model_neg.build_vocab(new_sentences, update=True) + # model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter) + # self.assertEqual(len(model_neg.wv.vocab), 14) + + # def online_sanity(self, model): + # terro, others = [], [] + # for l in list_corpus: + # if 'terrorism' in l: + # terro.append(l) + # else: + # others.append(l) + # self.assertTrue(all(['terrorism' not in l for l in others])) + # model.build_vocab(others) + # model.train(others, total_examples=model.corpus_count, epochs=model.iter) + # self.assertFalse('terrorism' in model.wv.vocab) + # model.build_vocab(terro, update=True) + # self.assertTrue('terrorism' in model.wv.vocab) + # orig0_all = np.copy(model.wv.syn0_all) + # model.train(terro, total_examples=len(terro), epochs=model.iter) + # self.assertFalse(np.allclose(model.wv.syn0_all, orig0_all)) + # sim = model.n_similarity(['war'], ['terrorism']) + # self.assertLess(0., sim) + + # def test_sg_hs_online(self): + # model = FT_gensim(sg=1, window=5, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=12) + # self.online_sanity(model) + + # def test_sg_neg_online(self): + # model = FT_gensim(sg=1, window=4, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=12) + # self.online_sanity(model) + + # def test_cbow_hs_online(self): + # model = FT_gensim(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=12) + # self.online_sanity(model) + + # def test_cbow_neg_online(self): + # model = FT_gensim(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5, min_count=5, iter=1, seed=42, workers=12, sample=0) + # self.online_sanity(model) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From cf1f3e0a832d5532c5e0f0c38e3ece29279cf12e Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Fri, 25 Aug 2017 11:58:38 +0530 Subject: [PATCH 16/32] removed EOS token from LineSentence --- gensim/models/word2vec.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index ff6aedf563..b5b60361b1 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1539,7 +1539,6 @@ def __iter__(self): self.source.seek(0) for line in itertools.islice(self.source, self.limit): line = utils.to_unicode(line).split() - line.append('/s') i = 0 while i < len(line): yield line[i : i + self.max_sentence_length] @@ -1549,7 +1548,6 @@ def __iter__(self): with utils.smart_open(self.source) as fin: for line in itertools.islice(fin, self.limit): line = utils.to_unicode(line).split() - line.append('/s') i = 0 while i < 
len(line): yield line[i:i + self.max_sentence_length] From d986242f6813fea7e620c9c06350954775eac751 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Fri, 25 Aug 2017 12:00:04 +0530 Subject: [PATCH 17/32] fixed flake8 errors --- gensim/models/fasttext.py | 19 +++++++++++-------- gensim/test/test_fasttext.py | 13 +++++++------ 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 4e9cea863c..0ee68b5c9d 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -2,14 +2,13 @@ # -*- coding: utf-8 -*- import logging - + from types import GeneratorType from copy import deepcopy -from six import string_types import numpy as np -from numpy import dot, zeros, ones, vstack, outer, random, sum as np_sum, empty, float32 as REAL +from numpy import dot, zeros, ones, outer, random, sum as np_sum, empty, float32 as REAL from scipy.special import expit - + from gensim.utils import call_on_class_only from gensim.models.word2vec import Word2Vec from gensim.models.wrappers.fasttext import FastTextKeyedVectors @@ -19,6 +18,7 @@ MAX_WORDS_IN_BATCH = 10000 + def train_batch_cbow(model, sentences, alpha, work=None, neu1=None): result = 0 for sentence in sentences: @@ -48,6 +48,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None): result += len(word_vocabs) return result + def train_cbow_pair(model, word, input_subword_indices, l1, alpha, learn_vectors=True, learn_hidden=True): neu1e = zeros(l1.shape) @@ -80,11 +81,12 @@ def train_cbow_pair(model, word, input_subword_indices, l1, alpha, learn_vectors return neu1e + def train_batch_sg(model, sentences, alpha, work=None): result = 0 for sentence in sentences: - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab] #and - # model.wv.vocab[w].sample_int > model.random.rand() * 2**32] + word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and + model.wv.vocab[w].sample_int > model.random.rand() * 2**32] for pos, word in enumerate(word_vocabs): reduced_window = model.random.randint(model.window) # `b` in the original word2vec code # now go over all words from the (reduced) window, predicting each one in turn @@ -103,6 +105,7 @@ def train_batch_sg(model, sentences, alpha, work=None): result += len(word_vocabs) return result + def train_sg_pair(model, word, input_subword_indices, alpha, learn_vectors=True, learn_hidden=True, context_vectors=None, context_locks=None): if context_vectors is None: context_vectors = model.wv.syn0_all @@ -240,7 +243,7 @@ def init_ngrams(self): ngram_hash = Ft_Wrapper.ft_hash(ngram) ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket) self.wv.ngrams[ngram] = i + len(self.wv.vocab) - + self.wv.syn0_all = self.wv.syn0_all.take(ngram_indices, axis=0) self.reset_ngram_weights() @@ -248,7 +251,7 @@ def reset_ngram_weights(self): rand_obj = np.random rand_obj.seed(self.seed) for index in range(len(self.wv.vocab) + len(self.wv.ngrams)): - self.wv.syn0_all[index] = rand_obj.uniform(-1.0/self.vector_size, 1.0/self.vector_size, self.vector_size) + self.wv.syn0_all[index] = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size) def _do_train_job(self, sentences, alpha, inits): work, neu1 = inits diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 8344e36aff..76b8770a45 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -18,12 +18,14 @@ datapath = lambda fname: os.path.join(module_path, 'test_data', fname) 
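# NOTE (usage sketch, not part of the recorded patch): the *_against_wrapper tests in this module
# compare the pure-Python FastText implementation with the command-line fastText wrapper, and they
# skip themselves unless setUp() can locate a compiled `fasttext` binary through the FT_HOME
# environment variable (setUp() sets self.ft_path to os.path.join(ft_home, 'fasttext') when
# FT_HOME is set). A hypothetical way to run them, assuming gensim is importable:
#
#     FT_HOME=/path/to/fastText python -m unittest gensim.test.test_fasttext
#
# The exact path is an assumption; any checkout containing the built `fasttext` binary works.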
logger = logging.getLogger(__name__) + class LeeCorpus(object): def __iter__(self): with open(datapath('lee_background.cor')) as f: for line in f: yield utils.simple_preprocess(line) + list_corpus = list(LeeCorpus()) sentences = [ @@ -47,6 +49,7 @@ def __iter__(self): ['artificial', 'intelligence', 'system'] ] + def testfile(): # temporary data will be stored to this file return os.path.join(tempfile.gettempdir(), 'gensim_fasttext.tst') @@ -232,7 +235,7 @@ def test_load_model_with_non_ascii_vocab(self): model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext')) self.assertTrue(u'který' in model) try: - vector = model[u'který'] + model[u'který'] except UnicodeDecodeError: self.fail('Unable to access vector for utf8 encoded non-ascii word') @@ -240,7 +243,7 @@ def test_load_model_non_utf8_encoding(self): model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852') self.assertTrue(u'který' in model) try: - vector = model[u'který'] + model[u'který'] except KeyError: self.fail('Unable to access vector for cp-852 word') @@ -268,7 +271,6 @@ def test_most_similar(self): self.assertEqual(len(self.test_model.most_similar(['night', 'nights'], topn=5)), 5) self.assertEqual(self.test_model.most_similar('nights'), self.test_model.most_similar(positive=['nights'])) - def test_most_similar_cosmul(self): # In vocab, sanity check self.assertEqual(len(self.test_model.most_similar_cosmul(positive=['the', 'and'], topn=5)), 5) @@ -325,10 +327,10 @@ def test_doesnt_match(self): def compare_with_wrapper(self, model_gensim, model_wrapper): # make sure we get >=3 overlapping words for top-10 similar words suggested for `night` sims_gensim = model_gensim.most_similar('night', topn=10) - sims_gensim_words = (list(map(lambda x:x[0], sims_gensim))) # get similar words + sims_gensim_words = (list(map(lambda x : x[0], sims_gensim))) # get similar words sims_wrapper = model_wrapper.most_similar('night', topn=10) - sims_wrapper_words = (list(map(lambda x:x[0], sims_wrapper))) # get similar words + sims_wrapper_words = (list(map(lambda x : x[0], sims_wrapper))) # get similar words overlap_count = len(set(sims_gensim_words).intersection(sims_wrapper_words)) @@ -388,7 +390,6 @@ def test_sg_hs_against_wrapper(self): # self.assertEqual(len(model_hs.wv.vocab), 14) # self.assertEqual(len(model_neg.wv.vocab), 14) - # def test_online_learning_after_save(self): # model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) # model_neg.save(testfile()) From bce17ff4216d19d5c079c1669f1c7a6fa5851995 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Fri, 25 Aug 2017 21:27:27 +0530 Subject: [PATCH 18/32] [WIP] added online learning --- gensim/models/fasttext.py | 89 +++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 23 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 0ee68b5c9d..79a9da2664 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -220,32 +220,75 @@ def initialize_ngram_vectors(self): def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey + if update: + if not len(self.wv.vocab): + raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. 
" \ + "First build the vocabulary of your model with a corpus " \ + "before doing an online update.") + self.old_vocab_len = len(self.wv.vocab) + self.old_hash2index_len = len(self.wv.hash2index) self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling self.finalize_vocab(update=update) # build tables & arrays - self.init_ngrams() + self.init_ngrams(update=update) + + def init_ngrams(self, update=False): + if not update: + self.wv.ngrams = {} + self.wv.syn0_all = empty((len(self.wv.vocab) + self.bucket, self.vector_size), dtype=REAL) + self.syn0_all_lockf = ones((len(self.wv.vocab) + self.bucket, self.vector_size), dtype=REAL) + + all_ngrams = [] + for w, v in self.wv.vocab.items(): + self.wv.ngrams_word[w] = Ft_Wrapper.compute_ngrams(w, self.min_n, self.max_n) + all_ngrams += self.wv.ngrams_word[w] + + all_ngrams = list(set(all_ngrams)) + self.num_ngram_vectors = len(self.wv.vocab) + len(all_ngrams) + logger.info("Total number of ngrams is %d", self.num_ngram_vectors) + + self.wv.hash2index = {} + ngram_indices = range(len(self.wv.vocab)) # keeping the first `len(self.wv.vocab)` rows intact + for i, ngram in enumerate(all_ngrams): + ngram_hash = Ft_Wrapper.ft_hash(ngram) + if ngram_hash in self.wv.hash2index: + self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] + else: + ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket) + self.wv.hash2index[ngram_hash] = i + len(self.wv.vocab) + self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] + + self.wv.syn0_all = self.wv.syn0_all.take(ngram_indices, axis=0) + self.reset_ngram_weights() - def init_ngrams(self): - self.wv.ngrams = {} - self.wv.syn0_all = empty((len(self.wv.vocab) + self.bucket, self.vector_size), dtype=REAL) - self.syn0_all_lockf = ones((len(self.wv.vocab) + self.bucket, self.vector_size), dtype=REAL) - - all_ngrams = [] - for w, v in self.wv.vocab.items(): - self.wv.ngrams_word[w] = Ft_Wrapper.compute_ngrams(w, self.min_n, self.max_n) - all_ngrams += self.wv.ngrams_word[w] - - all_ngrams = list(set(all_ngrams)) - self.num_ngram_vectors = len(self.wv.vocab) + len(all_ngrams) - logger.info("Total number of ngrams is %d", self.num_ngram_vectors) - - ngram_indices = range(len(self.wv.vocab)) # keeping the first `len(self.wv.vocab)` rows intact - for i, ngram in enumerate(all_ngrams): - ngram_hash = Ft_Wrapper.ft_hash(ngram) - ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket) - self.wv.ngrams[ngram] = i + len(self.wv.vocab) - - self.wv.syn0_all = self.wv.syn0_all.take(ngram_indices, axis=0) - self.reset_ngram_weights() + else: + new_vocab_len = len(self.wv.vocab) + for ngram, idx in self.wv.hash2index.items(): + self.wv.hash2index[ngram] = idx + new_vocab_len - self.old_vocab_len + + new_ngrams = [] + for w, v in self.wv.vocab.items(): + self.wv.ngrams_word[w] = Ft_Wrapper.compute_ngrams(w, self.min_n, self.max_n) + new_ngrams += [ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams] + + new_ngrams = list(set(new_ngrams)) + new_count = 0 + for i, ngram in enumerate(new_ngrams): + ngram_hash = Ft_Wrapper.ft_hash(ngram) + if ngram_hash not in self.wv.hash2index: + self.wv.hash2index[ngram_hash] = new_count + len(self.wv.vocab) + self.old_hash2index_len + self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] + else: + self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] + + old_vocab_rows = self.wv.syn0_all[0:self.old_vocab_len, ] + old_ngram_rows = self.syn0_all[self.old_vocab_len:, ] + + rand_obj = np.random + 
rand_obj.seed(self.seed) + new_vocab_rows = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, (len(self.wv.vocab) - self.old_vocab_len, self.vector_size)) + new_ngram_rows = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size)) + + self.wv.syn0_all = vstack([old_vocab_rows, new_vocab_rows, old_ngram_rows, new_ngram_rows]) def reset_ngram_weights(self): rand_obj = np.random From cb840011766854404655f8d2b406566b422506de Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Fri, 25 Aug 2017 22:24:56 +0530 Subject: [PATCH 19/32] added tests for online learning --- gensim/models/fasttext.py | 8 +-- gensim/test/test_fasttext.py | 121 ++++++++++++++++++----------------- 2 files changed, 67 insertions(+), 62 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 79a9da2664..15b9f15798 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -6,7 +6,7 @@ from types import GeneratorType from copy import deepcopy import numpy as np -from numpy import dot, zeros, ones, outer, random, sum as np_sum, empty, float32 as REAL +from numpy import dot, zeros, ones, vstack, outer, random, sum as np_sum, empty, float32 as REAL from scipy.special import expit from gensim.utils import call_on_class_only @@ -244,7 +244,7 @@ def init_ngrams(self, update=False): all_ngrams = list(set(all_ngrams)) self.num_ngram_vectors = len(self.wv.vocab) + len(all_ngrams) - logger.info("Total number of ngrams is %d", self.num_ngram_vectors) + logger.info("Total number of ngrams is %d", len(all_ngrams)) self.wv.hash2index = {} ngram_indices = range(len(self.wv.vocab)) # keeping the first `len(self.wv.vocab)` rows intact @@ -259,7 +259,6 @@ def init_ngrams(self, update=False): self.wv.syn0_all = self.wv.syn0_all.take(ngram_indices, axis=0) self.reset_ngram_weights() - else: new_vocab_len = len(self.wv.vocab) for ngram, idx in self.wv.hash2index.items(): @@ -271,6 +270,7 @@ def init_ngrams(self, update=False): new_ngrams += [ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams] new_ngrams = list(set(new_ngrams)) + logger.info("Number of new ngrams is %d", len(new_ngrams)) new_count = 0 for i, ngram in enumerate(new_ngrams): ngram_hash = Ft_Wrapper.ft_hash(ngram) @@ -281,7 +281,7 @@ def init_ngrams(self, update=False): self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] old_vocab_rows = self.wv.syn0_all[0:self.old_vocab_len, ] - old_ngram_rows = self.syn0_all[self.old_vocab_len:, ] + old_ngram_rows = self.wv.syn0_all[self.old_vocab_len:, ] rand_obj = np.random rand_obj.seed(self.seed) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 76b8770a45..1db8f12aa4 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -325,12 +325,12 @@ def test_doesnt_match(self): self.fail('model.doesnt_match raises exception for oov words') def compare_with_wrapper(self, model_gensim, model_wrapper): - # make sure we get >=3 overlapping words for top-10 similar words suggested for `night` + # make sure we get >=2 overlapping words for top-10 similar words suggested for `night` sims_gensim = model_gensim.most_similar('night', topn=10) - sims_gensim_words = (list(map(lambda x : x[0], sims_gensim))) # get similar words + sims_gensim_words = (list(map(lambda x: x[0], sims_gensim))) # get similar words sims_wrapper = model_wrapper.most_similar('night', topn=10) - sims_wrapper_words = (list(map(lambda x : x[0], sims_wrapper))) # get similar words + 
sims_wrapper_words = (list(map(lambda x: x[0], sims_wrapper))) # get similar words overlap_count = len(set(sims_gensim_words).intersection(sims_wrapper_words)) @@ -378,61 +378,66 @@ def test_sg_hs_against_wrapper(self): self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all()) # vector should vary after training self.compare_with_wrapper(model_gensim, model_wrapper) - # def test_online_learning(self): - # model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0) - # model_neg = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=0, negative=5) - # self.assertTrue(len(model_hs.wv.vocab), 12) - # self.assertTrue(model_hs.wv.vocab['graph'].count, 3) - # model_hs.build_vocab(new_sentences, update=True) - # model_neg.build_vocab(new_sentences, update=True) - # self.assertTrue(model_hs.wv.vocab['graph'].count, 4) - # self.assertTrue(model_hs.wv.vocab['artificial'].count, 4) - # self.assertEqual(len(model_hs.wv.vocab), 14) - # self.assertEqual(len(model_neg.wv.vocab), 14) - - # def test_online_learning_after_save(self): - # model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) - # model_neg.save(testfile()) - # model_neg = FT_gensim.load(testfile()) - # self.assertTrue(len(model_neg.wv.vocab), 12) - # model_neg.build_vocab(new_sentences, update=True) - # model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter) - # self.assertEqual(len(model_neg.wv.vocab), 14) - - # def online_sanity(self, model): - # terro, others = [], [] - # for l in list_corpus: - # if 'terrorism' in l: - # terro.append(l) - # else: - # others.append(l) - # self.assertTrue(all(['terrorism' not in l for l in others])) - # model.build_vocab(others) - # model.train(others, total_examples=model.corpus_count, epochs=model.iter) - # self.assertFalse('terrorism' in model.wv.vocab) - # model.build_vocab(terro, update=True) - # self.assertTrue('terrorism' in model.wv.vocab) - # orig0_all = np.copy(model.wv.syn0_all) - # model.train(terro, total_examples=len(terro), epochs=model.iter) - # self.assertFalse(np.allclose(model.wv.syn0_all, orig0_all)) - # sim = model.n_similarity(['war'], ['terrorism']) - # self.assertLess(0., sim) - - # def test_sg_hs_online(self): - # model = FT_gensim(sg=1, window=5, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=12) - # self.online_sanity(model) - - # def test_sg_neg_online(self): - # model = FT_gensim(sg=1, window=4, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=12) - # self.online_sanity(model) - - # def test_cbow_hs_online(self): - # model = FT_gensim(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=12) - # self.online_sanity(model) - - # def test_cbow_neg_online(self): - # model = FT_gensim(sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5, min_count=5, iter=1, seed=42, workers=12, sample=0) - # self.online_sanity(model) + def test_online_learning(self): + model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0) + self.assertTrue(len(model_hs.wv.vocab), 12) + self.assertTrue(len(model_hs.wv.ngrams), 202) + self.assertTrue(model_hs.wv.vocab['graph'].count, 3) + self.assertFalse('tif' in model_hs.wv.ngrams) + model_hs.build_vocab(new_sentences, update=True) # update vocab + self.assertEqual(len(model_hs.wv.vocab), 14) + self.assertTrue(len(model_hs.wv.ngrams), 271) + self.assertTrue(model_hs.wv.vocab['graph'].count, 4) + self.assertTrue(model_hs.wv.vocab['artificial'].count, 4) + self.assertTrue('tif' in 
model_hs.wv.ngrams) # ngram added because of the word `artificial` + + def test_online_learning_after_save(self): + model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) + model_neg.save(testfile()) + model_neg = FT_gensim.load(testfile()) + self.assertTrue(len(model_neg.wv.vocab), 12) + self.assertTrue(len(model_hs.wv.ngrams), 202) + model_neg.build_vocab(new_sentences, update=True) # update vocab + model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter) + self.assertEqual(len(model_neg.wv.vocab), 14) + self.assertTrue(len(model_hs.wv.ngrams), 271) + + def online_sanity(self, model): + terro, others = [], [] + for l in list_corpus: + if 'terrorism' in l: + terro.append(l) + else: + others.append(l) + self.assertTrue(all(['terrorism' not in l for l in others])) + model.build_vocab(others) + model.train(others, total_examples=model.corpus_count, epochs=model.iter) + self.assertFalse('terrorism' in model.wv.vocab) + self.assertFalse('orism>' in model.wv.ngrams) + model.build_vocab(terro, update=True) # update vocab + self.assertTrue('terrorism' in model.wv.vocab) + self.assertTrue('orism>' in model.wv.ngrams) + orig0_all = np.copy(model.wv.syn0_all) + model.train(terro, total_examples=len(terro), epochs=model.iter) + self.assertFalse(np.allclose(model.wv.syn0_all, orig0_all)) + sim = model.n_similarity(['war'], ['terrorism']) + self.assertLess(0., sim) + + def test_sg_hs_online(self): + model = FT_gensim(sg=1, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=12) + self.online_sanity(model) + + def test_sg_neg_online(self): + model = FT_gensim(sg=1, window=2, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=12) + self.online_sanity(model) + + def test_cbow_hs_online(self): + model = FT_gensim(sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=12) + self.online_sanity(model) + + def test_cbow_neg_online(self): + model = FT_gensim(sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=5, min_count=5, iter=1, seed=42, workers=12, sample=0) + self.online_sanity(model) if __name__ == '__main__': From 58c673a826534e3faf3d2eb34d92404cf9595976 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Sat, 26 Aug 2017 02:30:33 +0530 Subject: [PATCH 20/32] flake8 fixes --- gensim/models/fasttext.py | 4 ++-- gensim/test/test_fasttext.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 15b9f15798..7fb1160e2d 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -222,8 +222,8 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_ self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey if update: if not len(self.wv.vocab): - raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. " \ - "First build the vocabulary of your model with a corpus " \ + raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. 
" + "First build the vocabulary of your model with a corpus " "before doing an online update.") self.old_vocab_len = len(self.wv.vocab) self.old_hash2index_len = len(self.wv.hash2index) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 1db8f12aa4..afe34c159e 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -396,11 +396,11 @@ def test_online_learning_after_save(self): model_neg.save(testfile()) model_neg = FT_gensim.load(testfile()) self.assertTrue(len(model_neg.wv.vocab), 12) - self.assertTrue(len(model_hs.wv.ngrams), 202) + self.assertTrue(len(model_neg.wv.ngrams), 202) model_neg.build_vocab(new_sentences, update=True) # update vocab model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter) self.assertEqual(len(model_neg.wv.vocab), 14) - self.assertTrue(len(model_hs.wv.ngrams), 271) + self.assertTrue(len(model_neg.wv.ngrams), 271) def online_sanity(self, model): terro, others = [], [] From 893ef7635b1e57cd3fcfbe0f1c82ae92152f1bfe Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Mon, 28 Aug 2017 03:34:27 +0530 Subject: [PATCH 21/32] refactored code to remove redundancy --- gensim/models/fasttext.py | 149 +++------------------------ gensim/models/word2vec.py | 49 +++++++-- gensim/models/wrappers/fasttext.py | 58 +++++------ gensim/test/test_fasttext_wrapper.py | 4 +- 4 files changed, 84 insertions(+), 176 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 7fb1160e2d..ca7cff6256 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -10,9 +10,9 @@ from scipy.special import expit from gensim.utils import call_on_class_only -from gensim.models.word2vec import Word2Vec +from gensim.models.word2vec import Word2Vec, train_sg_pair, train_cbow_pair from gensim.models.wrappers.fasttext import FastTextKeyedVectors -from gensim.models.wrappers.fasttext import FastText as Ft_Wrapper +from gensim.models.wrappers.fasttext import FastText as Ft_Wrapper, compute_ngrams, ft_hash logger = logging.getLogger(__name__) @@ -44,44 +44,11 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None): if subwords_indices and model.cbow_mean: l1 /= len(subwords_indices) - train_cbow_pair(model, word, subwords_indices, l1, alpha) # train on the sliding window for target word + train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True) # train on the sliding window for target word result += len(word_vocabs) return result -def train_cbow_pair(model, word, input_subword_indices, l1, alpha, learn_vectors=True, learn_hidden=True): - neu1e = zeros(l1.shape) - - if model.hs: - l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - fa = expit(dot(l1, l2a.T)) # propagate hidden -> output - ga = (1. 
- word.code - fa) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1[word.point] += outer(ga, l1) # learn hidden -> output - neu1e += dot(ga, l2a) # save error - - if model.negative: - # use this word (label = 1) + `negative` other random words not from this sentence (label = 0) - word_indices = [word.index] # through word index get all subwords indices (need to make the changes in code) - while len(word_indices) < model.negative + 1: - w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1])) - if w != word.index: - word_indices.append(w) - l2b = model.syn1neg[word_indices] # 2d matrix, k+1 x layer1_size - fb = expit(dot(l1, l2b.T)) # propagate hidden -> output - gb = (model.neg_labels - fb) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1neg[word_indices] += outer(gb, l1) # learn hidden -> output - neu1e += dot(gb, l2b) # save error - - if learn_vectors: - # learn input -> hidden, here for all words in the window separately - for i in input_subword_indices: - model.wv.syn0_all[i] += neu1e * model.syn0_all_lockf[i] - - return neu1e - - def train_batch_sg(model, sentences, alpha, work=None): result = 0 for sentence in sentences: @@ -100,58 +67,12 @@ def train_batch_sg(model, sentences, alpha, work=None): for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): if pos2 != pos: # don't train on the `word` itself - train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha) + train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True) result += len(word_vocabs) return result -def train_sg_pair(model, word, input_subword_indices, alpha, learn_vectors=True, learn_hidden=True, context_vectors=None, context_locks=None): - if context_vectors is None: - context_vectors = model.wv.syn0_all - if context_locks is None: - context_locks = model.syn0_all_lockf - - if word not in model.wv.vocab: - return - predict_word = model.wv.vocab[word] # target word (NN output) - - l1 = np_sum(context_vectors[input_subword_indices], axis=0) - if input_subword_indices: - l1 /= len(input_subword_indices) - - neu1e = zeros(l1.shape) - - if model.hs: - # work on the entire tree at once, to push as much work into numpy's C routines as possible (performance) - l2a = deepcopy(model.syn1[predict_word.point]) # 2d matrix, codelen x layer1_size - fa = expit(dot(l1, l2a.T)) # propagate hidden -> output - ga = (1 - predict_word.code - fa) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1[predict_word.point] += outer(ga, l1) # learn hidden -> output - neu1e += dot(ga, l2a) # save error - - if model.negative: - # use this word (label = 1) + `negative` other random words not from this sentence (label = 0) - word_indices = [predict_word.index] - while len(word_indices) < model.negative + 1: - w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1])) - if w != predict_word.index: - word_indices.append(w) - l2b = model.syn1neg[word_indices] # 2d matrix, k+1 x layer1_size - fb = expit(dot(l1, l2b.T)) # propagate hidden -> output - gb = (model.neg_labels - fb) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1neg[word_indices] += outer(gb, l1) # learn hidden -> output - neu1e += dot(gb, l2b) # save error - - if learn_vectors: - for i in input_subword_indices: - model.wv.syn0_all[i] += neu1e * 
model.syn0_all_lockf[i] - - return neu1e - - class FastText(Word2Vec): def __init__( self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, @@ -159,40 +80,8 @@ def __init__( negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH): - self.load = call_on_class_only - + # fastText specific functions and params self.initialize_ngram_vectors() - - # params common between Word2Vec and fastText - self.sg = int(sg) - self.cum_table = None # for negative sampling - self.vector_size = int(size) - self.layer1_size = int(size) - if size % 4 != 0: - logger.warning("consider setting layer size to a multiple of 4 for greater performance") - self.alpha = float(alpha) - self.min_alpha_yet_reached = float(alpha) # To warn user if alpha increases - self.window = int(window) - self.max_vocab_size = max_vocab_size - self.seed = seed - self.random = random.RandomState(seed) - self.min_count = min_count - self.sample = sample - self.workers = int(workers) - self.min_alpha = float(min_alpha) - self.hs = hs - self.negative = negative - self.cbow_mean = int(cbow_mean) - self.hashfxn = hashfxn - self.iter = iter - self.null_word = null_word - self.train_count = 0 - self.total_train_time = 0 - self.sorted_vocab = sorted_vocab - self.batch_words = batch_words - self.model_trimmed_post_training = False - - # fastText specific params self.bucket = bucket self.word_ngrams = word_ngrams self.min_n = min_n @@ -201,25 +90,17 @@ def __init__( self.bucket = 0 self.wv.min_n = min_n self.wv.max_n = max_n - self.wv.ngrams_word = {} - if sentences is not None: - if isinstance(sentences, GeneratorType): - raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") - self.build_vocab(sentences, trim_rule=trim_rule) - self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, - start_alpha=self.alpha, end_alpha=self.min_alpha) - else: - if trim_rule is not None: - logger.warning("The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part of the model. ") - logger.warning("Model initialized without sentences. trim_rule provided, if any, will be ignored.") + super(FastText, self).__init__(sentences=sentences, size=size, alpha=alpha, window=window, min_count=min_count, + max_vocab_size=max_vocab_size, sample=sample, seed=seed, workers=workers, min_alpha=min_alpha, + sg=sg, hs=hs, negative=negative, cbow_mean=cbow_mean, hashfxn=hashfxn, iter=iter, null_word=null_word, + trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words, init_wv=False) def initialize_ngram_vectors(self): self.wv = FastTextKeyedVectors() def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): - self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey if update: if not len(self.wv.vocab): raise RuntimeError("You cannot do an online vocabulary-update of a model which has no prior vocabulary. 
" @@ -227,8 +108,8 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_ "before doing an online update.") self.old_vocab_len = len(self.wv.vocab) self.old_hash2index_len = len(self.wv.hash2index) - self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling - self.finalize_vocab(update=update) # build tables & arrays + + super(FastText, self).build_vocab(sentences, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, progress_per=progress_per, update=update) self.init_ngrams(update=update) def init_ngrams(self, update=False): @@ -239,7 +120,7 @@ def init_ngrams(self, update=False): all_ngrams = [] for w, v in self.wv.vocab.items(): - self.wv.ngrams_word[w] = Ft_Wrapper.compute_ngrams(w, self.min_n, self.max_n) + self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n) all_ngrams += self.wv.ngrams_word[w] all_ngrams = list(set(all_ngrams)) @@ -249,7 +130,7 @@ def init_ngrams(self, update=False): self.wv.hash2index = {} ngram_indices = range(len(self.wv.vocab)) # keeping the first `len(self.wv.vocab)` rows intact for i, ngram in enumerate(all_ngrams): - ngram_hash = Ft_Wrapper.ft_hash(ngram) + ngram_hash = ft_hash(ngram) if ngram_hash in self.wv.hash2index: self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] else: @@ -266,14 +147,14 @@ def init_ngrams(self, update=False): new_ngrams = [] for w, v in self.wv.vocab.items(): - self.wv.ngrams_word[w] = Ft_Wrapper.compute_ngrams(w, self.min_n, self.max_n) + self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n) new_ngrams += [ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams] new_ngrams = list(set(new_ngrams)) logger.info("Number of new ngrams is %d", len(new_ngrams)) new_count = 0 for i, ngram in enumerate(new_ngrams): - ngram_hash = Ft_Wrapper.ft_hash(ngram) + ngram_hash = ft_hash(ngram) if ngram_hash not in self.wv.hash2index: self.wv.hash2index[ngram_hash] = new_count + len(self.wv.vocab) + self.old_hash2index_len self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index b5b60361b1..4c674d35ca 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -254,18 +254,29 @@ def score_sentence_cbow(model, sentence, work=None, neu1=None): def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_hidden=True, - context_vectors=None, context_locks=None, compute_loss=False): + context_vectors=None, context_locks=None, compute_loss=False, is_ft=False): if context_vectors is None: - context_vectors = model.wv.syn0 + if is_ft: + context_vectors = model.wv.syn0_all + else: + context_vectors = model.wv.syn0 if context_locks is None: - context_locks = model.syn0_lockf + if is_ft: + context_locks = model.syn0_all_lockf + else: + context_locks = model.syn0_lockf if word not in model.wv.vocab: return predict_word = model.wv.vocab[word] # target word (NN output) - l1 = context_vectors[context_index] # input word (NN input/projection layer) - lock_factor = context_locks[context_index] + if is_ft: + l1 = np_sum(context_vectors[context_index], axis=0) + if context_index: + l1 /= len(context_index) + else: + l1 = context_vectors[context_index] # input word (NN input/projection layer) + lock_factor = context_locks[context_index] neu1e = zeros(l1.shape) @@ -306,11 +317,27 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h model.running_training_loss -= log(expit(prod_term[0])) # for the 
output word if learn_vectors: - l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) + if is_ft: + for i in context_index: + model.wv.syn0_all[i] += neu1e * model.syn0_all_lockf[i] + else: + l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) return neu1e -def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, compute_loss=False): +def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, compute_loss=False, + context_vectors=None, context_locks=None, is_ft=False): + if context_vectors is None: + if is_ft: + context_vectors = model.wv.syn0_all + else: + context_vectors = model.wv.syn0 + if context_locks is None: + if is_ft: + context_locks = model.syn0_all_lockf + else: + context_locks = model.syn0_lockf + neu1e = zeros(l1.shape) if model.hs: @@ -352,7 +379,7 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr if not model.cbow_mean and input_word_indices: neu1e /= len(input_word_indices) for i in input_word_indices: - model.wv.syn0[i] += neu1e * model.syn0_lockf[i] + context_vectors[i] += neu1e * context_locks[i] return neu1e @@ -389,7 +416,8 @@ def __init__( self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False): + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, + init_wv=True): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. 
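For context, the `is_ft` branches above build the projection input `l1` by summing the `syn0_all` rows selected by the combined word-and-ngram index list and dividing by its length; a minimal standalone numpy sketch of that averaging, with made-up indices and sizes rather than real gensim state:

import numpy as np

rng = np.random.RandomState(1)
syn0_all = rng.uniform(-0.05, 0.05, (10, 4))  # toy weight matrix: vocab rows followed by ngram-bucket rows
context_index = [2, 7, 8, 9]                  # hypothetical word row plus its ngram-bucket rows

l1 = np.sum(syn0_all[context_index], axis=0)  # combine word and subword vectors
l1 /= len(context_index)                      # cbow_mean-style averaging
print(l1.shape)                               # (4,), same size as a single word vector
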
@@ -467,7 +495,8 @@ def __init__( else: logger.debug('Fast version of {0} is being used'.format(__name__)) - self.initialize_word_vectors() + if init_wv: + self.initialize_word_vectors() self.sg = int(sg) self.cum_table = None # for negative sampling self.vector_size = int(size) diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py index 9f68d67ca0..221b6c33b0 100644 --- a/gensim/models/wrappers/fasttext.py +++ b/gensim/models/wrappers/fasttext.py @@ -80,7 +80,7 @@ def word_vec(self, word, use_norm=False): return super(FastTextKeyedVectors, self).word_vec(word, use_norm) else: word_vec = np.zeros(self.syn0_all.shape[1]) - ngrams = FastText.compute_ngrams(word, self.min_n, self.max_n) + ngrams = compute_ngrams(word, self.min_n, self.max_n) ngrams = [ng for ng in ngrams if ng in self.ngrams] if use_norm: ngram_weights = self.syn0_all_norm @@ -123,7 +123,7 @@ def __contains__(self, word): if word in self.vocab: return True else: - word_ngrams = set(FastText.compute_ngrams(word, self.min_n, self.max_n)) + word_ngrams = set(compute_ngrams(word, self.min_n, self.max_n)) if len(word_ngrams & set(self.ngrams.keys())): return True else: @@ -364,14 +364,14 @@ def init_ngrams(self): self.wv.syn0 = np.zeros((len(self.wv.vocab), self.vector_size), dtype=REAL) for w, vocab in self.wv.vocab.items(): - all_ngrams += self.compute_ngrams(w, self.wv.min_n, self.wv.max_n) + all_ngrams += compute_ngrams(w, self.wv.min_n, self.wv.max_n) self.wv.syn0[vocab.index] += np.array(self.wv.syn0_all[vocab.index]) all_ngrams = set(all_ngrams) self.num_ngram_vectors = len(all_ngrams) ngram_indices = [] for i, ngram in enumerate(all_ngrams): - ngram_hash = self.ft_hash(ngram) + ngram_hash = ft_hash(ngram) ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket) self.wv.ngrams[ngram] = i self.wv.syn0_all = self.wv.syn0_all.take(ngram_indices, axis=0) @@ -381,37 +381,35 @@ def init_ngrams(self): logger.info("loading weights for %s words for fastText model from %s", len(self.wv.vocab), self.file_name) for w, vocab in self.wv.vocab.items(): - word_ngrams = self.compute_ngrams(w, self.wv.min_n, self.wv.max_n) + word_ngrams = compute_ngrams(w, self.wv.min_n, self.wv.max_n) for word_ngram in word_ngrams: self.wv.syn0[vocab.index] += np.array(ngram_weights[self.wv.ngrams[word_ngram]]) self.wv.syn0[vocab.index] /= (len(word_ngrams) + 1) logger.info("loaded %s weight matrix for fastText model from %s", self.wv.syn0.shape, self.file_name) - @staticmethod - def compute_ngrams(word, min_n, max_n): - ngram_indices = [] - BOW, EOW = ('<', '>') # Used by FastText to attach to all words as prefix and suffix - extended_word = BOW + word + EOW - ngrams = [] - for ngram_length in range(min_n, min(len(extended_word), max_n) + 1): - for i in range(0, len(extended_word) - ngram_length + 1): - ngrams.append(extended_word[i:i + ngram_length]) - return ngrams - - @staticmethod - def ft_hash(string): - """ - Reproduces [hash method](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc) - used in fastText. 
+def compute_ngrams(word, min_n, max_n): + ngram_indices = [] + BOW, EOW = ('<', '>') # Used by FastText to attach to all words as prefix and suffix + extended_word = BOW + word + EOW + ngrams = [] + for ngram_length in range(min_n, min(len(extended_word), max_n) + 1): + for i in range(0, len(extended_word) - ngram_length + 1): + ngrams.append(extended_word[i:i + ngram_length]) + return ngrams + +def ft_hash(string): + """ + Reproduces [hash method](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc) + used in fastText. - """ - # Runtime warnings for integer overflow are raised, this is expected behaviour. These warnings are suppressed. - old_settings = np.seterr(all='ignore') - h = np.uint32(2166136261) - for c in string: - h = h ^ np.uint32(ord(c)) - h = h * np.uint32(16777619) - np.seterr(**old_settings) - return h + """ + # Runtime warnings for integer overflow are raised, this is expected behaviour. These warnings are suppressed. + old_settings = np.seterr(all='ignore') + h = np.uint32(2166136261) + for c in string: + h = h ^ np.uint32(ord(c)) + h = h * np.uint32(16777619) + np.seterr(**old_settings) + return h diff --git a/gensim/test/test_fasttext_wrapper.py b/gensim/test/test_fasttext_wrapper.py index bf6ac7db98..7453f08b4b 100644 --- a/gensim/test/test_fasttext_wrapper.py +++ b/gensim/test/test_fasttext_wrapper.py @@ -326,9 +326,9 @@ def testDoesntMatch(self): def testHash(self): # Tests FastText.ft_hash method return values to those obtained from original C implementation - ft_hash = fasttext.FastText.ft_hash('test') + ft_hash = fasttext.ft_hash('test') self.assertEqual(ft_hash, 2949673445) - ft_hash = fasttext.FastText.ft_hash('word') + ft_hash = fasttext.ft_hash('word') self.assertEqual(ft_hash, 1788406269) From e12f6c034f8613b6cc355744e369c2ce6b946ecf Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Mon, 28 Aug 2017 04:05:49 +0530 Subject: [PATCH 22/32] reusing 'word_vec' from 'FastTextKeyedVectors' --- gensim/models/fasttext.py | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index ca7cff6256..566b3fb451 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -128,7 +128,7 @@ def init_ngrams(self, update=False): logger.info("Total number of ngrams is %d", len(all_ngrams)) self.wv.hash2index = {} - ngram_indices = range(len(self.wv.vocab)) # keeping the first `len(self.wv.vocab)` rows intact + ngram_indices = list(range(len(self.wv.vocab))) # keeping the first `len(self.wv.vocab)` rows intact for i, ngram in enumerate(all_ngrams): ngram_hash = ft_hash(ngram) if ngram_hash in self.wv.hash2index: @@ -214,26 +214,7 @@ def get_vocab_word_vecs(self): self.wv.syn0[v.index] = word_vec def word_vec(self, word, use_norm=False): - if word in self.wv.vocab: - if use_norm: - return self.wv.syn0norm[self.wv.vocab[word].index] - else: - return self.wv.syn0[self.wv.vocab[word].index] - else: - logger.info("Word is out of vocabulary") - word_vec = np.zeros(self.wv.syn0_all.shape[1]) - ngrams = Ft_Wrapper.compute_ngrams(word, self.min_n, self.max_n) - ngrams = [ng for ng in ngrams if ng in self.wv.ngrams] - if use_norm: - ngram_weights = self.wv.syn0_all_norm - else: - ngram_weights = self.wv.syn0_all - for ngram in ngrams: - word_vec += ngram_weights[self.wv.ngrams[ngram]] - if word_vec.any(): - return word_vec / len(ngrams) - else: # No ngrams of the word are present in self.ngrams - raise KeyError('all ngrams for word %s absent from model' % word) + 
return FastTextKeyedVectors.word_vec(self.wv, word, use_norm=use_norm) @classmethod def load_fasttext_format(cls, *args, **kwargs): From 39d14bdf6661f39473c59c7b9393aaf979f8772a Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Mon, 28 Aug 2017 04:11:57 +0530 Subject: [PATCH 23/32] flake8 fixes --- gensim/models/fasttext.py | 6 +----- gensim/models/wrappers/fasttext.py | 3 ++- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 566b3fb451..a47e3de8f3 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -3,13 +3,9 @@ import logging -from types import GeneratorType -from copy import deepcopy import numpy as np -from numpy import dot, zeros, ones, vstack, outer, random, sum as np_sum, empty, float32 as REAL -from scipy.special import expit +from numpy import zeros, ones, vstack, sum as np_sum, empty, float32 as REAL -from gensim.utils import call_on_class_only from gensim.models.word2vec import Word2Vec, train_sg_pair, train_cbow_pair from gensim.models.wrappers.fasttext import FastTextKeyedVectors from gensim.models.wrappers.fasttext import FastText as Ft_Wrapper, compute_ngrams, ft_hash diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py index 221b6c33b0..6c6e8188e6 100644 --- a/gensim/models/wrappers/fasttext.py +++ b/gensim/models/wrappers/fasttext.py @@ -388,8 +388,8 @@ def init_ngrams(self): self.wv.syn0[vocab.index] /= (len(word_ngrams) + 1) logger.info("loaded %s weight matrix for fastText model from %s", self.wv.syn0.shape, self.file_name) + def compute_ngrams(word, min_n, max_n): - ngram_indices = [] BOW, EOW = ('<', '>') # Used by FastText to attach to all words as prefix and suffix extended_word = BOW + word + EOW ngrams = [] @@ -398,6 +398,7 @@ def compute_ngrams(word, min_n, max_n): ngrams.append(extended_word[i:i + ngram_length]) return ngrams + def ft_hash(string): """ Reproduces [hash method](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc) From d3ec5a8eb497e0730d286edc1db1f5f78e6313ad Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 29 Aug 2017 19:52:12 +0530 Subject: [PATCH 24/32] split 'syn0_all' into 'syn0_vocab' and 'syn0_ngrams' --- gensim/models/fasttext.py | 78 +++++++++++++++------------- gensim/models/word2vec.py | 38 +++++++++----- gensim/models/wrappers/fasttext.py | 41 ++++++++------- gensim/test/test_fasttext.py | 25 +++++---- gensim/test/test_fasttext_wrapper.py | 18 +++---- 5 files changed, 115 insertions(+), 85 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index a47e3de8f3..5d6698108c 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -27,18 +27,23 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None): word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] word2_subwords = [] - subwords_indices = [] + vocab_subwords_indices = [] + ngrams_subwords_indices = [] for index in word2_indices: - subwords_indices += [index] + vocab_subwords_indices += [index] word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]] for subword in word2_subwords: - subwords_indices.append(model.wv.ngrams[subword]) + ngrams_subwords_indices.append(model.wv.ngrams[subword]) + + l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0) # 1 x vector_size + l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0) # 1 x vector_size - l1 = 
np_sum(model.wv.syn0_all[subwords_indices], axis=0) # 1 x vector_size - if subwords_indices and model.cbow_mean: - l1 /= len(subwords_indices) + l1 = np_sum([l1_vocab, l1_ngrams], axis=0) + subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices] + if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean: + l1 /= (len(subwords_indices[0]) + len(subwords_indices[1])) train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True) # train on the sliding window for target word result += len(word_vocabs) @@ -76,24 +81,20 @@ def __init__( negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH): - # fastText specific functions and params - self.initialize_ngram_vectors() + # fastText specific params self.bucket = bucket self.word_ngrams = word_ngrams self.min_n = min_n self.max_n = max_n if self.word_ngrams <= 1 and self.max_n == 0: self.bucket = 0 - self.wv.min_n = min_n - self.wv.max_n = max_n - self.wv.ngrams_word = {} super(FastText, self).__init__(sentences=sentences, size=size, alpha=alpha, window=window, min_count=min_count, max_vocab_size=max_vocab_size, sample=sample, seed=seed, workers=workers, min_alpha=min_alpha, sg=sg, hs=hs, negative=negative, cbow_mean=cbow_mean, hashfxn=hashfxn, iter=iter, null_word=null_word, - trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words, init_wv=False) + trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words, init_wv=True) - def initialize_ngram_vectors(self): + def initialize_word_vectors(self): self.wv = FastTextKeyedVectors() def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): @@ -111,8 +112,11 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_ def init_ngrams(self, update=False): if not update: self.wv.ngrams = {} - self.wv.syn0_all = empty((len(self.wv.vocab) + self.bucket, self.vector_size), dtype=REAL) - self.syn0_all_lockf = ones((len(self.wv.vocab) + self.bucket, self.vector_size), dtype=REAL) + self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size), dtype=REAL) + self.syn0_vocab_lockf = ones((len(self.wv.vocab), self.vector_size), dtype=REAL) + + self.wv.syn0_ngrams = empty((self.bucket, self.vector_size), dtype=REAL) + self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size), dtype=REAL) all_ngrams = [] for w, v in self.wv.vocab.items(): @@ -120,27 +124,26 @@ def init_ngrams(self, update=False): all_ngrams += self.wv.ngrams_word[w] all_ngrams = list(set(all_ngrams)) - self.num_ngram_vectors = len(self.wv.vocab) + len(all_ngrams) + self.num_ngram_vectors = len(all_ngrams) logger.info("Total number of ngrams is %d", len(all_ngrams)) self.wv.hash2index = {} - ngram_indices = list(range(len(self.wv.vocab))) # keeping the first `len(self.wv.vocab)` rows intact + ngram_indices = [] + new_hash_count = 0 for i, ngram in enumerate(all_ngrams): ngram_hash = ft_hash(ngram) if ngram_hash in self.wv.hash2index: self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] else: - ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket) - self.wv.hash2index[ngram_hash] = i + len(self.wv.vocab) + ngram_indices.append(ngram_hash % self.bucket) + self.wv.hash2index[ngram_hash] = new_hash_count self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] + new_hash_count = new_hash_count + 1 - self.wv.syn0_all = self.wv.syn0_all.take(ngram_indices, axis=0) + self.wv.syn0_ngrams = 
self.wv.syn0_ngrams.take(ngram_indices, axis=0) + self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices, axis=0) self.reset_ngram_weights() else: - new_vocab_len = len(self.wv.vocab) - for ngram, idx in self.wv.hash2index.items(): - self.wv.hash2index[ngram] = idx + new_vocab_len - self.old_vocab_len - new_ngrams = [] for w, v in self.wv.vocab.items(): self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n) @@ -148,30 +151,35 @@ def init_ngrams(self, update=False): new_ngrams = list(set(new_ngrams)) logger.info("Number of new ngrams is %d", len(new_ngrams)) - new_count = 0 + new_hash_count = 0 for i, ngram in enumerate(new_ngrams): ngram_hash = ft_hash(ngram) if ngram_hash not in self.wv.hash2index: - self.wv.hash2index[ngram_hash] = new_count + len(self.wv.vocab) + self.old_hash2index_len + self.wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] + new_hash_count = new_hash_count + 1 else: self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] - old_vocab_rows = self.wv.syn0_all[0:self.old_vocab_len, ] - old_ngram_rows = self.wv.syn0_all[self.old_vocab_len:, ] - rand_obj = np.random rand_obj.seed(self.seed) new_vocab_rows = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, (len(self.wv.vocab) - self.old_vocab_len, self.vector_size)) + new_vocab_lockf_rows = ones((len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL) new_ngram_rows = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size)) + new_ngram_lockf_rows = ones((len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size), dtype=REAL) - self.wv.syn0_all = vstack([old_vocab_rows, new_vocab_rows, old_ngram_rows, new_ngram_rows]) + self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows]) + self.syn0_vocab_lockf = vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows]) + self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows]) + self.syn0_ngrams_lockf = vstack([self.syn0_ngrams_lockf, new_ngram_lockf_rows]) def reset_ngram_weights(self): rand_obj = np.random rand_obj.seed(self.seed) - for index in range(len(self.wv.vocab) + len(self.wv.ngrams)): - self.wv.syn0_all[index] = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size) + for index in range(len(self.wv.vocab)): + self.wv.syn0_vocab[index] = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size) + for index in range(len(self.wv.hash2index)): + self.wv.syn0_ngrams[index] = rand_obj.uniform(-1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size) def _do_train_job(self, sentences, alpha, inits): work, neu1 = inits @@ -201,9 +209,9 @@ def __getitem__(self, word): def get_vocab_word_vecs(self): for w, v in self.wv.vocab.items(): - word_vec = self.wv.syn0_all[v.index] + word_vec = self.wv.syn0_vocab[v.index] ngrams = self.wv.ngrams_word[w] - ngram_weights = self.wv.syn0_all + ngram_weights = self.wv.syn0_ngrams for ngram in ngrams: word_vec += ngram_weights[self.wv.ngrams[ngram]] word_vec /= (len(ngrams) + 1) @@ -217,5 +225,5 @@ def load_fasttext_format(cls, *args, **kwargs): return Ft_Wrapper.load_fasttext_format(*args, **kwargs) def save(self, *args, **kwargs): - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_all_norm']) + kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm']) super(FastText, self).save(*args, **kwargs) diff --git 
a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 4c674d35ca..37c5fd5e6c 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -257,12 +257,14 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h context_vectors=None, context_locks=None, compute_loss=False, is_ft=False): if context_vectors is None: if is_ft: - context_vectors = model.wv.syn0_all + context_vectors_vocab = model.wv.syn0_vocab + context_vectors_ngrams = model.wv.syn0_ngrams else: context_vectors = model.wv.syn0 if context_locks is None: if is_ft: - context_locks = model.syn0_all_lockf + context_locks_vocab = model.syn0_vocab_lockf + context_locks_ngrams = model.syn0_ngrams_lockf else: context_locks = model.syn0_lockf @@ -271,9 +273,10 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h predict_word = model.wv.vocab[word] # target word (NN output) if is_ft: - l1 = np_sum(context_vectors[context_index], axis=0) + l1_vocab = context_vectors_vocab[context_index[0]] + l1_ngrams = np_sum(context_vectors_ngrams[context_index[1:]], axis=0) if context_index: - l1 /= len(context_index) + l1 = np_sum([l1_vocab, l1_ngrams], axis=0) / len(context_index) else: l1 = context_vectors[context_index] # input word (NN input/projection layer) lock_factor = context_locks[context_index] @@ -318,8 +321,9 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h if learn_vectors: if is_ft: - for i in context_index: - model.wv.syn0_all[i] += neu1e * model.syn0_all_lockf[i] + model.wv.syn0_vocab[context_index[0]] += neu1e * model.syn0_vocab_lockf[context_index[0]] + for i in context_index[1:]: + model.wv.syn0_ngrams[i] += neu1e * model.syn0_ngrams_lockf[i] else: l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) return neu1e @@ -329,12 +333,14 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr context_vectors=None, context_locks=None, is_ft=False): if context_vectors is None: if is_ft: - context_vectors = model.wv.syn0_all + context_vectors_vocab = model.wv.syn0_vocab + context_vectors_ngrams = model.wv.syn0_ngrams else: context_vectors = model.wv.syn0 if context_locks is None: if is_ft: - context_locks = model.syn0_all_lockf + context_locks_vocab = model.syn0_vocab_lockf + context_locks_ngrams = model.syn0_ngrams_lockf else: context_locks = model.syn0_lockf @@ -376,10 +382,18 @@ def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=Tr if learn_vectors: # learn input -> hidden, here for all words in the window separately - if not model.cbow_mean and input_word_indices: - neu1e /= len(input_word_indices) - for i in input_word_indices: - context_vectors[i] += neu1e * context_locks[i] + if is_ft: + if not model.cbow_mean and input_word_indices: + neu1e /= (len(input_word_indices[0]) + len(input_word_indices[1])) + for i in input_word_indices[0]: + context_vectors_vocab[i] += neu1e * context_locks_vocab[i] + for i in input_word_indices[1]: + context_vectors_ngrams[i] += neu1e * context_locks_ngrams[i] + else: + if not model.cbow_mean and input_word_indices: + neu1e /= len(input_word_indices) + for i in input_word_indices: + context_vectors[i] += neu1e * context_locks[i] return neu1e diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py index 6c6e8188e6..f8ce1efc31 100644 --- a/gensim/models/wrappers/fasttext.py +++ b/gensim/models/wrappers/fasttext.py @@ -54,12 +54,17 @@ class 
FastTextKeyedVectors(KeyedVectors): """ def __init__(self): super(FastTextKeyedVectors, self).__init__() - self.syn0_all_norm = None + self.syn0_vocab = None + self.syn0_vocab_norm = None + self.syn0_ngrams = None + self.syn0_ngrams_norm = None self.ngrams = {} + self.hash2index = {} + self.ngrams_word = {} def save(self, *args, **kwargs): # don't bother storing the cached normalized vectors - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_all_norm']) + kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm']) super(FastTextKeyedVectors, self).save(*args, **kwargs) def word_vec(self, word, use_norm=False): @@ -79,13 +84,13 @@ def word_vec(self, word, use_norm=False): if word in self.vocab: return super(FastTextKeyedVectors, self).word_vec(word, use_norm) else: - word_vec = np.zeros(self.syn0_all.shape[1]) + word_vec = np.zeros(self.syn0_ngrams.shape[1]) ngrams = compute_ngrams(word, self.min_n, self.max_n) ngrams = [ng for ng in ngrams if ng in self.ngrams] if use_norm: - ngram_weights = self.syn0_all_norm + ngram_weights = self.syn0_ngrams_norm else: - ngram_weights = self.syn0_all + ngram_weights = self.syn0_ngrams for ngram in ngrams: word_vec += ngram_weights[self.ngrams[ngram]] if word_vec.any(): @@ -105,14 +110,14 @@ def init_sims(self, replace=False): """ super(FastTextKeyedVectors, self).init_sims(replace) - if getattr(self, 'syn0_all_norm', None) is None or replace: + if getattr(self, 'syn0_ngrams_norm', None) is None or replace: logger.info("precomputing L2-norms of ngram weight vectors") if replace: - for i in xrange(self.syn0_all.shape[0]): - self.syn0_all[i, :] /= sqrt((self.syn0_all[i, :] ** 2).sum(-1)) - self.syn0_all_norm = self.syn0_all + for i in xrange(self.syn0_ngrams.shape[0]): + self.syn0_ngrams[i, :] /= sqrt((self.syn0_ngrams[i, :] ** 2).sum(-1)) + self.syn0_ngrams_norm = self.syn0_ngrams else: - self.syn0_all_norm = (self.syn0_all / sqrt((self.syn0_all ** 2).sum(-1))[..., newaxis]).astype(REAL) + self.syn0_ngrams_norm = (self.syn0_ngrams / sqrt((self.syn0_ngrams ** 2).sum(-1))[..., newaxis]).astype(REAL) def __contains__(self, word): """ @@ -219,7 +224,7 @@ def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, a def save(self, *args, **kwargs): # don't bother storing the cached normalized vectors - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_all_norm']) + kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm']) super(FastText, self).save(*args, **kwargs) @classmethod @@ -340,11 +345,11 @@ def load_vectors(self, file_handle): dtype = np.dtype(np.float64) self.num_original_vectors = num_vectors - self.wv.syn0_all = np.fromfile(file_handle, dtype=dtype, count=num_vectors * dim) - self.wv.syn0_all = self.wv.syn0_all.reshape((num_vectors, dim)) - assert self.wv.syn0_all.shape == (self.bucket + len(self.wv.vocab), self.vector_size), \ + self.wv.syn0_ngrams = np.fromfile(file_handle, dtype=dtype, count=num_vectors * dim) + self.wv.syn0_ngrams = self.wv.syn0_ngrams.reshape((num_vectors, dim)) + assert self.wv.syn0_ngrams.shape == (self.bucket + len(self.wv.vocab), self.vector_size), \ 'mismatch between actual weight matrix shape {} and expected shape {}'.format( - self.wv.syn0_all.shape, (self.bucket + len(self.wv.vocab), self.vector_size)) + self.wv.syn0_ngrams.shape, (self.bucket + len(self.wv.vocab), self.vector_size)) self.init_ngrams() @@ -365,7 +370,7 @@ def init_ngrams(self): for w, vocab in self.wv.vocab.items(): all_ngrams += compute_ngrams(w, 
self.wv.min_n, self.wv.max_n) - self.wv.syn0[vocab.index] += np.array(self.wv.syn0_all[vocab.index]) + self.wv.syn0[vocab.index] += np.array(self.wv.syn0_ngrams[vocab.index]) all_ngrams = set(all_ngrams) self.num_ngram_vectors = len(all_ngrams) @@ -374,9 +379,9 @@ def init_ngrams(self): ngram_hash = ft_hash(ngram) ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket) self.wv.ngrams[ngram] = i - self.wv.syn0_all = self.wv.syn0_all.take(ngram_indices, axis=0) + self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0) - ngram_weights = self.wv.syn0_all + ngram_weights = self.wv.syn0_ngrams logger.info("loading weights for %s words for fastText model from %s", len(self.wv.vocab), self.file_name) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index afe34c159e..d92cf73352 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -74,7 +74,8 @@ def test_training(self): self.assertEqual(model.wv.syn0.shape, (12, 10)) self.assertEqual(len(model.wv.vocab), 12) - self.assertEqual(model.wv.syn0_all.shape[1], 10) + self.assertEqual(model.wv.syn0_vocab.shape[1], 10) + self.assertEqual(model.wv.syn0_ngrams.shape[1], 10) self.model_sanity(model) # test querying for "most similar" by vector @@ -90,7 +91,8 @@ def test_training(self): def models_equal(self, model, model2): self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) self.assertEqual(model.num_ngram_vectors, model2.num_ngram_vectors) - self.assertTrue(np.allclose(model.wv.syn0_all, model2.wv.syn0_all)) + self.assertTrue(np.allclose(model.wv.syn0_vocab, model2.wv.syn0_vocab)) + self.assertTrue(np.allclose(model.wv.syn0_ngrams, model2.wv.syn0_ngrams)) self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0)) if model.hs: self.assertTrue(np.allclose(model.syn1, model2.syn1)) @@ -107,7 +109,7 @@ def test_persistence(self): wv = model.wv wv.save(testfile()) loaded_wv = FastTextKeyedVectors.load(testfile()) - self.assertTrue(np.allclose(wv.syn0_all, loaded_wv.syn0_all)) + self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams)) self.assertEqual(len(wv.vocab), len(loaded_wv.vocab)) self.assertEqual(len(wv.ngrams), len(loaded_wv.ngrams)) @@ -117,17 +119,18 @@ def test_norm_vectors_not_saved(self): model.save(testfile()) loaded_model = FT_gensim.load(testfile()) self.assertTrue(loaded_model.wv.syn0norm is None) - self.assertTrue(loaded_model.wv.syn0_all_norm is None) + self.assertTrue(loaded_model.wv.syn0_ngrams_norm is None) wv = model.wv wv.save(testfile()) loaded_kv = FastTextKeyedVectors.load(testfile()) self.assertTrue(loaded_kv.syn0norm is None) - self.assertTrue(loaded_kv.syn0_all_norm is None) + self.assertTrue(loaded_kv.syn0_ngrams_norm is None) def model_sanity(self, model): self.assertEqual(model.wv.syn0.shape, (len(model.wv.vocab), model.vector_size)) - self.assertEqual(model.wv.syn0_all.shape, (model.num_ngram_vectors, model.vector_size)) + self.assertEqual(model.wv.syn0_vocab.shape, (len(model.wv.vocab), model.vector_size)) + self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model.vector_size)) def test_load_fasttext_format(self): try: @@ -137,7 +140,7 @@ def test_load_fasttext_format(self): vocab_size, model_size = 1762, 10 self.assertEqual(model.wv.syn0.shape, (vocab_size, model_size)) self.assertEqual(len(model.wv.vocab), vocab_size, model_size) - self.assertEqual(model.wv.syn0_all.shape, (model.num_ngram_vectors, model_size)) + self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model_size)) expected_vec = [ 
-0.57144, @@ -187,7 +190,7 @@ def test_load_fasttext_new_format(self): vocab_size, model_size = 1763, 10 self.assertEqual(new_model.wv.syn0.shape, (vocab_size, model_size)) self.assertEqual(len(new_model.wv.vocab), vocab_size, model_size) - self.assertEqual(new_model.wv.syn0_all.shape, (new_model.num_ngram_vectors, model_size)) + self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, model_size)) expected_vec = [ -0.025627, @@ -228,7 +231,7 @@ def test_load_fasttext_new_format(self): self.assertEquals(new_model.wv.max_n, 6) self.assertEquals(new_model.wv.min_n, 3) self.assertEqual(new_model.wv.syn0.shape, (len(new_model.wv.vocab), new_model.vector_size)) - self.assertEqual(new_model.wv.syn0_all.shape, (new_model.num_ngram_vectors, new_model.vector_size)) + self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, new_model.vector_size)) # self.modelSanity(new_model) def test_load_model_with_non_ascii_vocab(self): @@ -417,9 +420,9 @@ def online_sanity(self, model): model.build_vocab(terro, update=True) # update vocab self.assertTrue('terrorism' in model.wv.vocab) self.assertTrue('orism>' in model.wv.ngrams) - orig0_all = np.copy(model.wv.syn0_all) + orig0_all = np.copy(model.wv.syn0_ngrams) model.train(terro, total_examples=len(terro), epochs=model.iter) - self.assertFalse(np.allclose(model.wv.syn0_all, orig0_all)) + self.assertFalse(np.allclose(model.wv.syn0_ngrams, orig0_all)) sim = model.n_similarity(['war'], ['terrorism']) self.assertLess(0., sim) diff --git a/gensim/test/test_fasttext_wrapper.py b/gensim/test/test_fasttext_wrapper.py index 7453f08b4b..2f3cf44f01 100644 --- a/gensim/test/test_fasttext_wrapper.py +++ b/gensim/test/test_fasttext_wrapper.py @@ -41,13 +41,13 @@ def setUp(self): def model_sanity(self, model): """Even tiny models trained on any corpus should pass these sanity checks""" self.assertEqual(model.wv.syn0.shape, (len(model.wv.vocab), model.vector_size)) - self.assertEqual(model.wv.syn0_all.shape, (model.num_ngram_vectors, model.vector_size)) + self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model.vector_size)) def models_equal(self, model1, model2): self.assertEqual(len(model1.wv.vocab), len(model2.wv.vocab)) self.assertEqual(set(model1.wv.vocab.keys()), set(model2.wv.vocab.keys())) self.assertTrue(numpy.allclose(model1.wv.syn0, model2.wv.syn0)) - self.assertTrue(numpy.allclose(model1.wv.syn0_all, model2.wv.syn0_all)) + self.assertTrue(numpy.allclose(model1.wv.syn0_ngrams, model2.wv.syn0_ngrams)) def testTraining(self): """Test self.test_model successfully trained, parameters and weights correctly loaded""" @@ -60,7 +60,7 @@ def testTraining(self): self.assertEqual(trained_model.wv.syn0.shape, (vocab_size, model_size)) self.assertEqual(len(trained_model.wv.vocab), vocab_size) - self.assertEqual(trained_model.wv.syn0_all.shape[1], model_size) + self.assertEqual(trained_model.wv.syn0_ngrams.shape[1], model_size) self.model_sanity(trained_model) # Tests temporary training files deleted @@ -88,7 +88,7 @@ def testModelSize(self): self.ft_path, self.corpus_file, output_file=testfile(), size=20) self.assertEqual(test_model_size_20.vector_size, 20) self.assertEqual(test_model_size_20.wv.syn0.shape[1], 20) - self.assertEqual(test_model_size_20.wv.syn0_all.shape[1], 20) + self.assertEqual(test_model_size_20.wv.syn0_ngrams.shape[1], 20) def testPersistence(self): """Test storing/loading the entire model.""" @@ -100,18 +100,18 @@ def testPersistence(self): self.models_equal(self.test_model, 
fasttext.FastText.load(testfile())) def testNormalizedVectorsNotSaved(self): - """Test syn0norm/syn0_all_norm aren't saved in model file""" + """Test syn0norm/syn0_ngrams_norm aren't saved in model file""" self.test_model.init_sims() self.test_model.save(testfile()) loaded = fasttext.FastText.load(testfile()) self.assertTrue(loaded.wv.syn0norm is None) - self.assertTrue(loaded.wv.syn0_all_norm is None) + self.assertTrue(loaded.wv.syn0_ngrams_norm is None) wv = self.test_model.wv wv.save(testfile()) loaded_kv = keyedvectors.KeyedVectors.load(testfile()) self.assertTrue(loaded_kv.syn0norm is None) - self.assertTrue(loaded_kv.syn0_all_norm is None) + self.assertTrue(loaded_kv.syn0_ngrams_norm is None) def testLoadFastTextFormat(self): """Test model successfully loaded from fastText .bin file""" @@ -122,7 +122,7 @@ def testLoadFastTextFormat(self): vocab_size, model_size = 1762, 10 self.assertEqual(model.wv.syn0.shape, (vocab_size, model_size)) self.assertEqual(len(model.wv.vocab), vocab_size, model_size) - self.assertEqual(model.wv.syn0_all.shape, (model.num_ngram_vectors, model_size)) + self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model_size)) expected_vec = [ -0.57144, @@ -173,7 +173,7 @@ def testLoadFastTextNewFormat(self): vocab_size, model_size = 1763, 10 self.assertEqual(new_model.wv.syn0.shape, (vocab_size, model_size)) self.assertEqual(len(new_model.wv.vocab), vocab_size, model_size) - self.assertEqual(new_model.wv.syn0_all.shape, (new_model.num_ngram_vectors, model_size)) + self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, model_size)) expected_vec = [ -0.025627, From 0854622777c2e4308c586e097180b32ef6033780 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 29 Aug 2017 20:10:23 +0530 Subject: [PATCH 25/32] removed 'init_wv' param from Word2Vec --- gensim/models/fasttext.py | 2 +- gensim/models/word2vec.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 5d6698108c..f2b2803502 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -92,7 +92,7 @@ def __init__( super(FastText, self).__init__(sentences=sentences, size=size, alpha=alpha, window=window, min_count=min_count, max_vocab_size=max_vocab_size, sample=sample, seed=seed, workers=workers, min_alpha=min_alpha, sg=sg, hs=hs, negative=negative, cbow_mean=cbow_mean, hashfxn=hashfxn, iter=iter, null_word=null_word, - trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words, init_wv=True) + trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words) def initialize_word_vectors(self): self.wv = FastTextKeyedVectors() diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 37c5fd5e6c..e09c37483b 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -430,8 +430,7 @@ def __init__( self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, - init_wv=True): + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. 
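Dropping `init_wv` relies on the existing hook: `Word2Vec.__init__` unconditionally calls `self.initialize_word_vectors()`, and FastText overrides that method to install a `FastTextKeyedVectors` container. A simplified sketch of the pattern with toy stand-in classes (not the real gensim types):

class KeyedVectors(object):
    pass

class FastTextKeyedVectors(KeyedVectors):
    def __init__(self):
        self.min_n = 0  # subword length bounds, filled in later by the model
        self.max_n = 0

class BaseModel(object):
    def __init__(self):
        self.initialize_word_vectors()  # subclass hook

    def initialize_word_vectors(self):
        self.wv = KeyedVectors()

class SubwordModel(BaseModel):
    def initialize_word_vectors(self):
        self.wv = FastTextKeyedVectors()  # override installs the richer container

print(type(SubwordModel().wv).__name__)  # FastTextKeyedVectors
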
@@ -509,8 +508,7 @@ def __init__( else: logger.debug('Fast version of {0} is being used'.format(__name__)) - if init_wv: - self.initialize_word_vectors() + self.initialize_word_vectors() self.sg = int(sg) self.cum_table = None # for negative sampling self.vector_size = int(size) From 904882ad7cf3cbf002339353e58db62096040066 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Wed, 30 Aug 2017 19:23:50 +0530 Subject: [PATCH 26/32] updated unittests --- gensim/test/test_fasttext.py | 6 +++--- gensim/test/test_fasttext_wrapper.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index d92cf73352..85fc9e82dd 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -61,7 +61,7 @@ def setUp(self): ft_home = os.environ.get('FT_HOME', None) self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None self.test_model_file = datapath('lee_fasttext') - self.test_model = FT_gensim.load(self.test_model_file) + self.test_model = FT_gensim.load_fasttext_format(self.test_model_file) self.test_new_model_file = datapath('lee_fasttext_new') def test_training(self): @@ -180,7 +180,8 @@ def test_load_fasttext_format(self): self.assertEquals(model.bucket, 1000) self.assertEquals(model.wv.max_n, 6) self.assertEquals(model.wv.min_n, 3) - self.model_sanity(model) + self.assertEqual(model.wv.syn0.shape, (len(model.wv.vocab), model.vector_size)) + self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model.vector_size)) def test_load_fasttext_new_format(self): try: @@ -232,7 +233,6 @@ def test_load_fasttext_new_format(self): self.assertEquals(new_model.wv.min_n, 3) self.assertEqual(new_model.wv.syn0.shape, (len(new_model.wv.vocab), new_model.vector_size)) self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, new_model.vector_size)) - # self.modelSanity(new_model) def test_load_model_with_non_ascii_vocab(self): model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext')) diff --git a/gensim/test/test_fasttext_wrapper.py b/gensim/test/test_fasttext_wrapper.py index 2f3cf44f01..68fdf29f44 100644 --- a/gensim/test/test_fasttext_wrapper.py +++ b/gensim/test/test_fasttext_wrapper.py @@ -36,7 +36,7 @@ def setUp(self): self.test_model_file = datapath('lee_fasttext') self.test_new_model_file = datapath('lee_fasttext_new') # Load pre-trained model to perform tests in case FastText binary isn't available in test environment - self.test_model = fasttext.FastText.load(self.test_model_file) + self.test_model = fasttext.FastText.load_fasttext_format(self.test_model_file) def model_sanity(self, model): """Even tiny models trained on any corpus should pass these sanity checks""" From a9e7d03d209b514c697b45427513bb5c0a1c7478 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Wed, 30 Aug 2017 19:29:55 +0530 Subject: [PATCH 27/32] flake8 errors fixed --- gensim/models/word2vec.py | 4 ++-- gensim/models/wrappers/fasttext.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index e09c37483b..05bcfe365c 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -321,9 +321,9 @@ def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_h if learn_vectors: if is_ft: - model.wv.syn0_vocab[context_index[0]] += neu1e * model.syn0_vocab_lockf[context_index[0]] + model.wv.syn0_vocab[context_index[0]] += neu1e * context_locks_vocab[context_index[0]] for i in 
context_index[1:]: - model.wv.syn0_ngrams[i] += neu1e * model.syn0_ngrams_lockf[i] + model.wv.syn0_ngrams[i] += neu1e * context_locks_ngrams[i] else: l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) return neu1e diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py index f8ce1efc31..1b4aac5cce 100644 --- a/gensim/models/wrappers/fasttext.py +++ b/gensim/models/wrappers/fasttext.py @@ -113,7 +113,7 @@ def init_sims(self, replace=False): if getattr(self, 'syn0_ngrams_norm', None) is None or replace: logger.info("precomputing L2-norms of ngram weight vectors") if replace: - for i in xrange(self.syn0_ngrams.shape[0]): + for i in range(self.syn0_ngrams.shape[0]): self.syn0_ngrams[i, :] /= sqrt((self.syn0_ngrams[i, :] ** 2).sum(-1)) self.syn0_ngrams_norm = self.syn0_ngrams else: From ec5851227e26cd29ce15a599793cb27f32ac31c0 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 7 Sep 2017 21:22:19 +0530 Subject: [PATCH 28/32] fixed oov word_vec --- gensim/models/fasttext.py | 2 ++ gensim/models/wrappers/fasttext.py | 2 ++ gensim/test/test_fasttext.py | 15 +++++++++++++-- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index f2b2803502..72e28344cc 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -96,6 +96,8 @@ def __init__( def initialize_word_vectors(self): self.wv = FastTextKeyedVectors() + self.wv.min_n = self.min_n + self.wv.max_n = self.max_n def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): if update: diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py index 1b4aac5cce..6c429d778c 100644 --- a/gensim/models/wrappers/fasttext.py +++ b/gensim/models/wrappers/fasttext.py @@ -61,6 +61,8 @@ def __init__(self): self.ngrams = {} self.hash2index = {} self.ngrams_word = {} + self.min_n = 0 + self.max_n = 0 def save(self, *args, **kwargs): # don't bother storing the cached normalized vectors diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 85fc9e82dd..ea00acaf5e 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -65,7 +65,7 @@ def setUp(self): self.test_new_model_file = datapath('lee_fasttext_new') def test_training(self): - model = FT_gensim(size=10, min_count=1, hs=1, negative=0) + model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42) model.build_vocab(sentences) self.model_sanity(model) @@ -85,9 +85,20 @@ def test_training(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0) + model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42) self.models_equal(model, model2) + # verify retrieval of vector for oov words + invocab_word = 'minors' + invocab_vec = model[invocab_word] + invocab_exp_vec = [0.00258422, 0.02300345, -0.00730846, 0.00102663, -0.02189803, -0.01087711, -0.00281926, 0.00180695, 0.02623167, -0.01155995] + self.assertTrue(np.allclose(invocab_vec, invocab_exp_vec, atol=1e-4)) + + oov_word = 'minor' + oov_vec = model[oov_word] + exp_oov_vec = [0.02412911, 0.0304156, 0.0029917, 0.00383261, -0.02695941, -0.0052078, 0.01498577, 0.01851997, 0.02577376, -0.02647647] + self.assertTrue(np.allclose(oov_vec, exp_oov_vec, atol=1e-4)) + def models_equal(self, model, model2): self.assertEqual(len(model.wv.vocab), 
len(model2.wv.vocab)) self.assertEqual(model.num_ngram_vectors, model2.num_ngram_vectors) From daace4a85bccd2fa9237d9503666927ee5675522 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Thu, 7 Sep 2017 22:45:14 +0530 Subject: [PATCH 29/32] updated test_training unittest --- gensim/models/wrappers/fasttext.py | 1 + gensim/test/test_fasttext.py | 18 +++++++----------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py index ce1ff1c160..77bf42abb5 100644 --- a/gensim/models/wrappers/fasttext.py +++ b/gensim/models/wrappers/fasttext.py @@ -395,6 +395,7 @@ def init_ngrams(self): self.wv.syn0[vocab.index] /= (len(word_ngrams) + 1) logger.info("loaded %s weight matrix for fastText model from %s", self.wv.syn0.shape, self.file_name) + def compute_ngrams(word, min_n, max_n): BOW, EOW = ('<', '>') # Used by FastText to attach to all words as prefix and suffix extended_word = BOW + word + EOW diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index ea00acaf5e..c6025c410a 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -65,7 +65,7 @@ def setUp(self): self.test_new_model_file = datapath('lee_fasttext_new') def test_training(self): - model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42) + model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1) model.build_vocab(sentences) self.model_sanity(model) @@ -85,19 +85,15 @@ def test_training(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42) + model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1) self.models_equal(model, model2) - # verify retrieval of vector for oov words - invocab_word = 'minors' - invocab_vec = model[invocab_word] - invocab_exp_vec = [0.00258422, 0.02300345, -0.00730846, 0.00102663, -0.02189803, -0.01087711, -0.00281926, 0.00180695, 0.02623167, -0.01155995] - self.assertTrue(np.allclose(invocab_vec, invocab_exp_vec, atol=1e-4)) + # verify oov-word vector retrieval + invocab_vec = model['minors'] # invocab word + self.assertEqual(len(invocab_vec), 10) - oov_word = 'minor' - oov_vec = model[oov_word] - exp_oov_vec = [0.02412911, 0.0304156, 0.0029917, 0.00383261, -0.02695941, -0.0052078, 0.01498577, 0.01851997, 0.02577376, -0.02647647] - self.assertTrue(np.allclose(oov_vec, exp_oov_vec, atol=1e-4)) + oov_vec = model['minor'] # oov word + self.assertEqual(len(oov_vec), 10) def models_equal(self, model, model2): self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) From 3ffa103cb64fce8f820ba8e8a9c07211c8b45804 Mon Sep 17 00:00:00 2001 From: Menshikh Ivan Date: Mon, 18 Sep 2017 19:33:53 +0500 Subject: [PATCH 30/32] Fix broken merge --- gensim/models/wrappers/fasttext.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py index eadce8b1e7..fa32825e4c 100644 --- a/gensim/models/wrappers/fasttext.py +++ b/gensim/models/wrappers/fasttext.py @@ -134,7 +134,7 @@ def __contains__(self, word): if word in self.vocab: return True else: - char_ngrams = FastText.compute_ngrams(word, self.min_n, self.max_n) + char_ngrams = compute_ngrams(word, self.min_n, self.max_n) return any(ng in self.ngrams for ng in char_ngrams) @@ -353,12 +353,12 @@ def load_vectors(self, file_handle): dtype = np.dtype(np.float64) 
self.num_original_vectors = num_vectors - self.wv.syn0_all = np.fromfile(file_handle, dtype=dtype, count=num_vectors * dim) - self.wv.syn0_all = self.wv.syn0_all.reshape((num_vectors, dim)) - assert self.wv.syn0_all.shape == (self.bucket + len(self.wv.vocab), self.vector_size), \ + self.wv.syn0_ngrams = np.fromfile(file_handle, dtype=dtype, count=num_vectors * dim) + self.wv.syn0_ngrams = self.wv.syn0_ngrams.reshape((num_vectors, dim)) + assert self.wv.syn0_ngrams.shape == (self.bucket + len(self.wv.vocab), self.vector_size), \ 'mismatch between actual weight matrix shape {} and expected shape {}'\ .format( - self.wv.syn0_all.shape, (self.bucket + len(self.wv.vocab), self.vector_size) + self.wv.syn0_ngrams.shape, (self.bucket + len(self.wv.vocab), self.vector_size) ) self.init_ngrams() From 2b0583b9743a2b869415df9c8ce9a1d59b78e7f2 Mon Sep 17 00:00:00 2001 From: Menshikh Ivan Date: Tue, 19 Sep 2017 10:47:26 +0500 Subject: [PATCH 31/32] useless change (need to re-run Appveyour) --- docs/notebooks/FastText_Tutorial.ipynb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/notebooks/FastText_Tutorial.ipynb b/docs/notebooks/FastText_Tutorial.ipynb index a55f62d49e..6145cb835d 100644 --- a/docs/notebooks/FastText_Tutorial.ipynb +++ b/docs/notebooks/FastText_Tutorial.ipynb @@ -70,7 +70,8 @@ } ], "source": [ - "import gensim, os\n", + "import gensim\n", + "import os\n", "from gensim.models.word2vec import LineSentence\n", "from gensim.models.fasttext import FastText as FT_gensim\n", "\n", From 55d731afd9479f8880f555f308fc5c8c5c46123f Mon Sep 17 00:00:00 2001 From: Menshikh Ivan Date: Tue, 19 Sep 2017 11:59:15 +0500 Subject: [PATCH 32/32] Add skipIf for Appveyor x32 (avoid memory error) --- gensim/test/test_fasttext.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index c6025c410a..136f7de5b8 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -5,6 +5,7 @@ import unittest import tempfile import os +import struct import numpy as np @@ -18,6 +19,8 @@ datapath = lambda fname: os.path.join(module_path, 'test_data', fname) logger = logging.getLogger(__name__) +IS_WIN32 = (os.name == "nt") and (struct.calcsize('P') * 8 == 32) + class LeeCorpus(object): def __iter__(self): @@ -108,6 +111,7 @@ def models_equal(self, model, model2): most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0] self.assertTrue(np.allclose(model[most_common_word], model2[most_common_word])) + @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_persistence(self): model = FT_gensim(sentences, min_count=1) model.save(testfile()) @@ -120,6 +124,7 @@ def test_persistence(self): self.assertEqual(len(wv.vocab), len(loaded_wv.vocab)) self.assertEqual(len(wv.ngrams), len(loaded_wv.ngrams)) + @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_norm_vectors_not_saved(self): model = FT_gensim(sentences, min_count=1) model.init_sims() @@ -433,18 +438,22 @@ def online_sanity(self, model): sim = model.n_similarity(['war'], ['terrorism']) self.assertLess(0., sim) + @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_sg_hs_online(self): model = FT_gensim(sg=1, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=12) self.online_sanity(model) + @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_sg_neg_online(self): model = FT_gensim(sg=1, window=2, hs=0, negative=5, min_count=3, iter=1, seed=42, 
workers=12) self.online_sanity(model) + @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_cbow_hs_online(self): model = FT_gensim(sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=12) self.online_sanity(model) + @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_cbow_neg_online(self): model = FT_gensim(sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=5, min_count=5, iter=1, seed=42, workers=12, sample=0) self.online_sanity(model)
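Patches 28–30 of this series settle the mechanism the reworked tests exercise: every fastText vector is assembled from character n-grams, with compute_ngrams available as a module-level helper, FastTextKeyedVectors carrying min_n/max_n, and syn0_ngrams holding one row per known n-gram. The sketch below illustrates that composition for an out-of-vocabulary word. It is a minimal approximation rather than the library's own word_vec: it assumes compute_ngrams is importable from gensim.models.wrappers.fasttext (as the "Fix broken merge" patch calls it), that wv.ngrams maps each seen n-gram to a row index of wv.syn0_ngrams, and that `sentences` is the small tokenized fixture used throughout test_fasttext.py.

    import numpy as np

    from gensim.models.fasttext import FastText as FT_gensim
    from gensim.models.wrappers.fasttext import compute_ngrams

    def oov_vector(model, word):
        # Average the syn0_ngrams rows of the word's known character n-grams;
        # this approximates how an out-of-vocabulary vector is composed.
        ngrams = compute_ngrams(word, model.wv.min_n, model.wv.max_n)
        known = [ng for ng in ngrams if ng in model.wv.ngrams]
        if not known:
            raise KeyError("no n-gram of %r was seen during training" % word)
        rows = model.wv.syn0_ngrams[[model.wv.ngrams[ng] for ng in known]]
        return rows.mean(axis=0)

    model = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
    vec = oov_vector(model, 'minor')  # 'minor' is OOV here but shares n-grams with 'minors'
    assert vec.shape == (10,)

This is also why the updated test_training only asserts the length of model['minors'] and model['minor'] instead of hard-coded expected vectors: even with a fixed seed and a single worker, the exact numbers are not stable across platforms, while the n-gram composition itself is.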
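The same n-gram lookup drives membership and the shape checks in test_load_fasttext_format and test_load_fasttext_new_format: per the updated FastTextKeyedVectors.__contains__, a word counts as present if it is in the vocabulary or if any of its character n-grams was seen during training, and after loading a .bin the ngram matrix ends up with num_ngram_vectors rows. A hedged usage sketch; the path is illustrative and stands in for datapath('lee_fasttext'), and the 'night'/'nights' pair assumes the bundled Lee-corpus model where only the former is in the vocabulary:

    loaded = FT_gensim.load_fasttext_format('lee_fasttext')  # illustrative path to the test .bin

    # Membership via FastTextKeyedVectors.__contains__.
    assert 'night' in loaded.wv              # in-vocabulary word
    assert 'nights' in loaded.wv             # OOV, but shares n-grams with 'night'
    assert 'nights' not in loaded.wv.vocab   # still absent from the plain vocabulary

    # Shape invariants asserted by the updated load tests.
    assert loaded.wv.syn0.shape == (len(loaded.wv.vocab), loaded.vector_size)
    assert loaded.wv.syn0_ngrams.shape == (loaded.num_ngram_vectors, loaded.vector_size)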