From b99e8528594b0898aa629f01be0919a18f25f190 Mon Sep 17 00:00:00 2001
From: Hayate ISO
Date: Tue, 5 Apr 2016 13:29:52 +0900
Subject: [PATCH] add similar by word and vector

---
 CHANGELOG.txt                |  1 +
 gensim/models/word2vec.py    | 40 ++++++++++++++++++++++++++++++++++++
 gensim/test/test_word2vec.py | 10 +++++++++
 3 files changed, 51 insertions(+)

diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 4bd2a84bd0..9a1366bbf0 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -7,6 +7,7 @@ Changes
   - Allow easy port of GloVe vectors into Gensim
   - Standalone script with command line arguments, compatible with Python>=2.6
   - Usage: python -m gensim.scripts.glove2word2vec -i glove_vectors.txt -o output_word2vec_compatible.txt
+* Add `similar_by_word()` and `similar_by_vector()` to word2vec (@isohyt, #381)
 
 0.12.4, 29/01/2016
 * Better internal handling of job batching in word2vec (#535)
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index a30bba3a7d..3fd96fa33c 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1281,6 +1281,46 @@ def word_vec(word):
         result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
         return result[:topn]
 
+    def similar_by_word(self, word, topn=10, restrict_vocab=None):
+        """
+        Find the top-N most similar words.
+
+        If topn is False, similar_by_word returns the vector of similarity scores.
+
+        `restrict_vocab` is an optional integer which limits the range of vectors which
+        are searched for most-similar values. For example, restrict_vocab=10000 would
+        only check the first 10000 word vectors in the vocabulary order. (This may be
+        meaningful if you've sorted the vocabulary by descending frequency.)
+
+        Example::
+
+          >>> trained_model.similar_by_word('graph')
+          [('user', 0.9999163150787354), ...]
+
+        """
+
+        return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab)
+
+    def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
+        """
+        Find the top-N most similar words by vector.
+
+        If topn is False, similar_by_vector returns the vector of similarity scores.
+
+        `restrict_vocab` is an optional integer which limits the range of vectors which
+        are searched for most-similar values. For example, restrict_vocab=10000 would
+        only check the first 10000 word vectors in the vocabulary order. (This may be
+        meaningful if you've sorted the vocabulary by descending frequency.)
+
+        Example::
+
+          >>> trained_model.similar_by_vector([1,2])
+          [('survey', 0.9942699074745178), ...]
+
+        """
+
+        return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)
+
     def doesnt_match(self, words):
         """
         Which word from the given list doesn't go with the others?
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 894c8e7314..0f1c58208a 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -343,6 +343,16 @@ def testSimilarities(self):
         self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph']))
         self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees'))
 
+    def testSimilarBy(self):
+        """Test word2vec similar_by_word and similar_by_vector."""
+        model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
+        wordsims = model.similar_by_word('graph', topn=10)
+        wordsims2 = model.most_similar(positive='graph', topn=10)
+        vectorsims = model.similar_by_vector(model['graph'], topn=10)
+        vectorsims2 = model.most_similar([model['graph']], topn=10)
+        self.assertEqual(wordsims, wordsims2)
+        self.assertEqual(vectorsims, vectorsims2)
+
     def testParallel(self):
         """Test word2vec parallel training."""
         if word2vec.FAST_VERSION < 0:  # don't test the plain NumPy version for parallelism (too slow)