add similar by word and vector

piskvorky · Apr 29, 2016 · b99e852 · b99e852
1 parent 4fb424c
commit b99e852
Show file tree

Hide file tree

Showing 3 changed files with 51 additions and 0 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -7,6 +7,7 @@ Changes
   - Allow easy port of GloVe vectors into Gensim
   - Standalone script with command line arguments, compatible with Python>=2.6 
   - Usage: python -m gensim.scripts.glove2word2vec -i glove_vectors.txt -o output_word2vec_compatible.txt
+* Add `similar_by_word()` and `similar_by_vector()` to word2vec (@isohyt, #381)
 
 0.12.4, 29/01/2016
 * Better internal handling of job batching in word2vec (#535)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
@@ -1281,6 +1281,46 @@ def word_vec(word):
         result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
         return result[:topn]
 
+    def similar_by_word(self, word, topn=10, restrict_vocab=None):
+        """
+        Return the `topn` words most similar to `word`.
+
+        Thin wrapper around `most_similar()` with `word` as the only positive
+        example. If topn is False, the vector of similarity scores is returned.
+
+        `restrict_vocab` is an optional integer that limits the search to the
+        first `restrict_vocab` word vectors in vocabulary order; e.g. 10000 only
+        checks the first 10000 vectors. (This may be meaningful if you've sorted
+        the vocabulary by descending frequency.)
+
+        Example::
+
+          >>> trained_model.similar_by_word('graph')
+          [('user', 0.9999163150787354), ...]
+        """
+        query = [word]
+        return self.most_similar(positive=query, topn=topn, restrict_vocab=restrict_vocab)
+
+    def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
+        """
+        Return the `topn` words whose vectors are most similar to `vector`.
+
+        Thin wrapper around `most_similar()` with `vector` as the only positive
+        example. If topn is False, the vector of similarity scores is returned.
+        Pass `vector` as a numpy array (e.g. `trained_model['graph']`), not a list.
+
+        `restrict_vocab` is an optional integer that limits the search to the
+        first `restrict_vocab` word vectors in vocabulary order. (This may be
+        meaningful if you've sorted the vocabulary by descending frequency.)
+
+        Example::
+
+          >>> trained_model.similar_by_vector(numpy.array([1, 2]))
+          [('survey', 0.9942699074745178), ...]
+        """
+        # NOTE(review): most_similar() may read a bare list as a (word, weight) pair -- confirm.
+        return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)
+
     def doesnt_match(self, words):
         """
         Which word from the given list doesn't go with the others?

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
@@ -343,6 +343,16 @@ def testSimilarities(self):
         self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph']))
         self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees'))
 
+    def testSimilarBy(self):
+        """Check that similar_by_word/similar_by_vector agree with most_similar."""
+        model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
+        graph_vec = model['graph']
+        by_word = model.similar_by_word('graph', topn=10)
+        by_vector = model.similar_by_vector(graph_vec, topn=10)
+        # both helpers must return exactly what the equivalent most_similar() call returns
+        self.assertEqual(by_word, model.most_similar(positive='graph', topn=10))
+        self.assertEqual(by_vector, model.most_similar([graph_vec], topn=10))
+
     def testParallel(self):
         """Test word2vec parallel training."""
         if word2vec.FAST_VERSION < 0:  # don't test the plain NumPy version for parallelism (too slow)