Skip to content

Commit

Permalink
add similar by word and vector
Browse files Browse the repository at this point in the history
  • Loading branch information
isomap authored and tmylk committed Apr 29, 2016
1 parent 4fb424c commit b99e852
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Changes
- Allow easy port of GloVe vectors into Gensim
- Standalone script with command line arguments, compatible with Python>=2.6
- Usage: python -m gensim.scripts.glove2word2vec -i glove_vectors.txt -o output_word2vec_compatible.txt
* Add `similar_by_word()` and `similar_by_vector()` to word2vec (@isohyt, #381)

0.12.4, 29/01/2016
* Better internal handling of job batching in word2vec (#535)
Expand Down
40 changes: 40 additions & 0 deletions gensim/models/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -1281,6 +1281,46 @@ def word_vec(word):
result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
return result[:topn]

def similar_by_word(self, word, topn=10, restrict_vocab=None):
    """
    Return the `topn` words most similar to `word`.

    This is a convenience wrapper: the query is forwarded to `most_similar()`
    with `word` as the single positive example.

    Pass `topn=False` to receive the vector of similarity scores instead of
    a ranked list.

    `restrict_vocab` is an optional integer limiting how many word vectors are
    searched. For example, restrict_vocab=10000 only checks the first 10000
    word vectors in vocabulary order (useful when the vocabulary is sorted by
    descending frequency).

    Example::

      >>> trained_model.similar_by_word('graph')
      [('user', 0.9999163150787354), ...]

    """
    # A lone word becomes a one-element list of positive examples.
    query = [word]
    return self.most_similar(positive=query, topn=topn, restrict_vocab=restrict_vocab)

def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
    """
    Find the top-N words whose vectors are most similar to `vector`.

    `vector` may be a numpy array or any array-like sequence of numbers (for
    example a plain list); it is converted to a numpy array before the lookup.
    The query is then forwarded to `most_similar()` with that array as the
    single positive example.

    If topn is False, similar_by_vector returns the vector of similarity scores.

    `restrict_vocab` is an optional integer which limits the range of vectors which
    are searched for most-similar values. For example, restrict_vocab=10000 would
    only check the first 10000 word vectors in the vocabulary order. (This may be
    meaningful if you've sorted the vocabulary by descending frequency.)

    Example::

      >>> trained_model.similar_by_vector([1,2])
      [('survey', 0.9942699074745178), ...]

    """
    # Imported locally so the method does not depend on which numpy names
    # the module header happens to import.
    from numpy import asarray

    # NOTE(review): most_similar() is not shown here; in gensim it treats only
    # str / ndarray entries as bare queries and unpacks anything else as a
    # (word, weight) pair, so a plain list such as [1, 2] would be misread.
    # asarray() returns an existing ndarray unchanged, so array callers see no
    # difference.
    query_vector = asarray(vector)
    return self.most_similar(positive=[query_vector], topn=topn, restrict_vocab=restrict_vocab)

def doesnt_match(self, words):
"""
Which word from the given list doesn't go with the others?
Expand Down
10 changes: 10 additions & 0 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,16 @@ def testSimilarities(self):
self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph']))
self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees'))

def testSimilarBy(self):
    """Check that similar_by_word and similar_by_vector agree with most_similar."""
    model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)

    # Query by word: the wrapper must match a direct most_similar() call.
    by_word = model.similar_by_word('graph', topn=10)
    expected_by_word = model.most_similar(positive='graph', topn=10)

    # Query by raw vector: same comparison, using the stored vector for 'graph'.
    by_vector = model.similar_by_vector(model['graph'], topn=10)
    expected_by_vector = model.most_similar([model['graph']], topn=10)

    comparisons = (
        (by_word, expected_by_word),
        (by_vector, expected_by_vector),
    )
    for actual, expected in comparisons:
        self.assertEqual(actual, expected)

def testParallel(self):
"""Test word2vec parallel training."""
if word2vec.FAST_VERSION < 0: # don't test the plain NumPy version for parallelism (too slow)
Expand Down

0 comments on commit b99e852

Please sign in to comment.