
Added save method for doc2vec #1256

Merged 9 commits on Apr 19, 2017

Changes from 6 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -5,6 +5,7 @@ Unreleased:

New features:
* Add output word prediction for negative sampling scheme. (@chinmayapancholi13,[#1209](https://github.com/RaRe-Technologies/gensim/pull/1209))
* Add modified save_word2vec_format for Doc2Vec, to save document vectors. (@parulsethi,[#1256](https://github.com/RaRe-Technologies/gensim/pull/1256))

Improvements:
* Fix loading large FastText models on Mac. (@jaksmid,[#1196](https://github.com/RaRe-Technologies/gensim/pull/1214))
39 changes: 39 additions & 0 deletions gensim/models/doc2vec.py
@@ -61,6 +61,7 @@
from gensim.utils import call_on_class_only
from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc
from gensim.models.word2vec import Word2Vec, train_cbow_pair, train_sg_pair, train_batch_sg
from gensim.models.keyedvectors import KeyedVectors
from six.moves import xrange, zip
from six import string_types, integer_types, itervalues

@@ -808,6 +809,44 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True):
if self.docvecs and hasattr(self.docvecs, 'doctag_syn0_lockf'):
del self.docvecs.doctag_syn0_lockf

def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False):
"""
Store the input-hidden weight matrix.

`fname` is the file used to save the vectors in
`doctag_vec` is an optional boolean indicating whether to store document vectors
`word_vec` is an optional boolean indicating whether to store word vectors
(if both doctag_vec and word_vec are True, then both vectors are stored in the same file)
`prefix` is an optional string prepended to each doctag name, to uniquely identify
doctags and avoid collisions when the same string occurs in both the doctag and word vocab
`fvocab` is an optional file used to save the vocabulary
Collaborator:

The potential to save the vocabulary, with particular index-positions that correspond to the word-vectors only, makes me think that when both word+doc vectors are stored, the word-vectors should go first. Then, at least, any vocab written aligns one-for-one with the word-vectors portion of the save file. (Also: does fvocab currently do anything in the save-both case?)

Contributor Author:

Done, word-vectors go first now.

(Also: does fvocab currently do anything in the save-both case?)

It didn't, earlier. But now that only KeyedVectors.save_word2vec_format is used for both the save-only-word-vectors and save-both cases, the vocab is saved in both cases.

`binary` is an optional boolean indicating whether the data is to be saved
in binary word2vec format (default: False)

"""
total_vec = len(self.wv.vocab) + len(self.docvecs)
# save word vectors
if word_vec:
if not doctag_vec:
total_vec = len(self.wv.vocab)
KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary, total_vec)
# save document vectors
if doctag_vec:
with utils.smart_open(fname, 'ab') as fout:
if not word_vec:
total_vec = len(self.docvecs)
logger.info("storing %sx%s projection weights into %s" % (total_vec, self.vector_size, fname))
fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size)))
# store doctag vectors in input order
for i in range(len(self.docvecs)):
doctag = prefix + str(self.docvecs.index_to_doctag(i))
row = self.docvecs.doctag_syn0[i]
if binary:
fout.write(utils.to_utf8(doctag) + b" " + row.tostring())
else:
fout.write(utils.to_utf8("%s %s\n" % (doctag, ' '.join("%f" % val for val in row))))


class TaggedBrownCorpus(object):
"""Iterate over documents from the Brown corpus (part of NLTK data), yielding
each document out as a TaggedDocument object."""
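For orientation, a minimal usage sketch of the new method; the toy corpus, tag names, and output path below are illustrative, not taken from the PR:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.keyedvectors import KeyedVectors

# Toy corpus: two tagged documents.
docs = [TaggedDocument(words=['human', 'machine', 'interface'], tags=['doc0']),
        TaggedDocument(words=['graph', 'of', 'trees'], tags=['doc1'])]
model = Doc2Vec(docs, min_count=1)

# Word vectors are written first, then the doctag rows are appended,
# each name carrying the prefix so 'doc0' cannot collide with a word.
model.save_word2vec_format('/tmp/model.w2v', word_vec=True, doctag_vec=True,
                           prefix='*dt_', binary=False)

# The combined file loads back as ordinary KeyedVectors; the doctags are
# the entries named '*dt_doc0' and '*dt_doc1'.
kv = KeyedVectors.load_word2vec_format('/tmp/model.w2v', binary=False)
print(kv['*dt_doc0'].shape)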
10 changes: 7 additions & 3 deletions gensim/models/keyedvectors.py
@@ -118,7 +118,7 @@ def save(self, *args, **kwargs):
kwargs['ignore'] = kwargs.get('ignore', ['syn0norm'])
super(KeyedVectors, self).save(*args, **kwargs)

def save_word2vec_format(self, fname, fvocab=None, binary=False):
def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
"""
Store the input-hidden weight matrix in the same format used by the original
C word2vec-tool, for compatibility.
@@ -127,18 +127,22 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False):
`fvocab` is an optional file used to save the vocabulary
`binary` is an optional boolean indicating whether the data is to be saved
in binary word2vec format (default: False)
`total_vec` is an optional parameter to explicitly specify the total number of vectors
(useful when document vectors will be appended to the same file afterwards)

"""
if total_vec is None:
total_vec = len(self.vocab)
vector_size = self.syn0.shape[1]
if fvocab is not None:
logger.info("storing vocabulary in %s" % (fvocab))
with utils.smart_open(fvocab, 'wb') as vout:
for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
logger.info("storing %sx%s projection weights into %s" % (len(self.vocab), vector_size, fname))
logger.info("storing %sx%s projection weights into %s" % (total_vec, vector_size, fname))
assert (len(self.vocab), vector_size) == self.syn0.shape
with utils.smart_open(fname, 'wb') as fout:
fout.write(utils.to_utf8("%s %s\n" % self.syn0.shape))
fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
# store in sorted order: most frequent words at the top
for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
row = self.syn0[vocab.index]
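The reason total_vec is threaded through: the word2vec format opens with a single count-and-dimension header, and loaders trust that count, so when doctag rows will be appended afterwards the header must already state the combined total. A small sanity check for a text-format file, as a sketch (the helper name here is ours, not part of the diff):

from gensim import utils

def check_word2vec_header(fname):
    """Sketch: verify the header's vector count matches the rows actually
    present in a text-format (non-binary) word2vec file."""
    with utils.smart_open(fname, 'rb') as fin:
        count, dim = map(int, fin.readline().split())
        rows = sum(1 for _ in fin)  # one vector per remaining line
    assert rows == count, "header claims %d vectors, file has %d rows" % (count, rows)
    return count, dim

Without total_vec, a words-then-doctags file would carry a header equal to len(self.vocab) alone, and a check like this (or a real loader) would stop short of the appended doctag rows.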
9 changes: 8 additions & 1 deletion gensim/test/test_doc2vec.py
@@ -23,7 +23,7 @@
import numpy as np

from gensim import utils, matutils
from gensim.models import doc2vec
from gensim.models import doc2vec, keyedvectors

module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
@@ -76,6 +76,13 @@ def test_persistence(self):
model.save(testfile())
self.models_equal(model, doc2vec.Doc2Vec.load(testfile()))

def testPersistenceWord2VecFormat(self):
"""Test storing the entire model in word2vec format."""
model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
model.save_word2vec_format(testfile(), doctag_vec=True, binary=True)
binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True)
self.assertEqual(len(model.wv.vocab) + len(model.docvecs), len(binary_model_dv.vocab))
Contributor:

Please add tests for more combinations of word_vec/doctag_vec True/False.
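
One possible shape for those tests, sketched here rather than taken from the diff; it reuses the module's DocsLeeCorpus and testfile() helpers, and removes the file between cases because a doctag-only save appends to whatever already exists at that path:

def testSaveWordVecCombinations(self):
    """Sketch: check vector counts for each word_vec/doctag_vec combination."""
    model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
    cases = [
        (True, False, len(model.wv.vocab)),                      # words only
        (False, True, len(model.docvecs)),                       # doctags only
        (True, True, len(model.wv.vocab) + len(model.docvecs)),  # both
    ]
    for word_vec, doctag_vec, expected in cases:
        if os.path.exists(testfile()):
            os.remove(testfile())  # doctag-only saves append, so start clean
        model.save_word2vec_format(testfile(), word_vec=word_vec,
                                   doctag_vec=doctag_vec, binary=True)
        kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True)
        self.assertEqual(len(kv.vocab), expected)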


def test_load_mmap(self):
"""Test storing/loading the entire model."""
model = doc2vec.Doc2Vec(sentences, min_count=1)