
Added save method for doc2vec #1256

Merged 9 commits on Apr 19, 2017

Changes from 6 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -5,6 +5,7 @@ Unreleased:

New features:
* Add output word prediction for negative sampling scheme. (@chinmayapancholi13,[#1209](https://github.com/RaRe-Technologies/gensim/pull/1209))
* Add modified save_word2vec_format for Doc2Vec, to save document vectors. (@parulsethi,[#1256](https://github.com/RaRe-Technologies/gensim/pull/1256))

Improvements:
* Fix loading large FastText models on Mac. (@jaksmid,[#1196](https://github.com/RaRe-Technologies/gensim/pull/1214))
39 changes: 39 additions & 0 deletions gensim/models/doc2vec.py
@@ -61,6 +61,7 @@
from gensim.utils import call_on_class_only
from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc
from gensim.models.word2vec import Word2Vec, train_cbow_pair, train_sg_pair, train_batch_sg
from gensim.models.keyedvectors import KeyedVectors
from six.moves import xrange, zip
from six import string_types, integer_types, itervalues

@@ -808,6 +809,44 @@ def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True):
if self.docvecs and hasattr(self.docvecs, 'doctag_syn0_lockf'):
del self.docvecs.doctag_syn0_lockf

def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False):
"""
Store the input-hidden weight matrix.

`fname` is the file used to save the vectors in
`doctag_vec` is an optional boolean indicating whether to store document vectors
`word_vec` is an optional boolean indicating whether to store word vectors
(if both doctag_vec and word_vec are True, then both vectors are stored in the same file)
`prefix` is an optional string prepended to each doctag name, to uniquely identify
doctags and avoid collisions when the same string occurs in both the doctag and word vocab
`fvocab` is an optional file used to save the vocabulary
Collaborator:

The potential to save the vocabulary, with particular index-positions that correspond to the word-vectors only, makes me think that when both word+doc vectors are stored, the word-vectors should go first. Then, at least, any vocab written aligns one-for-one with the word-vectors portion of the save file. (Also: does fvocab currently do anything in the save-both case?)

Contributor Author:

Done, word-vectors go first now.

(Also: does fvocab currently do anything in the save-both case?)

It didn't, earlier. But now that only KeyedVectors.save_word2vec_format is used for both the save-only-word-vectors and save-both cases, the vocab is saved in both cases.

`binary` is an optional boolean indicating whether the data is to be saved
in binary word2vec format (default: False)

"""
total_vec = len(self.wv.vocab) + len(self.docvecs)
# save word vectors
if word_vec:
if not doctag_vec:
total_vec = len(self.wv.vocab)
KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary, total_vec)
# save document vectors
if doctag_vec:
with utils.smart_open(fname, 'ab') as fout:
if not word_vec:
total_vec = len(self.docvecs)
logger.info("storing %sx%s projection weights into %s" % (total_vec, self.vector_size, fname))
fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size)))
# store doctag vectors in input order
for i in range(len(self.docvecs)):
doctag = prefix + str(self.docvecs.index_to_doctag(i))
row = self.docvecs.doctag_syn0[i]
if binary:
fout.write(utils.to_utf8(doctag) + b" " + row.tostring())
else:
fout.write(utils.to_utf8("%s %s\n" % (doctag, ' '.join("%f" % val for val in row))))


class TaggedBrownCorpus(object):
"""Iterate over documents from the Brown corpus (part of NLTK data), yielding
each document out as a TaggedDocument object."""
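For orientation, a minimal usage sketch of the new method; the toy corpus, tag names, and output path below are illustrative, not taken from the PR:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.keyedvectors import KeyedVectors

# Toy corpus: two tagged documents.
docs = [TaggedDocument(words=['human', 'machine', 'interface'], tags=['doc0']),
        TaggedDocument(words=['graph', 'of', 'trees'], tags=['doc1'])]
model = Doc2Vec(docs, min_count=1)

# Word vectors are written first, then the doctag rows are appended,
# each name carrying the prefix so 'doc0' cannot collide with a word.
model.save_word2vec_format('/tmp/model.w2v', word_vec=True, doctag_vec=True,
                           prefix='*dt_', binary=False)

# The combined file loads back as ordinary KeyedVectors; the doctags are
# the entries named '*dt_doc0' and '*dt_doc1'.
kv = KeyedVectors.load_word2vec_format('/tmp/model.w2v', binary=False)
print(kv['*dt_doc0'].shape)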
10 changes: 7 additions & 3 deletions gensim/models/keyedvectors.py
@@ -118,7 +118,7 @@ def save(self, *args, **kwargs):
kwargs['ignore'] = kwargs.get('ignore', ['syn0norm'])
super(KeyedVectors, self).save(*args, **kwargs)

def save_word2vec_format(self, fname, fvocab=None, binary=False):
def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
"""
Store the input-hidden weight matrix in the same format used by the original
C word2vec-tool, for compatibility.
@@ -127,18 +127,22 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False):
`fvocab` is an optional file used to save the vocabulary
`binary` is an optional boolean indicating whether the data is to be saved
in binary word2vec format (default: False)
`total_vec` is an optional parameter to explicitly specify the total number of vectors
(useful when document vectors will be appended to the same file afterwards)

"""
if total_vec is None:
total_vec = len(self.vocab)
vector_size = self.syn0.shape[1]
if fvocab is not None:
logger.info("storing vocabulary in %s" % (fvocab))
with utils.smart_open(fvocab, 'wb') as vout:
for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
logger.info("storing %sx%s projection weights into %s" % (len(self.vocab), vector_size, fname))
logger.info("storing %sx%s projection weights into %s" % (total_vec, vector_size, fname))
assert (len(self.vocab), vector_size) == self.syn0.shape
with utils.smart_open(fname, 'wb') as fout:
fout.write(utils.to_utf8("%s %s\n" % self.syn0.shape))
fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
# store in sorted order: most frequent words at the top
for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
row = self.syn0[vocab.index]
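The reason total_vec is threaded through: the word2vec format opens with a single count-and-dimension header, and loaders trust that count, so when doctag rows will be appended afterwards the header must already state the combined total. A small sanity check for a text-format file, as a sketch (the helper name here is ours, not part of the diff):

from gensim import utils

def check_word2vec_header(fname):
    """Sketch: verify the header's vector count matches the rows actually
    present in a text-format (non-binary) word2vec file."""
    with utils.smart_open(fname, 'rb') as fin:
        count, dim = map(int, fin.readline().split())
        rows = sum(1 for _ in fin)  # one vector per remaining line
    assert rows == count, "header claims %d vectors, file has %d rows" % (count, rows)
    return count, dim

Without total_vec, a words-then-doctags file would carry a header equal to len(self.vocab) alone, and a check like this (or a real loader) would stop short of the appended doctag rows.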
9 changes: 8 additions & 1 deletion gensim/test/test_doc2vec.py
@@ -23,7 +23,7 @@
import numpy as np

from gensim import utils, matutils
from gensim.models import doc2vec
from gensim.models import doc2vec, keyedvectors

module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
@@ -76,6 +76,13 @@ def test_persistence(self):
model.save(testfile())
self.models_equal(model, doc2vec.Doc2Vec.load(testfile()))

def testPersistenceWord2VecFormat(self):
"""Test storing the entire model in word2vec format."""
model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
model.save_word2vec_format(testfile(), doctag_vec=True, binary=True)
binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True)
self.assertEqual(len(model.wv.vocab) + len(model.docvecs), len(binary_model_dv.vocab))
Contributor:

Please add tests for more combinations of word_vec/doctag_vec True/False.
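
One possible shape for those tests, sketched here rather than taken from the diff; it reuses the module's DocsLeeCorpus and testfile() helpers, and removes the file between cases because a doctag-only save appends to whatever already exists at that path:

def testSaveWordVecCombinations(self):
    """Sketch: check vector counts for each word_vec/doctag_vec combination."""
    model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
    cases = [
        (True, False, len(model.wv.vocab)),                      # words only
        (False, True, len(model.docvecs)),                       # doctags only
        (True, True, len(model.wv.vocab) + len(model.docvecs)),  # both
    ]
    for word_vec, doctag_vec, expected in cases:
        if os.path.exists(testfile()):
            os.remove(testfile())  # doctag-only saves append, so start clean
        model.save_word2vec_format(testfile(), word_vec=word_vec,
                                   doctag_vec=doctag_vec, binary=True)
        kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True)
        self.assertEqual(len(kv.vocab), expected)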


def test_load_mmap(self):
"""Test storing/loading the entire model."""
model = doc2vec.Doc2Vec(sentences, min_count=1)