From c67e5ece160b620155ec70bacd964dcddac77755 Mon Sep 17 00:00:00 2001
From: Matti Lyra
Date: Wed, 8 Apr 2015 12:03:14 +0200
Subject: [PATCH 1/8] Changed the way 'ignore' is handled in LdaModel.save so that it allows passing in custom ignore parameters as well.

---
 gensim/models/ldamodel.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index ce4ab4d526..713dcb9420 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -850,7 +850,11 @@ def save(self, fname, *args, **kwargs):
         """
         if self.state is not None:
             self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)
-        super(LdaModel, self).save(fname, *args, ignore=['state', 'dispatcher'], **kwargs)
+        if 'ignore' in kwargs:
+            kwargs['ignore'] = list(set(['state', 'dispatcher']) | set(kwargs['ignore']))
+        else:
+            kwargs['ignore'] = ['state', 'dispatcher']
+        super(LdaModel, self).save(fname, *args, **kwargs)
 
     @classmethod
     def load(cls, fname, *args, **kwargs):

From 780cd1b39f97bd0bb3f3e27caba1486aa0ad3dae Mon Sep 17 00:00:00 2001
From: Matti Lyra
Date: Wed, 8 Apr 2015 12:17:25 +0200
Subject: [PATCH 2/8] Added a test to make sure that 'ignore' in LdaModel.save works correctly, and added a check to LdaModel.save to make sure that the passed-in value is a list

---
 gensim/models/ldamodel.py  |  5 ++++-
 gensim/test/test_models.py | 11 +++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 713dcb9420..d08684634d 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -851,7 +851,10 @@ def save(self, fname, *args, **kwargs):
         if self.state is not None:
             self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)
         if 'ignore' in kwargs:
-            kwargs['ignore'] = list(set(['state', 'dispatcher']) | set(kwargs['ignore']))
+            ignore = kwargs['ignore']
+            if isinstance(ignore, basestring):
+                ignore = [ignore]
+            kwargs['ignore'] = list(set(['state', 'dispatcher']) | set(ignore))
         else:
             kwargs['ignore'] = ['state', 'dispatcher']
         super(LdaModel, self).save(fname, *args, **kwargs)
diff --git a/gensim/test/test_models.py b/gensim/test/test_models.py
index f63b436143..038d5be2c2 100644
--- a/gensim/test/test_models.py
+++ b/gensim/test/test_models.py
@@ -322,6 +322,17 @@ def testPersistence(self):
         tstvec = []
         self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
 
+    def testPersistenceIgnore(self):
+        fname = testfile()
+        model = ldamodel.LdaModel(self.corpus, num_topics=2)
+        model.save(fname, ignore='id2word')
+        model2 = ldamodel.LdaModel.load(fname)
+        self.assertTrue(model2.id2word is None)
+
+        model.save(fname, ignore=['id2word'])
+        model2 = ldamodel.LdaModel.load(fname)
+        self.assertTrue(model2.id2word is None)
+
     def testPersistenceCompressed(self):
         fname = testfile() + '.gz'
         model = self.class_(self.corpus, num_topics=2)

From 4bf1247c2e1f1008e2e20befc019f39da24a3eb8 Mon Sep 17 00:00:00 2001
From: Matti Lyra
Date: Wed, 8 Apr 2015 13:12:46 +0200
Subject: [PATCH 3/8] Closes #314: make sure that 'ignore' in LdaModel.save works correctly; added a comment explaining why 'state' and 'dispatcher' are always ignored

---
 gensim/models/ldamodel.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index d08684634d..8194d93897 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -850,6 +850,9 @@ def save(self, fname, *args, **kwargs):
         """
         if self.state is not None:
             self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)
+
+        # make sure 'state' and 'dispatcher' are ignored from the pickled object, even if
+        # someone sets the ignore list themselves
         if 'ignore' in kwargs:
             ignore = kwargs['ignore']
             if isinstance(ignore, basestring):

From c4ef71cf04f3a3e22e8a5e611b52d49b7b5e8cba Mon Sep 17 00:00:00 2001
From: Matti Lyra
Date: Wed, 8 Apr 2015 13:16:48 +0200
Subject: [PATCH 4/8] Fixed a Python 2/3 incompatible isinstance check against basestring, using six.string_types instead.

---
 gensim/models/ldamodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 8194d93897..0f2cb4b69d 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -855,7 +855,7 @@ def save(self, fname, *args, **kwargs):
         # someone sets the ignore list themselves
         if 'ignore' in kwargs:
             ignore = kwargs['ignore']
-            if isinstance(ignore, basestring):
+            if isinstance(ignore, six.string_types):
                 ignore = [ignore]
             kwargs['ignore'] = list(set(['state', 'dispatcher']) | set(ignore))
         else:

From a26ba847457c29733c81d6c0452182749e8bf90f Mon Sep 17 00:00:00 2001
From: Matti Lyra
Date: Mon, 10 Aug 2015 10:55:31 +0200
Subject: [PATCH 5/8] Resolved merge conflict with upstream develop; added the missing `six` import.

---
 gensim/models/ldamodel.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 0f2cb4b69d..ed6c4f79d5 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -41,6 +41,7 @@
 from scipy.special import gammaln, psi  # gamma function utils
 from scipy.special import polygamma
 from six.moves import xrange
+import six
 
 # log(sum(exp(x))) that tries to avoid overflow
 try:

From d092862370229610c937c07096902bfabe602c39 Mon Sep 17 00:00:00 2001
From: Matti Lyra
Date: Mon, 10 Aug 2015 11:00:41 +0200
Subject: [PATCH 6/8] Added a bit of documentation to clarify how the ignore list is used.

---
 gensim/models/ldamodel.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index ed6c4f79d5..84b29de988 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -834,7 +834,11 @@ def save(self, fname, *args, **kwargs):
         """
         Save the model to file.
 
-        Large internal arrays may be stored into separate files, with `fname` as prefix.
+        Large internal arrays may be stored into separate files, with `fname` as prefix.Use
+        `separately=[]` to define which arrays should be stored in separate
+        files. The `ignore` parameter can be used to define which variables should be ignored.
+        By default the internal `state` is ignored as it uses its own serialisation not the
+        one provided by `LdaModel`.
 
         Note: do not save as a compressed file if you intend to load the
         file back with `mmap`.
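
The net effect of patches 1 through 6 on the ignore handling can be summarised in a small standalone sketch: a bare string is wrapped in a list, and 'state' and 'dispatcher' are always unioned in, so they can never end up in the pickle. The helper name merge_ignore below is illustrative only and does not appear in the patches.

    import six

    def merge_ignore(user_ignore):
        # mirrors the normalisation done inside the patched LdaModel.save
        if isinstance(user_ignore, six.string_types):
            user_ignore = [user_ignore]   # a bare string becomes a one-element list
        return list(set(['state', 'dispatcher']) | set(user_ignore))

    merge_ignore('id2word')             # ['state', 'dispatcher', 'id2word'] (set order not guaranteed)
    merge_ignore(['id2word', 'state'])  # duplicates collapse through the set union
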
From 2d6883dbe6069e952e3b66a746e2566ec62c989b Mon Sep 17 00:00:00 2001
From: Matti Lyra
Date: Mon, 10 Aug 2015 11:31:32 +0200
Subject: [PATCH 7/8] Made 'ignore' an explicit keyword argument of LdaModel.save, and added a bit more documentation to make it clear how the parameter should be used.

---
 gensim/models/ldamodel.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 84b29de988..a5dcaa42ef 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -830,15 +830,19 @@ def __getitem__(self, bow, eps=None):
         """
         return self.get_document_topics(bow, eps)
 
-    def save(self, fname, *args, **kwargs):
+    def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs):
         """
         Save the model to file.
 
-        Large internal arrays may be stored into separate files, with `fname` as prefix.Use
-        `separately=[]` to define which arrays should be stored in separate
-        files. The `ignore` parameter can be used to define which variables should be ignored.
-        By default the internal `state` is ignored as it uses its own serialisation not the
-        one provided by `LdaModel`.
+        Large internal arrays may be stored into separate files, with `fname` as prefix.
+
+        `separately` can be used to define which arrays should be stored in separate files.
+
+        `ignore` parameter can be used to define which variables should be ignored, i.e. left
+        out from the pickled lda model. By default the internal `state` is ignored as it uses
+        its own serialisation not the one provided by `LdaModel`. The `state` and `dispatcher`
+        will be added to any ignore parameter defined.
+
 
         Note: do not save as a compressed file if you intend to load the
         file back with `mmap`.
@@ -858,14 +862,14 @@ def save(self, fname, *args, **kwargs):
 
         # make sure 'state' and 'dispatcher' are ignored from the pickled object, even if
         # someone sets the ignore list themselves
-        if 'ignore' in kwargs:
-            ignore = kwargs['ignore']
+        if ignore is not None and ignore:
             if isinstance(ignore, six.string_types):
                 ignore = [ignore]
-            kwargs['ignore'] = list(set(['state', 'dispatcher']) | set(ignore))
+            ignore = [e for e in ignore if e]  # make sure None and '' are not in the list
+            ignore = list(set(['state', 'dispatcher']) | set(ignore))
         else:
-            kwargs['ignore'] = ['state', 'dispatcher']
-        super(LdaModel, self).save(fname, *args, **kwargs)
+            ignore = ['state', 'dispatcher']
+        super(LdaModel, self).save(fname, *args, ignore=ignore, **kwargs)
 
     @classmethod
     def load(cls, fname, *args, **kwargs):
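
With patch 7 applied, `ignore` is an explicit keyword argument of LdaModel.save rather than something fished out of `**kwargs`. A usage sketch mirroring the test added in patch 2; the toy corpus and the /tmp path are illustrative assumptions, not taken from the patches:

    from gensim import corpora, models

    texts = [['human', 'computer', 'interaction'], ['graph', 'minors', 'survey']]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=2)

    # a single string or a list both work; 'state' and 'dispatcher' are appended automatically
    lda.save('/tmp/lda_model', ignore='id2word')
    reloaded = models.LdaModel.load('/tmp/lda_model')
    assert reloaded.id2word is None   # an ignored attribute comes back as None

The model state is still written separately by LdaModel.save via `utils.smart_extension`, to '/tmp/lda_model.state' in this example.
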
From 368934fb1c8055c39c88424d679e51640950e929 Mon Sep 17 00:00:00 2001
From: wangqi
Date: Tue, 11 Aug 2015 17:44:29 +0800
Subject: [PATCH 8/8] Incremental training for new documents in doc2vec

---
 gensim/models/doc2vec.py | 48 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 43 insertions(+), 5 deletions(-)

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 1b143d92ad..3ce772eae4 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -37,6 +37,7 @@
 import logging
 import os
 import warnings
+import itertools
 
 try:
     from queue import Queue
@@ -288,7 +289,8 @@ def note_doctag(self, key, document_no, document_length):
             self.count = max(self.count, key+1)
         else:
             if key in self.doctags:
-                self.doctags[key] = self.doctags[key].repeat(document_length)
+                #self.doctags[key] = self.doctags[key].repeat(document_length)
+                return
             else:
                 self.doctags[key] = Doctag(len(self.index2doctag), document_length, 1)
                 self.index2doctag.append(key)
@@ -354,7 +356,7 @@ def estimated_lookup_memory(self):
         """Estimated memory for tag lookup; 0 if using pure int tags."""
         return 60 * len(self.index2doctag) + 140 * len(self.doctags)
 
-    def reset_weights(self, model):
+    def reset_space(self, model):
         length = max(len(self.doctags), self.count)
         if self.mapfile_path:
             self.doctag_syn0 = np_memmap(self.mapfile_path+'.doctag_syn0', dtype=REAL,
@@ -366,10 +368,35 @@ def reset_weights(self, model):
             self.doctag_syn0 = empty((length, model.vector_size), dtype=REAL)
             self.doctag_syn0_lockf = ones((length,), dtype=REAL)  # zeros suppress learning
 
+    def init_weight(self, model, i):
+        # construct deterministic seed from index AND model seed
+        seed = "%d %s" % (model.seed, self.index2doctag[i] if len(self.index2doctag) > 0 else str(i))
+        self.doctag_syn0[i] = model.seeded_vector(seed)
+
+    def reset_weights(self, model):
+        self.reset_space(model)
+        length = max(len(self.doctags), self.count)
         for i in xrange(length):
-            # construct deterministic seed from index AND model seed
-            seed = "%d %s" % (model.seed, self.index2doctag[i] if len(self.index2doctag) > 0 else str(i))
-            self.doctag_syn0[i] = model.seeded_vector(seed)
+            self.init_weight(model, i)
+
+    def add_new_docs(self, documents, model):
+        old_length = len(self.index2doctag)
+        for document in documents:
+            document_length = len(document.words)
+            for tag in document.tags:
+                self.note_doctag(tag, 0, document_length)
+        # cache old
+        doctag_syn0_tmp = self.doctag_syn0
+        doctag_syn0_lockf_tmp = self.doctag_syn0_lockf
+        self.reset_space(model)
+        # restore old
+        for i in xrange(old_length):
+            self.doctag_syn0[i] = doctag_syn0_tmp[i]
+            self.doctag_syn0_lockf[i] = doctag_syn0_lockf_tmp[i]
+        # init new
+        length = max(len(self.doctags), self.count)
+        for i in xrange(old_length, length):
+            self.init_weight(model, i)
 
     def init_sims(self, replace=False):
         """
@@ -630,11 +657,22 @@ def scan_vocab(self, documents, progress_per=10000):
         self.corpus_count = document_no + 1
         self.raw_vocab = vocab
 
+    def train_new_doc(self, documents, total_words=None, word_count=0, chunksize=100, total_examples=None, queue_factor=2, report_delay=1):
+        # add new docs and initialize their weights
+        doc_iter1, doc_iter2 = itertools.tee(documents)
+        self.docvecs.add_new_docs(doc_iter1, self)
+        self.train(doc_iter2, total_words, word_count, chunksize, total_examples, queue_factor, report_delay)
+
     def _do_train_job(self, job, alpha, inits):
         work, neu1 = inits
         tally = 0
         raw_tally = 0
         for doc in job:
+            # add the doc to dict if it's not already there
+            #document_length = len(doc.words)
+            #for tag in doc.tags:
+                #self.docvecs.note_doctag(tag, 0, document_length)
+
             indexed_doctags = self.docvecs.indexed_doctags(doc.tags)
             doctag_indexes, doctag_vectors, doctag_locks, ignored = indexed_doctags
             if self.sg:
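
Patch 8 splits the doctag weight allocation (reset_space) from per-vector initialisation (init_weight) so that add_new_docs can grow doctag_syn0 while preserving the already trained document vectors, and train_new_doc ties that into a normal training pass. A usage sketch against the gensim API this patch targets; the toy documents are illustrative, and note that only the doctag space grows, the word vocabulary is not extended by this patch:

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    docs = [TaggedDocument(words=['human', 'computer', 'interaction'], tags=['doc0']),
            TaggedDocument(words=['graph', 'minors', 'survey'], tags=['doc1'])]
    model = Doc2Vec(docs, size=50, min_count=1)   # initial training

    # later on, new documents arrive: their tags get fresh rows in doctag_syn0,
    # the existing document vectors are copied over unchanged, then training resumes
    new_docs = [TaggedDocument(words=['graph', 'survey', 'minors'], tags=['doc2'])]
    model.train_new_doc(new_docs)

    model.docvecs['doc2']   # the newly added tag now has a trained vector
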