From df53afa353030fabbe3ee2d3d3cb6410088c97e5 Mon Sep 17 00:00:00 2001
From: Partho Mandal
Date: Tue, 25 Oct 2016 17:44:56 -0700
Subject: [PATCH] Fix PR #963

---
 gensim/interfaces.py          |  8 +++++---
 gensim/models/ldamodel.py     | 16 +++++++++++++---
 gensim/models/ldamulticore.py |  6 ++++--
 gensim/test/test_ldamodel.py  | 18 ++++++++++++++++++
 4 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/gensim/interfaces.py b/gensim/interfaces.py
index e1024723b0..530fab398b 100644
--- a/gensim/interfaces.py
+++ b/gensim/interfaces.py
@@ -105,8 +105,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):


 class TransformedCorpus(CorpusABC):
-    def __init__(self, obj, corpus, chunksize=None):
+    def __init__(self, obj, corpus, chunksize=None, **kwargs):
         self.obj, self.corpus, self.chunksize = obj, corpus, chunksize
+        for key, value in kwargs.items():  # forward extra parameters (e.g. per_word_topics) to the wrapped model
+            setattr(self.obj, key, value)
         self.metadata = False

     def __len__(self):
@@ -156,12 +158,12 @@ def __getitem__(self, vec):
         raise NotImplementedError('cannot instantiate abstract base class')


-    def _apply(self, corpus, chunksize=None):
+    def _apply(self, corpus, chunksize=None, **kwargs):
         """
         Apply the transformation to a whole corpus (as opposed to a single
         document) and return the result as another corpus.

         """
-        return TransformedCorpus(self, corpus, chunksize)
+        return TransformedCorpus(self, corpus, chunksize, **kwargs)
 #endclass TransformationABC

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 7def8966b3..c8ea3a5551 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -216,7 +216,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  distributed=False, chunksize=2000, passes=1, update_every=1,
                  alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
                  iterations=50, gamma_threshold=0.001,
-                 minimum_probability=0.01, random_state=None, ns_conf={}):
+                 minimum_probability=0.01, random_state=None, ns_conf={},
+                 minimum_phi_value=0.01, per_word_topics=False):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -297,6 +298,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         self.passes = passes
         self.update_every = update_every
         self.eval_every = eval_every
+        self.minimum_phi_value = minimum_phi_value
+        self.per_word_topics = per_word_topics

         self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

@@ -916,7 +919,12 @@ def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=N
         # if the input vector is a corpus, return a transformed corpus
         is_corpus, corpus = utils.is_corpus(bow)
         if is_corpus:
-            return self._apply(corpus)
+            kwargs = dict(
+                per_word_topics=per_word_topics,
+                minimum_probability=minimum_probability,
+                minimum_phi_value=minimum_phi_value
+            )
+            return self._apply(corpus, **kwargs)

         gamma, phis = self.inference([bow], collect_sstats=True)
         topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution
@@ -977,7 +985,9 @@ def __getitem__(self, bow, eps=None):
         """
         Ignore topics with very low probability (below `eps`).
         """
-        return self.get_document_topics(bow, eps)
+        # `eps` plays the role of `minimum_probability`; fall back to the model default if unset
+        eps = self.minimum_probability if eps is None else eps
+        return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics)

     def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs):
         """
diff --git a/gensim/models/ldamulticore.py b/gensim/models/ldamulticore.py
index e7ab9c983c..f66c76bf04 100644
--- a/gensim/models/ldamulticore.py
+++ b/gensim/models/ldamulticore.py
@@ -80,7 +80,8 @@ class LdaMulticore(LdaModel):
     def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
                  chunksize=2000, passes=1, batch=False, alpha='symmetric',
                  eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50,
-                 gamma_threshold=0.001, random_state=None):
+                 gamma_threshold=0.001, random_state=None,
+                 minimum_phi_value=0.01, per_word_topics=False):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -144,7 +145,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
         super(LdaMulticore, self).__init__(corpus=corpus, num_topics=num_topics,
             id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta,
             decay=decay, offset=offset, eval_every=eval_every, iterations=iterations,
-            gamma_threshold=gamma_threshold, random_state=random_state)
+            gamma_threshold=gamma_threshold, random_state=random_state,
+            minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics)

     def update(self, corpus, chunks_as_numpy=False):
diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py
index a96d96ae6f..4304aac78a 100644
--- a/gensim/test/test_ldamodel.py
+++ b/gensim/test/test_ldamodel.py
@@ -249,6 +249,24 @@ def testGetDocumentTopics(self):
             self.assertTrue(isinstance(k, int))
             self.assertTrue(isinstance(v, float))

+        # exercise get_document_topics() on a whole corpus: iterating the returned
+        # transformed corpus yields one (doc_topics, word_topics, word_phis) tuple per document
+        all_topics = model.get_document_topics(self.corpus, per_word_topics=True)
+
+        self.assertEqual(model.state.numdocs, len(corpus))
+
+        for doc_topics, word_topics, word_phis in all_topics:
+            for k, v in doc_topics:
+                self.assertTrue(isinstance(k, int))
+                self.assertTrue(isinstance(v, float))
+
+            for w, topic_list in word_topics:
+                self.assertTrue(isinstance(w, int))
+                self.assertTrue(isinstance(topic_list, list))
+
+            for w, phi_values in word_phis:
+                self.assertTrue(isinstance(w, int))
+                self.assertTrue(isinstance(phi_values, list))
+
         doc_topics, word_topics, word_phis = model.get_document_topics(self.corpus[1], per_word_topics=True)

         for k, v in doc_topics:
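
For context, a minimal usage sketch of the behaviour this patch is meant to enable, assuming the patch is applied on top of gensim: with `per_word_topics=True`, `get_document_topics()` on a whole corpus returns a lazily evaluated `TransformedCorpus` whose items are `(doc_topics, word_topics, word_phis)` tuples, while a single bag-of-words document still returns the tuple directly. The toy texts and parameter values below are illustrative, not taken from the gensim test fixtures.

    # Usage sketch (assumes this patch is applied); the toy corpus is illustrative only.
    from gensim.corpora import Dictionary
    from gensim.models import LdaModel

    texts = [
        ['human', 'interface', 'computer'],
        ['survey', 'user', 'computer', 'system', 'response', 'time'],
        ['graph', 'trees'],
        ['graph', 'minors', 'trees'],
    ]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    model = LdaModel(corpus, id2word=dictionary, num_topics=2, passes=10,
                     minimum_phi_value=0.01, per_word_topics=False)

    # Whole corpus in: a TransformedCorpus comes back; per_word_topics and the
    # thresholds are forwarded through _apply(), so iteration yields per-document
    # (doc_topics, word_topics, word_phis) tuples.
    all_topics = model.get_document_topics(corpus, per_word_topics=True)
    for doc_topics, word_topics, word_phis in all_topics:
        print(doc_topics)   # [(topic_id, probability), ...]
        print(word_topics)  # per-word topic assignments, one (word_id, [...]) pair per word
        print(word_phis)    # per-word phi values, one (word_id, [...]) pair per word

    # Single document in: the 3-tuple is returned directly.
    doc_topics, word_topics, word_phis = model.get_document_topics(corpus[0], per_word_topics=True)

One design note on the patch itself: `TransformedCorpus` forwards the extra keyword arguments by calling `setattr` on the wrapped model, so a corpus-level call mutates the model's own `per_word_topics` / `minimum_phi_value` / `minimum_probability` attributes, and later `model[doc]` lookups will reuse whatever settings were passed last.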