Fix PR #963
Partho Mandal committed Oct 26, 2016
1 parent 3b9bb59 commit df53afa
Showing 4 changed files with 40 additions and 8 deletions.
8 changes: 5 additions & 3 deletions gensim/interfaces.py
@@ -105,8 +105,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):


 class TransformedCorpus(CorpusABC):
-    def __init__(self, obj, corpus, chunksize=None):
+    def __init__(self, obj, corpus, chunksize=None, **kwargs):
         self.obj, self.corpus, self.chunksize = obj, corpus, chunksize
+        for key, value in kwargs.items():  # add the new parameters (e.g. per_word_topics) to the wrapped model object, e.g. an LdaModel
+            setattr(self.obj, key, value)
         self.metadata = False

     def __len__(self):
@@ -156,12 +158,12 @@ def __getitem__(self, vec):
         raise NotImplementedError('cannot instantiate abstract base class')


-    def _apply(self, corpus, chunksize=None):
+    def _apply(self, corpus, chunksize=None, **kwargs):
         """
         Apply the transformation to a whole corpus (as opposed to a single document)
         and return the result as another corpus.
         """
-        return TransformedCorpus(self, corpus, chunksize)
+        return TransformedCorpus(self, corpus, chunksize, **kwargs)
 #endclass TransformationABC


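Note: a minimal sketch of what the new **kwargs forwarding does, not part of this commit. It assumes a toy corpus and a freshly trained LdaModel (the names `texts`, `dictionary`, `bow_corpus`, `lda` are invented here); `_apply` is an internal helper, called directly only to make the attribute forwarding visible.

from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Toy data for illustration only
texts = [['human', 'computer', 'interaction'], ['graph', 'trees', 'user']]
dictionary = Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=2)

# _apply() now forwards extra keywords to TransformedCorpus, whose __init__
# copies each one onto the wrapped transformation object via setattr().
transformed = lda._apply(bow_corpus, per_word_topics=True, minimum_phi_value=0.01)
assert lda.per_word_topics is True        # set by TransformedCorpus.__init__
assert lda.minimum_phi_value == 0.01
# Iteration stays lazy: each document is run through lda[doc] only when consumed.
for result in transformed:
    pass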
16 changes: 13 additions & 3 deletions gensim/models/ldamodel.py
@@ -216,7 +216,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  distributed=False, chunksize=2000, passes=1, update_every=1,
                  alpha='symmetric', eta=None, decay=0.5, offset=1.0,
                  eval_every=10, iterations=50, gamma_threshold=0.001,
-                 minimum_probability=0.01, random_state=None, ns_conf={}):
+                 minimum_probability=0.01, random_state=None, ns_conf={},
+                 minimum_phi_value=0.01, per_word_topics=False):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -297,6 +298,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         self.passes = passes
         self.update_every = update_every
         self.eval_every = eval_every
+        self.minimum_phi_value = minimum_phi_value
+        self.per_word_topics = per_word_topics

         self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

@@ -916,7 +919,12 @@ def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=N
         # if the input vector is a corpus, return a transformed corpus
         is_corpus, corpus = utils.is_corpus(bow)
         if is_corpus:
-            return self._apply(corpus)
+            kwargs = dict(
+                per_word_topics=per_word_topics,
+                minimum_probability=minimum_probability,
+                minimum_phi_value=minimum_phi_value
+            )
+            return self._apply(corpus, **kwargs)

         gamma, phis = self.inference([bow], collect_sstats=True)
         topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution
@@ -977,7 +985,9 @@ def __getitem__(self, bow, eps=None):
         Ignore topics with very low probability (below `eps`).
         """
-        return self.get_document_topics(bow, eps)
+        # Is `eps` equivalent to minimum_probability?
+        eps = self.minimum_probability
+        return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics)

     def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs):
         """
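Note: a hedged usage sketch of the public path the hunks above enable, not taken from the commit. Passing a whole corpus to get_document_topics now returns a lazy TransformedCorpus rather than a single result; when per_word_topics=True, each item should unpack into a (doc_topics, word_topics, word_phis) triple. The toy data below is invented for illustration.

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [['human', 'computer', 'interaction'],
         ['graph', 'trees', 'user'],
         ['graph', 'minors', 'survey']]
dictionary = Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]

# The two new constructor arguments become model-level defaults read by __getitem__.
lda = LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=2,
               minimum_phi_value=0.01, per_word_topics=True)

# A whole corpus now yields a lazy, per-document stream of results.
all_topics = lda.get_document_topics(bow_corpus, per_word_topics=True)
for doc_topics, word_topics, word_phis in all_topics:
    print(doc_topics)   # e.g. [(topic_id, probability), ...]
    print(word_topics)  # e.g. [(word_id, [topic_id, ...]), ...]
    print(word_phis)    # e.g. [(word_id, [(topic_id, phi_value), ...]), ...]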
6 changes: 4 additions & 2 deletions gensim/models/ldamulticore.py
@@ -80,7 +80,8 @@ class LdaMulticore(LdaModel):
     def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
                  chunksize=2000, passes=1, batch=False, alpha='symmetric',
                  eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50,
-                 gamma_threshold=0.001, random_state=None):
+                 gamma_threshold=0.001, random_state=None,
+                 minimum_phi_value=0.01, per_word_topics=False):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -144,7 +145,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
         super(LdaMulticore, self).__init__(corpus=corpus, num_topics=num_topics,
             id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta,
             decay=decay, offset=offset, eval_every=eval_every, iterations=iterations,
-            gamma_threshold=gamma_threshold, random_state=random_state)
+            gamma_threshold=gamma_threshold, random_state=random_state,
+            minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics)


     def update(self, corpus, chunks_as_numpy=False):
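Note: a short hedged sketch for the multicore wrapper, not part of the commit. LdaMulticore only forwards the two new keyword arguments to the LdaModel base constructor, so the inherited __getitem__ and get_document_topics behave as in the single-core sketch above; the toy data is again invented for illustration.

from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

texts = [['human', 'computer', 'interaction'],
         ['graph', 'trees', 'user'],
         ['graph', 'minors', 'survey']]
dictionary = Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]

# The new arguments pass straight through to LdaModel.__init__ via super().
model = LdaMulticore(corpus=bow_corpus, id2word=dictionary, num_topics=2, workers=1,
                     minimum_phi_value=0.01, per_word_topics=True)

# __getitem__ is inherited from LdaModel and now picks up the stored flags.
doc_topics, word_topics, word_phis = model[bow_corpus[0]]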
18 changes: 18 additions & 0 deletions gensim/test/test_ldamodel.py
@@ -249,6 +249,24 @@ def testGetDocumentTopics(self):
             self.assertTrue(isinstance(k, int))
             self.assertTrue(isinstance(v, float))

+        # Test get_document_topics on the whole corpus, not just a single document
+        doc_topics, word_topics, word_phis = model.get_document_topics(self.corpus, per_word_topics=True)
+
+        self.assertEqual(model.state.numdocs, len(corpus))
+
+        for k, v in doc_topics:
+            self.assertTrue(isinstance(k, int))
+            self.assertTrue(isinstance(v, float))
+
+        for w, topic_list in word_topics:
+            self.assertTrue(isinstance(w, int))
+            self.assertTrue(isinstance(topic_list, list))
+
+        for w, phi_values in word_phis:
+            self.assertTrue(isinstance(w, int))
+            self.assertTrue(isinstance(phi_values, list))
+
+
         doc_topics, word_topics, word_phis = model.get_document_topics(self.corpus[1], per_word_topics=True)

         for k, v in doc_topics:
