Skip to content

Commit

Permalink
Use CoherenceModel for LdaModel.top_topics. Fix piskvorky#1128 (p…
Browse files Browse the repository at this point in the history
…iskvorky#1427)

* Add a `get_topics` method to all topic models, add test coverage for this, and update the `CoherenceModel` to use this for getting topics from models.

* Require topics returned from `get_topics` to be probability distributions for the probabilistic topic models.

* Replace code in `LdaModel.top_topics` with use of `CoherenceModel`.

* Fix docstrings to use Google style throughout PR changes and various LdaModel methods.
  • Loading branch information
macks22 authored and fabriciorsf committed Aug 23, 2017
1 parent 109ffef commit b4bd541
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 171 deletions.
28 changes: 9 additions & 19 deletions gensim/models/coherencemodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@

from gensim import interfaces
from gensim.matutils import argsort
from gensim.models.ldamodel import LdaModel
from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet
from gensim.topic_coherence import (segmentation, probability_estimation,
direct_confirmation_measure, indirect_confirmation_measure,
aggregation)
Expand Down Expand Up @@ -268,23 +266,15 @@ def _topics_differ(self, new_topics):

def _get_topics(self):
"""Internal helper function to return topics from a trained topic model."""
topics = []
if isinstance(self.model, LdaModel):
for topic in self.model.state.get_lambda():
bestn = argsort(topic, topn=self.topn, reverse=True)
topics.append(bestn)
elif isinstance(self.model, LdaVowpalWabbit):
for topic in self.model._get_topics():
bestn = argsort(topic, topn=self.topn, reverse=True)
topics.append(bestn)
elif isinstance(self.model, LdaMallet):
for topic in self.model.word_topics:
bestn = argsort(topic, topn=self.topn, reverse=True)
topics.append(bestn)
else:
raise ValueError("This topic model is not currently supported. Supported topic models "
" are LdaModel, LdaVowpalWabbit and LdaMallet.")
return topics
try:
return [
argsort(topic, topn=self.topn, reverse=True) for topic in
self.model.get_topics()
]
except AttributeError:
raise ValueError(
"This topic model is not currently supported. Supported topic models"
" should implement the `get_topics` method.")

def segment_topics(self):
    """Segment `self.topics` with the segmentation function of `self.measure`.

    Returns:
        Whatever `self.measure.seg` returns when called with `self.topics`.
    """
    return self.measure.seg(self.topics)
Expand Down
14 changes: 12 additions & 2 deletions gensim/models/hdpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,14 @@
import logging
import time
import warnings

import numpy as np
from scipy.special import gammaln, psi # gamma function utils
from six.moves import xrange

from gensim import interfaces, utils, matutils
from gensim.matutils import dirichlet_expectation
from gensim.models import basemodel, ldamodel
from six.moves import xrange

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -456,6 +457,15 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=No
hdp_formatter = HdpTopicFormatter(self.id2word, betas)
return hdp_formatter.show_topic(topic_id, topn, log, formatted)

def get_topics(self):
    """Return the term-topic matrix with every row normalized to sum to 1.

    The matrix is computed as `self.m_lambda + self.m_eta`, and each row is
    then divided by its own sum.

    Returns:
        np.ndarray: `num_topics` x `vocabulary_size` array of floats which represents
        the term topic matrix learned during inference.
    """
    topics = self.m_lambda + self.m_eta
    # Reshape the per-row sums into a column so the division broadcasts row-wise.
    row_sums = topics.sum(axis=1)[:, None]
    return topics / row_sums

def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True):
"""
Print the `num_words` most probable words for `num_topics` number of topics.
Expand Down Expand Up @@ -642,7 +652,7 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words= N
logger.info(topic)
else:
topic = (topic_id, topic_terms)

# we only return the topic_terms
return topic[1]

Expand Down
Loading

0 comments on commit b4bd541

Please sign in to comment.