Use CoherenceModel for LdaModel.top_topics. Fix piskvorky#1128 (p…

…iskvorky#1427) * Add a `get_topics` method to all topic models, add test coverage for this, and update the `CoherenceModel` to use this for getting topics from models. * Require topics returned from `get_topics` to be probability distributions for the probabilistic topic models. * Replace code in `LdaModel.top_topics` with use of `CoherenceModel`. * Fix docstrings to use Google style throughout PR changes and various LdaModel methods.
LINE-PESC · Aug 23, 2017 · b4bd541 · b4bd541
1 parent 109ffef
commit b4bd541
Show file tree

Hide file tree

Showing 8 changed files with 225 additions and 171 deletions.
diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py
@@ -26,8 +26,6 @@
 
 from gensim import interfaces
 from gensim.matutils import argsort
-from gensim.models.ldamodel import LdaModel
-from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet
 from gensim.topic_coherence import (segmentation, probability_estimation,
                                     direct_confirmation_measure, indirect_confirmation_measure,
                                     aggregation)
@@ -268,23 +266,15 @@ def _topics_differ(self, new_topics):
 
     def _get_topics(self):
         """Internal helper function to return topics from a trained topic model."""
-        topics = []
-        if isinstance(self.model, LdaModel):
-            for topic in self.model.state.get_lambda():
-                bestn = argsort(topic, topn=self.topn, reverse=True)
-                topics.append(bestn)
-        elif isinstance(self.model, LdaVowpalWabbit):
-            for topic in self.model._get_topics():
-                bestn = argsort(topic, topn=self.topn, reverse=True)
-                topics.append(bestn)
-        elif isinstance(self.model, LdaMallet):
-            for topic in self.model.word_topics:
-                bestn = argsort(topic, topn=self.topn, reverse=True)
-                topics.append(bestn)
-        else:
-            raise ValueError("This topic model is not currently supported. Supported topic models "
-                             " are LdaModel, LdaVowpalWabbit and LdaMallet.")
-        return topics
+        try:
+            return [
+                argsort(topic, topn=self.topn, reverse=True) for topic in
+                self.model.get_topics()
+            ]
+        except AttributeError:
+            raise ValueError(
+                "This topic model is not currently supported. Supported topic models"
+                " should implement the `get_topics` method.")
 
     def segment_topics(self):
         return self.measure.seg(self.topics)

diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py
@@ -36,13 +36,14 @@
 import logging
 import time
 import warnings
+
 import numpy as np
 from scipy.special import gammaln, psi  # gamma function utils
+from six.moves import xrange
 
 from gensim import interfaces, utils, matutils
 from gensim.matutils import dirichlet_expectation
 from gensim.models import basemodel, ldamodel
-from six.moves import xrange
 
 logger = logging.getLogger(__name__)
 
@@ -456,6 +457,15 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=No
         hdp_formatter = HdpTopicFormatter(self.id2word, betas)
         return hdp_formatter.show_topic(topic_id, topn, log, formatted)
 
+    def get_topics(self):
+        """
+        Returns:
+            np.ndarray: `num_topics` x `vocabulary_size` array of floats which represents
+            the term topic matrix learned during inference.
+        """
+        topics = self.m_lambda + self.m_eta
+        return topics / topics.sum(axis=1)[:, None]
+
     def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True):
         """
         Print the `num_words` most probable words for `num_topics` number of topics.
@@ -642,7 +652,7 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words= N
                 logger.info(topic)
         else:
             topic = (topic_id, topic_terms)
-        
+
         # we only return the topic_terms
         return topic[1]