Use CoherenceModel for LdaModel.top_topics #1427
@@ -31,21 +31,20 @@
 import logging
-import numpy as np
 import numbers
-from random import sample
 import os
+from random import sample

-from gensim import interfaces, utils, matutils
-from gensim.matutils import dirichlet_expectation
-from gensim.models import basemodel
-from gensim.matutils import kullback_leibler, hellinger, jaccard_distance

 from itertools import chain
+import numpy as np
+import six
 from scipy.special import gammaln, psi  # gamma function utils
 from scipy.special import polygamma
 from six.moves import xrange
-import six

+from gensim import interfaces, utils, matutils
+from gensim.matutils import dirichlet_expectation
+from gensim.matutils import kullback_leibler, hellinger, jaccard_distance
+from gensim.models import basemodel, CoherenceModel

 # log(sum(exp(x))) that tries to avoid overflow
 try:
@@ -815,6 +814,14 @@ def show_topic(self, topicid, topn=10):
""" | ||
return [(self.id2word[id], value) for id, value in self.get_topic_terms(topicid, topn)] | ||
|
||
def get_topics(self): | ||
""" | ||
Return the term topic matrix learned during inference. | ||
This is a `num_topics` x `vocabulary_size` np.ndarray of floats. | ||
""" | ||
topics = self.state.get_lambda() | ||
return topics / topics.sum(axis=1)[:, None] | ||
|
||
def get_topic_terms(self, topicid, topn=10): | ||
""" | ||
Return a list of `(word_id, probability)` 2-tuples for the most | ||
|
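For context (not part of the diff): a minimal sketch of how the new `get_topics()` accessor behaves, using a hypothetical toy corpus and variable names. Each row of the returned matrix is one topic's distribution over the vocabulary and sums to 1.

```python
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Tiny illustrative corpus (data and names are hypothetical).
texts = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response", "time"],
    ["graph", "trees", "minors", "survey"],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=5)

topics = lda.get_topics()   # np.ndarray of shape (num_topics, vocabulary_size)
print(topics.shape)         # (2, 11) for this toy corpus
print(topics.sum(axis=1))   # each row is a probability distribution, ~[1. 1.]
```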
@@ -823,72 +830,32 @@ def get_topic_terms(self, topicid, topn=10):
         Only return 2-tuples for the topn most probable words (ignore the rest).

         """
-        topic = self.state.get_lambda()[topicid]
+        topic = self.get_topics()[topicid]
         topic = topic / topic.sum()  # normalize to probability distribution
         bestn = matutils.argsort(topic, topn, reverse=True)
         return [(id, topic[id]) for id in bestn]

-    def top_topics(self, corpus, num_words=20):
+    def top_topics(self, corpus=None, texts=None, dictionary=None, window_size=None,
+                   coherence='u_mass', topn=20, processes=-1):
         """
-        Calculate the Umass topic coherence for each topic. Algorithm from
-        **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic Coherence in Topic Models, CEMNLP 2011.**
+        Calculate the coherence for each topic; default is Umass coherence. See the
[Inline review comment thread on this docstring]
Reviewer: Google-style docstring (here and everywhere).
Author: Not sure what you're looking for here, though I've noticed this comment in several of my PRs. To clarify, are you asking for the first line to start on the same line as the opening quotes (""")? Or are you asking that I include the …
Reviewer: Sorry for the confusion, I mean …
Author: Fixed throughout.
+        `gensim.models.CoherenceModel` for more info on the parameters and the different
+        coherence metrics.
         """
-        is_corpus, corpus = utils.is_corpus(corpus)
-        if not is_corpus:
-            logger.warning("LdaModel.top_topics() called with an empty corpus")
-            return
+        cm = CoherenceModel(
+            model=self, corpus=corpus, texts=texts, dictionary=dictionary,
+            window_size=window_size, coherence=coherence, topn=topn,
+            processes=processes)
+        coherence_scores = cm.get_coherence_per_topic()

-        topics = []
         str_topics = []
-        for topic in self.state.get_lambda():
-            topic = topic / topic.sum()  # normalize to probability distribution
-            bestn = matutils.argsort(topic, topn=num_words, reverse=True)
-            topics.append(bestn)
-            beststr = [(topic[id], self.id2word[id]) for id in bestn]
+        for topic in self.get_topics():
+            bestn = matutils.argsort(topic, topn=topn, reverse=True)
+            beststr = [(topic[_id], self.id2word[_id]) for _id in bestn]
             str_topics.append(beststr)

-        # top_ids are limited to every topics top words. should not exceed the
-        # vocabulary size.
-        top_ids = set(chain.from_iterable(topics))
-
-        # create a document occurence sparse matrix for each word
-        doc_word_list = {}
-        for id in top_ids:
-            id_list = set()
-            for n, document in enumerate(corpus):
-                if id in frozenset(x[0] for x in document):
-                    id_list.add(n)
-
-            doc_word_list[id] = id_list
-
-        coherence_scores = []
-        for t, top_words in enumerate(topics):
-            # Calculate each coherence score C(t, top_words)
-            coherence = 0.0
-            # Sum of top words m=2..M
-            for m in top_words[1:]:
-                # m_docs is v_m^(t)
-                m_docs = doc_word_list[m]
-                m_index = np.where(top_words == m)[0][0]
-
-                # Sum of top words l=1..m
-                # i.e., all words ranked higher than the current word m
-                for l in top_words[:m_index]:
-                    # l_docs is v_l^(t)
-                    l_docs = doc_word_list[l]
-
-                    # make sure this word appears in some documents.
-                    if len(l_docs) > 0:
-                        # co_doc_frequency is D(v_m^(t), v_l^(t))
-                        co_doc_frequency = len(m_docs.intersection(l_docs))
-
-                        # add to the coherence sum for these two words m, l
-                        coherence += np.log((co_doc_frequency + 1.0) / len(l_docs))
-
-            coherence_scores.append((str_topics[t], coherence))
-
-        top_topics = sorted(coherence_scores, key=lambda t: t[1], reverse=True)
-        return top_topics
+        scored_topics = zip(str_topics, coherence_scores)
+        return sorted(scored_topics, key=lambda tup: tup[1], reverse=True)

     def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
         """
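Again for illustration only, reusing the hypothetical `lda` and `corpus` from the sketch above: with the reworked signature, `top_topics` delegates scoring to `CoherenceModel` and returns topics sorted by coherence, each topic given as `(probability, word)` pairs.

```python
# Default behaviour: UMass coherence computed from the bag-of-words corpus.
scored = lda.top_topics(corpus=corpus, coherence='u_mass', topn=10)

# Each entry is ([(probability, word), ...], coherence_score), best topics first.
for topic_words, score in scored:
    print(round(score, 3), [word for _, word in topic_words])
```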
@@ -1007,7 +974,7 @@ def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10,
             raise ValueError("The parameter `other` must be of type `{}`".format(self.__name__))

         distance_func = distances[distance]
-        d1, d2 = self.state.get_lambda(), other.state.get_lambda()
+        d1, d2 = self.get_topics(), other.get_topics()
         t1_size, t2_size = d1.shape[0], d2.shape[0]

         fst_topics = [{w for (w, _) in self.show_topic(topic, topn=num_words)} for topic in xrange(t1_size)]
[Review comment thread]
Reviewer: I don't think that we should use all coherence types for this method; u_mass will be enough for it (for this reason please remove the coherence parameters from the arguments).
Author: Several papers have shown that sliding-window based coherence metrics have higher correlation with human judgements, and u_mass has lower correlation with human judgements than these methods. Why limit to u_mass when it is not the best technique for coherence calculations?
Reviewer: I agree with you, but for the "sliding window" approach the user has to pass corpus, texts and dictionary, which is slightly complicated. Although perhaps you are right: if the default method is UMass, I see no difference for users.
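To make the trade-off discussed above concrete, a hedged sketch reusing the hypothetical `lda`, `corpus` and `texts` from the earlier examples: `u_mass` needs only the bag-of-words corpus, while a sliding-window measure such as `c_v` needs the tokenized texts and a window size, which is the extra complexity the reviewer is pointing at.

```python
# UMass: document co-occurrence statistics from the BoW corpus are enough.
umass_topics = lda.top_topics(corpus=corpus, coherence='u_mass')

# c_v: sliding-window based, needs the original tokenized texts; slower, but the
# author argues it correlates better with human judgements of topic quality.
cv_topics = lda.top_topics(texts=texts, coherence='c_v', window_size=110, topn=10)
```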