Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use CoherenceModel for LdaModel.top_topics #1427

Merged
merged 4 commits into from
Aug 18, 2017
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 9 additions & 19 deletions gensim/models/coherencemodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@

from gensim import interfaces
from gensim.matutils import argsort
from gensim.models.ldamodel import LdaModel
from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet
from gensim.topic_coherence import (segmentation, probability_estimation,
direct_confirmation_measure, indirect_confirmation_measure,
aggregation)
Expand Down Expand Up @@ -261,23 +259,15 @@ def _topics_differ(self, new_topics):

def _get_topics(self):
"""Internal helper function to return topics from a trained topic model."""
topics = []
if isinstance(self.model, LdaModel):
for topic in self.model.state.get_lambda():
bestn = argsort(topic, topn=self.topn, reverse=True)
topics.append(bestn)
elif isinstance(self.model, LdaVowpalWabbit):
for topic in self.model._get_topics():
bestn = argsort(topic, topn=self.topn, reverse=True)
topics.append(bestn)
elif isinstance(self.model, LdaMallet):
for topic in self.model.word_topics:
bestn = argsort(topic, topn=self.topn, reverse=True)
topics.append(bestn)
else:
raise ValueError("This topic model is not currently supported. Supported topic models "
" are LdaModel, LdaVowpalWabbit and LdaMallet.")
return topics
try:
return [
argsort(topic, topn=self.topn, reverse=True) for topic in
self.model.get_topics()
]
except AttributeError:
raise ValueError(
"This topic model is not currently supported. Supported topic models"
" should implement the `get_topics` method.")

def segment_topics(self):
return self.measure.seg(self.topics)
Expand Down
13 changes: 11 additions & 2 deletions gensim/models/hdpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,14 @@
import logging
import time
import warnings

import numpy as np
from scipy.special import gammaln, psi # gamma function utils
from six.moves import xrange

from gensim import interfaces, utils, matutils
from gensim.matutils import dirichlet_expectation
from gensim.models import basemodel, ldamodel
from six.moves import xrange

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -456,6 +457,14 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=No
hdp_formatter = HdpTopicFormatter(self.id2word, betas)
return hdp_formatter.show_topic(topic_id, topn, log, formatted)

def get_topics(self):
    """
    Return the term topic matrix learned during inference.
    This is a `num_topics` x `vocabulary_size` np.ndarray of floats.
    """
    # Add m_eta to the variational parameter m_lambda, then divide each row
    # by its total so every topic is a probability distribution over terms.
    unnormalized = self.m_lambda + self.m_eta
    row_totals = unnormalized.sum(axis=1)
    return unnormalized / row_totals[:, np.newaxis]

def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True):
"""
Print the `num_words` most probable words for `num_topics` number of topics.
Expand Down Expand Up @@ -642,7 +651,7 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words= N
logger.info(topic)
else:
topic = (topic_id, topic_terms)

# we only return the topic_terms
return topic[1]

Expand Down
99 changes: 33 additions & 66 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,20 @@


import logging
import numpy as np
import numbers
from random import sample
import os
from random import sample

from gensim import interfaces, utils, matutils
from gensim.matutils import dirichlet_expectation
from gensim.models import basemodel
from gensim.matutils import kullback_leibler, hellinger, jaccard_distance

from itertools import chain
import numpy as np
import six
from scipy.special import gammaln, psi # gamma function utils
from scipy.special import polygamma
from six.moves import xrange
import six

from gensim import interfaces, utils, matutils
from gensim.matutils import dirichlet_expectation
from gensim.matutils import kullback_leibler, hellinger, jaccard_distance
from gensim.models import basemodel, CoherenceModel

# log(sum(exp(x))) that tries to avoid overflow
try:
Expand Down Expand Up @@ -815,6 +814,14 @@ def show_topic(self, topicid, topn=10):
"""
return [(self.id2word[id], value) for id, value in self.get_topic_terms(topicid, topn)]

def get_topics(self):
    """
    Return the term topic matrix learned during inference.
    This is a `num_topics` x `vocabulary_size` np.ndarray of floats, where
    each row sums to 1 (a probability distribution over the vocabulary).
    """
    raw = self.state.get_lambda()
    # Normalize each topic's unnormalized word weights row-wise.
    return raw / raw.sum(axis=1)[:, np.newaxis]

def get_topic_terms(self, topicid, topn=10):
"""
Return a list of `(word_id, probability)` 2-tuples for the most
Expand All @@ -823,72 +830,32 @@ def get_topic_terms(self, topicid, topn=10):
Only return 2-tuples for the topn most probable words (ignore the rest).

"""
topic = self.state.get_lambda()[topicid]
topic = self.get_topics()[topicid]
topic = topic / topic.sum() # normalize to probability distribution
bestn = matutils.argsort(topic, topn, reverse=True)
return [(id, topic[id]) for id in bestn]

def top_topics(self, corpus, num_words=20):
def top_topics(self, corpus=None, texts=None, dictionary=None, window_size=None,
coherence='u_mass', topn=20, processes=-1):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that we should use all coherence types for this method, u_mass will be enough for it (for this reason please remove coherence parameters from arguments)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Several papers have shown that sliding-window based coherence metrics have higher correlation with human judgements, and u_mass has lower correlations with human judgements than these methods. Why limit to u_mass when it is not the best technique for coherence calculations?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with you, but for "sliding windows" approach user should pass corpus, texts and dictionary, it's slightly complicated.

Although perhaps you are right, if the default method is UMass I see no difference for users.

"""
Calculate the Umass topic coherence for each topic. Algorithm from
**Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic Coherence in Topic Models, CEMNLP 2011.**
Calculate the coherence for each topic; default is Umass coherence. See the
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Google-style docstring (here and everywhere)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what you're looking for here, though I've noticed this comment in several of my PRs. To clarify, are you asking for the beginning line to start on the same line as the start quotes (""")? Or are you asking that I include the Args specification for this method and the others? Thanks!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the confusion, I mean Args and Returns sections.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed throughout

`gensim.models.CoherenceModel` for more info on the parameters and the different
coherence metrics.
"""
is_corpus, corpus = utils.is_corpus(corpus)
if not is_corpus:
logger.warning("LdaModel.top_topics() called with an empty corpus")
return
cm = CoherenceModel(
model=self, corpus=corpus, texts=texts, dictionary=dictionary,
window_size=window_size, coherence=coherence, topn=topn,
processes=processes)
coherence_scores = cm.get_coherence_per_topic()

topics = []
str_topics = []
for topic in self.state.get_lambda():
topic = topic / topic.sum() # normalize to probability distribution
bestn = matutils.argsort(topic, topn=num_words, reverse=True)
topics.append(bestn)
beststr = [(topic[id], self.id2word[id]) for id in bestn]
for topic in self.get_topics():
bestn = matutils.argsort(topic, topn=topn, reverse=True)
beststr = [(topic[_id], self.id2word[_id]) for _id in bestn]
str_topics.append(beststr)

# top_ids are limited to every topics top words. should not exceed the
# vocabulary size.
top_ids = set(chain.from_iterable(topics))

# create a document occurence sparse matrix for each word
doc_word_list = {}
for id in top_ids:
id_list = set()
for n, document in enumerate(corpus):
if id in frozenset(x[0] for x in document):
id_list.add(n)

doc_word_list[id] = id_list

coherence_scores = []
for t, top_words in enumerate(topics):
# Calculate each coherence score C(t, top_words)
coherence = 0.0
# Sum of top words m=2..M
for m in top_words[1:]:
# m_docs is v_m^(t)
m_docs = doc_word_list[m]
m_index = np.where(top_words == m)[0][0]

# Sum of top words l=1..m
# i.e., all words ranked higher than the current word m
for l in top_words[:m_index]:
# l_docs is v_l^(t)
l_docs = doc_word_list[l]

# make sure this word appears in some documents.
if len(l_docs) > 0:
# co_doc_frequency is D(v_m^(t), v_l^(t))
co_doc_frequency = len(m_docs.intersection(l_docs))

# add to the coherence sum for these two words m, l
coherence += np.log((co_doc_frequency + 1.0) / len(l_docs))

coherence_scores.append((str_topics[t], coherence))

top_topics = sorted(coherence_scores, key=lambda t: t[1], reverse=True)
return top_topics
scored_topics = zip(str_topics, coherence_scores)
return sorted(scored_topics, key=lambda tup: tup[1], reverse=True)

def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
"""
Expand Down Expand Up @@ -1007,7 +974,7 @@ def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10,
raise ValueError("The parameter `other` must be of type `{}`".format(self.__name__))

distance_func = distances[distance]
d1, d2 = self.state.get_lambda(), other.state.get_lambda()
d1, d2 = self.get_topics(), other.get_topics()
t1_size, t2_size = d1.shape[0], d2.shape[0]

fst_topics = [{w for (w, _) in self.show_topic(topic, topn=num_words)} for topic in xrange(t1_size)]
Expand Down
24 changes: 20 additions & 4 deletions gensim/models/lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,11 @@
import scipy.linalg
import scipy.sparse
from scipy.sparse import sparsetools

from gensim import interfaces, matutils, utils
from gensim.models import basemodel

from six import iterkeys
from six.moves import xrange

from gensim import interfaces, matutils, utils
from gensim.models import basemodel

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -470,6 +468,24 @@ def __getitem__(self, bow, scaled=False, chunksize=512):
result = matutils.Dense2Corpus(topic_dist)
return result

def get_topics(self):
    """
    Return the term topic matrix learned during inference.
    This is a `num_topics` x `vocabulary_size` np.ndarray of floats, where
    each row (topic vector) is L2-normalized to unit length.

    NOTE: The number of topics can actually be smaller than `self.num_topics`,
    if there were not enough factors (real rank of input matrix smaller than
    `self.num_topics`).
    """
    # `asarray` guards against `self.projection.u` being an np.matrix
    # (the original code flattened each row for the same reason).
    projections = np.asarray(self.projection.u.T)
    # Vectorized L2 row norms replace the per-row Python loop; the original
    # also wrapped the already-scalar `np.dot(c, c)` in a redundant `np.sum`.
    norms = np.sqrt(np.sum(projections ** 2, axis=1))
    return projections / norms[:, np.newaxis]

def show_topic(self, topicno, topn=10):
"""
Return a specified topic (=left singular vector), 0 <= `topicno` < `self.num_topics`,
Expand Down
19 changes: 12 additions & 7 deletions gensim/models/wrappers/ldamallet.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,22 +30,19 @@


import logging
import os
import random
import tempfile
import os

import numpy

import xml.etree.ElementTree as et
import zipfile

from six import iteritems
import numpy
from smart_open import smart_open

from gensim import utils, matutils
from gensim.utils import check_output, revdict
from gensim.models.ldamodel import LdaModel
from gensim.models import basemodel
from gensim.models.ldamodel import LdaModel
from gensim.utils import check_output, revdict

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -213,6 +210,14 @@ def load_document_topics(self):
"""
return self.read_doctopics(self.fdoctopics())

def get_topics(self):
    """
    Return the term topic matrix learned during inference.
    This is a `num_topics` x `vocabulary_size` np.ndarray of floats.
    """
    # Each row of `word_topics` holds one topic's word weights; dividing by
    # the row total turns it into a probability distribution.
    word_topic_weights = self.word_topics
    row_totals = word_topic_weights.sum(axis=1)
    return word_topic_weights / row_totals[:, None]

def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
"""
Print the `num_words` most probable words for `num_topics` number of topics.
Expand Down
16 changes: 12 additions & 4 deletions gensim/models/wrappers/ldavowpalwabbit.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,15 @@
.. [2] http://www.cs.princeton.edu/~mdhoffma/
"""

from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import logging
import tempfile
import os
import shutil
import subprocess
import tempfile

import numpy

Expand Down Expand Up @@ -235,6 +235,14 @@ def log_perplexity(self, chunk):
corpus_words)
return bound

def get_topics(self):
    """
    Return the term topic matrix learned during inference.
    This is a `num_topics` x `vocabulary_size` np.ndarray of floats.
    """
    # Fetch the raw topic-word weights from the wrapper's internal accessor
    # and normalize every row into a probability distribution.
    raw_topics = self._get_topics()
    return raw_topics / raw_topics.sum(axis=1)[:, None]

def print_topics(self, num_topics=10, num_words=10):
return self.show_topics(num_topics, num_words, log=True)

Expand Down
12 changes: 11 additions & 1 deletion gensim/test/basetests.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@
Automated tests for checking transformation algorithms (the models package).
"""

import six
import numpy as np
import six


class TestBaseTopicModel(object):
def testPrintTopic(self):
Expand Down Expand Up @@ -41,3 +42,12 @@ def testShowTopics(self):
for k, v in topic:
self.assertTrue(isinstance(k, six.string_types))
self.assertTrue(isinstance(v, (np.floating, float)))

def testGetTopics(self):
    # Every topic returned by the model must be a float64 vector over the
    # full vocabulary whose entries sum (approximately) to 1.
    topics = self.model.get_topics()
    expected_vocab_size = len(self.model.id2word)
    for topic in topics:
        self.assertIsInstance(topic, np.ndarray)
        self.assertEqual(topic.dtype, np.float64)
        self.assertEqual(topic.shape[0], expected_vocab_size)
        self.assertAlmostEqual(np.sum(topic), 1.0, 5)
Loading