Use CoherenceModel for LdaModel.top_topics #1427
@@ -31,21 +31,20 @@
 import logging
-import numpy as np
 import numbers
-from random import sample
 import os
+from random import sample

-from gensim import interfaces, utils, matutils
-from gensim.matutils import dirichlet_expectation
-from gensim.models import basemodel
-from gensim.matutils import kullback_leibler, hellinger, jaccard_distance

 from itertools import chain
+import numpy as np
+import six
 from scipy.special import gammaln, psi  # gamma function utils
 from scipy.special import polygamma
 from six.moves import xrange
-import six

+from gensim import interfaces, utils, matutils
+from gensim.matutils import dirichlet_expectation
+from gensim.matutils import kullback_leibler, hellinger, jaccard_distance
+from gensim.models import basemodel, CoherenceModel

 # log(sum(exp(x))) that tries to avoid overflow
 try:
@@ -815,6 +814,14 @@ def show_topic(self, topicid, topn=10):
""" | ||
return [(self.id2word[id], value) for id, value in self.get_topic_terms(topicid, topn)] | ||
|
||
def get_topics(self): | ||
""" | ||
Return the term topic matrix learned during inference. | ||
This is a `num_topics` x `vocabulary_size` np.ndarray of floats. | ||
""" | ||
topics = self.state.get_lambda() | ||
return topics / topics.sum(axis=1)[:, None] | ||
|
||
def get_topic_terms(self, topicid, topn=10): | ||
""" | ||
Return a list of `(word_id, probability)` 2-tuples for the most | ||
|
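For context (not part of the diff): a minimal sketch of how the new `get_topics()` accessor behaves, using a hypothetical toy corpus and variable names. Each row of the returned matrix is one topic's distribution over the vocabulary and sums to 1.

```python
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Tiny illustrative corpus (data and names are hypothetical).
texts = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response", "time"],
    ["graph", "trees", "minors", "survey"],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=5)

topics = lda.get_topics()   # np.ndarray of shape (num_topics, vocabulary_size)
print(topics.shape)         # (2, 11) for this toy corpus
print(topics.sum(axis=1))   # each row is a probability distribution, ~[1. 1.]
```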
@@ -823,72 +830,32 @@ def get_topic_terms(self, topicid, topn=10):
         Only return 2-tuples for the topn most probable words (ignore the rest).

         """
-        topic = self.state.get_lambda()[topicid]
+        topic = self.get_topics()[topicid]
         topic = topic / topic.sum()  # normalize to probability distribution
         bestn = matutils.argsort(topic, topn, reverse=True)
         return [(id, topic[id]) for id in bestn]

-    def top_topics(self, corpus, num_words=20):
+    def top_topics(self, corpus=None, texts=None, dictionary=None, window_size=None,
+                   coherence='u_mass', topn=20, processes=-1):
         """
-        Calculate the Umass topic coherence for each topic. Algorithm from
-        **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic Coherence in Topic Models, CEMNLP 2011.**
+        Calculate the coherence for each topic; default is Umass coherence. See the
[Inline review comment thread on this docstring]
Reviewer: Google-style docstring (here and everywhere).
Author: Not sure what you're looking for here, though I've noticed this comment in several of my PRs. To clarify, are you asking for the first line to start on the same line as the opening quotes (""")? Or are you asking that I include the …
Reviewer: Sorry for the confusion, I mean …
Author: Fixed throughout.
+        `gensim.models.CoherenceModel` for more info on the parameters and the different
+        coherence metrics.
         """
-        is_corpus, corpus = utils.is_corpus(corpus)
-        if not is_corpus:
-            logger.warning("LdaModel.top_topics() called with an empty corpus")
-            return
+        cm = CoherenceModel(
+            model=self, corpus=corpus, texts=texts, dictionary=dictionary,
+            window_size=window_size, coherence=coherence, topn=topn,
+            processes=processes)
+        coherence_scores = cm.get_coherence_per_topic()

-        topics = []
         str_topics = []
-        for topic in self.state.get_lambda():
-            topic = topic / topic.sum()  # normalize to probability distribution
-            bestn = matutils.argsort(topic, topn=num_words, reverse=True)
-            topics.append(bestn)
-            beststr = [(topic[id], self.id2word[id]) for id in bestn]
+        for topic in self.get_topics():
+            bestn = matutils.argsort(topic, topn=topn, reverse=True)
+            beststr = [(topic[_id], self.id2word[_id]) for _id in bestn]
             str_topics.append(beststr)

-        # top_ids are limited to every topics top words. should not exceed the
-        # vocabulary size.
-        top_ids = set(chain.from_iterable(topics))
-
-        # create a document occurence sparse matrix for each word
-        doc_word_list = {}
-        for id in top_ids:
-            id_list = set()
-            for n, document in enumerate(corpus):
-                if id in frozenset(x[0] for x in document):
-                    id_list.add(n)
-
-            doc_word_list[id] = id_list
-
-        coherence_scores = []
-        for t, top_words in enumerate(topics):
-            # Calculate each coherence score C(t, top_words)
-            coherence = 0.0
-            # Sum of top words m=2..M
-            for m in top_words[1:]:
-                # m_docs is v_m^(t)
-                m_docs = doc_word_list[m]
-                m_index = np.where(top_words == m)[0][0]
-
-                # Sum of top words l=1..m
-                # i.e., all words ranked higher than the current word m
-                for l in top_words[:m_index]:
-                    # l_docs is v_l^(t)
-                    l_docs = doc_word_list[l]
-
-                    # make sure this word appears in some documents.
-                    if len(l_docs) > 0:
-                        # co_doc_frequency is D(v_m^(t), v_l^(t))
-                        co_doc_frequency = len(m_docs.intersection(l_docs))
-
-                        # add to the coherence sum for these two words m, l
-                        coherence += np.log((co_doc_frequency + 1.0) / len(l_docs))
-
-            coherence_scores.append((str_topics[t], coherence))
-
-        top_topics = sorted(coherence_scores, key=lambda t: t[1], reverse=True)
-        return top_topics
+        scored_topics = zip(str_topics, coherence_scores)
+        return sorted(scored_topics, key=lambda tup: tup[1], reverse=True)

     def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
         """
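Again for illustration only, reusing the hypothetical `lda` and `corpus` from the sketch above: with the reworked signature, `top_topics` delegates scoring to `CoherenceModel` and returns topics sorted by coherence, each topic given as `(probability, word)` pairs.

```python
# Default behaviour: UMass coherence computed from the bag-of-words corpus.
scored = lda.top_topics(corpus=corpus, coherence='u_mass', topn=10)

# Each entry is ([(probability, word), ...], coherence_score), best topics first.
for topic_words, score in scored:
    print(round(score, 3), [word for _, word in topic_words])
```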
@@ -1007,7 +974,7 @@ def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10,
             raise ValueError("The parameter `other` must be of type `{}`".format(self.__name__))

         distance_func = distances[distance]
-        d1, d2 = self.state.get_lambda(), other.state.get_lambda()
+        d1, d2 = self.get_topics(), other.get_topics()
         t1_size, t2_size = d1.shape[0], d2.shape[0]

         fst_topics = [{w for (w, _) in self.show_topic(topic, topn=num_words)} for topic in xrange(t1_size)]
[Review comment thread]
Reviewer: I don't think that we should use all coherence types for this method; u_mass will be enough for it (for this reason please remove the coherence parameters from the arguments).
Author: Several papers have shown that sliding-window based coherence metrics have higher correlation with human judgements, and u_mass has lower correlation with human judgements than these methods. Why limit to u_mass when it is not the best technique for coherence calculations?
Reviewer: I agree with you, but for the "sliding window" approach the user has to pass corpus, texts and dictionary, which is slightly complicated. Although perhaps you are right: if the default method is UMass, I see no difference for users.
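To make the trade-off discussed above concrete, a hedged sketch reusing the hypothetical `lda`, `corpus` and `texts` from the earlier examples: `u_mass` needs only the bag-of-words corpus, while a sliding-window measure such as `c_v` needs the tokenized texts and a window size, which is the extra complexity the reviewer is pointing at.

```python
# UMass: document co-occurrence statistics from the BoW corpus are enough.
umass_topics = lda.top_topics(corpus=corpus, coherence='u_mass')

# c_v: sliding-window based, needs the original tokenized texts; slower, but the
# author argues it correlates better with human judgements of topic quality.
cv_topics = lda.top_topics(texts=texts, coherence='c_v', window_size=110, topn=10)
```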