diff --git a/CHANGELOG.md b/CHANGELOG.md index 4582d7ca57..8912969a5d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ Changes * Implemented LsiModel.docs_processed attribute * Added LdaMallet support. Added LdaVowpalWabbit, LdaMallet example to notebook. Added test suite for coherencemodel and aggregation. Added `topics` parameter to coherencemodel. Can now provide tokenized topics to calculate coherence value (@dsquareindia, #750) +* Added 'c_uci', 'c_npmi' coherence measures. Added window_size parameter to CoherenceModel init. Improved backtracking for + indirect confirmation. (@dsquareindia) 0.13.1, 2016-06-22 diff --git a/docs/notebooks/topic_coherence-movies.ipynb b/docs/notebooks/topic_coherence-movies.ipynb new file mode 100644 index 0000000000..e38576f257 --- /dev/null +++ b/docs/notebooks/topic_coherence-movies.ipynb @@ -0,0 +1,445 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Benchmark testing of coherence pipeline on Movies dataset:\n", + "## How to find how well coherence measure matches your manual annotators" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Introduction__: For the validation of any model adapted from a paper, it is of utmost importance that the results of benchmark testing on the datasets listed in the paper match between the actual implementation (palmetto) and gensim. This coherence pipeline has been implemented from the work done by Roeder et al. The paper can be found [here](http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf).\n", + "\n", + "__Approach__ :\n", + "1. We will use the Movies dataset first. This dataset along with the topics on which the coherence is calculated and the gold (human) ratings on these topics can be found [here](http://139.18.2.164/mroeder/palmetto/datasets/).\n", + "2. We will then calculate the coherence on these topics using the pipeline implemented in gensim.\n", + "3. Once we have got all our coherence values on these topics we will calculate the correlation with the human ratings using pearson's r.\n", + "4. We will compare this final correlation value with the values listed in the paper and see if the pipeline is working as expected." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The line_profiler extension is already loaded. 
To reload it, use:\n", + " %reload_ext line_profiler\n" + ] + } + ], + "source": [ + "import re\n", + "import os\n", + "\n", + "from scipy.stats import pearsonr\n", + "from datetime import datetime\n", + "\n", + "from gensim.models import CoherenceModel\n", + "from gensim.corpora.dictionary import Dictionary\n", + "%load_ext line_profiler # This was used for finding out which line was taking maximum time for indirect confirmation measure" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download the dataset from the link and plug in the location here" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "prefix = \"/home/devashish/datasets/Movies/movie/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken: 0:10:23.956500\n" + ] + } + ], + "source": [ + "start = datetime.now()\n", + "texts = []\n", + "for fil in os.listdir(prefix):\n", + " for line in open(prefix + fil):\n", + " # lower case all words\n", + " lowered = line.lower()\n", + " #remove punctuation and split into seperate words\n", + " words = re.findall(r'\\w+', lowered, flags = re.UNICODE | re.LOCALE)\n", + " texts.append(words)\n", + "end = datetime.now()\n", + "print \"Time taken: %s\" % (end - start)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken: 0:01:44.047829\n" + ] + } + ], + "source": [ + "start = datetime.now()\n", + "dictionary = Dictionary(texts)\n", + "corpus = [dictionary.doc2bow(text) for text in texts]\n", + "end = datetime.now()\n", + "print \"Time taken: %s\" % (end - start)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cross validate the numbers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "According to the paper the number of documents should be 108952 with a vocabulary of 1625124. The difference is because of a difference in preprocessing. However the results obtained are still very similar." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "124234\n", + "Dictionary(758123 unique tokens: [u'schelberger', u'mdbg', u'shatzky', u'bhetan', u'verplank']...)\n" + ] + } + ], + "source": [ + "print len(corpus)\n", + "print dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[[]]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "topics = [] # list of 100 topics\n", + "for l in open('/home/devashish/datasets/Movies/topicsMovie.txt'):\n", + " topics.append([l.split()])\n", + "topics.pop(100)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "human_scores = []\n", + "for l in open('/home/devashish/datasets/Movies/goldMovie.txt'):\n", + " human_scores.append(float(l.strip()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Start off with u_mass coherence measure." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken: 0:20:44.833342\n" + ] + } + ], + "source": [ + "start = datetime.now()\n", + "u_mass = []\n", + "flags = []\n", + "for n, topic in enumerate(topics):\n", + " try:\n", + " cm = CoherenceModel(topics=topic, corpus=corpus, dictionary=dictionary, coherence='u_mass')\n", + " u_mass.append(cm.get_coherence())\n", + " except KeyError:\n", + " flags.append(n)\n", + "end = datetime.now()\n", + "print \"Time taken: %s\" % (end - start)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Start c_v coherence measure\n", + "This is expected to take much more time since `c_v` uses a sliding window to perform probability estimation and uses the cosine similarity indirect confirmation measure." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken: 19:50:11.214341\n" + ] + } + ], + "source": [ + "start = datetime.now()\n", + "c_v = []\n", + "for n, topic in enumerate(topics):\n", + " try:\n", + " cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary, coherence='c_v')\n", + " c_v.append(cm.get_coherence())\n", + " except KeyError:\n", + " pass\n", + "end = datetime.now()\n", + "print \"Time taken: %s\" % (end - start)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Start c_uci and c_npmi coherence measures\n", + "They should be taking lesser time than c_v but should have a higher correlation than u_mass" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken: 2:55:36.044760\n" + ] + } + ], + "source": [ + "start = datetime.now()\n", + "c_uci = []\n", + "flags = []\n", + "for n, topic in enumerate(topics):\n", + " try:\n", + " cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary, coherence='c_uci')\n", + " c_uci.append(cm.get_coherence())\n", + " except KeyError:\n", + " flags.append(n)\n", + "end = datetime.now()\n", + "print \"Time taken: %s\" % (end - start)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken: 2:53:55.424213\n" + ] + } + ], + "source": [ + "start = datetime.now()\n", + "c_npmi = []\n", + "for n, topic in enumerate(topics):\n", + " print n\n", + " try:\n", + " cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary, coherence='c_npmi')\n", + " c_npmi.append(cm.get_coherence())\n", + " except KeyError:\n", + " pass\n", + "end = datetime.now()\n", + "print \"Time taken: %s\" % (end - start)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "final_scores = []\n", + "for n, score in enumerate(human_scores):\n", + " if n not in flags:\n", + " final_scores.append(score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One topic encountered a KeyError. 
This was because of a difference in preprocessing due to which one topic word wasn't found in the dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "99 99 99 99 99\n" + ] + } + ], + "source": [ + "print len(u_mass), len(c_v), len(c_uci), len(c_npmi), len(final_scores)\n", + "# 1 topic has word(s) that is not in the dictionary. Probably some difference\n", + "# in preprocessing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The values in the paper were:\n", + "\n", + "__`u_mass` correlation__ : 0.093\n", + "\n", + "__`c_v` correlation__ : 0.548\n", + "\n", + "__`c_uci` correlation__ : 0.473\n", + "\n", + "__`c_npmi` correlation__ : 0.438\n", + "\n", + "Our values are also very similar to these values which is good. This validates the correctness of our pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.133916622716\n", + "0.555948711374\n", + "0.414722858726\n", + "0.39935634517\n" + ] + } + ], + "source": [ + "print pearsonr(u_mass, final_scores)[0]\n", + "print pearsonr(c_v, final_scores)[0]\n", + "print pearsonr(c_uci, final_scores)[0]\n", + "print pearsonr(c_npmi, final_scores)[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Where do we go now?\n", + "\n", + "- Preprocessing can be improved for this notebook by following the exact process mentioned in [this](http://arxiv.org/pdf/1403.6397v1.pdf) paper.\n", + "- The time required for completing all of these operations can be improved a lot by cythonising the operations." 
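A side note on the KeyError handling discussed above (this sketch is not part of the original notebook): instead of skipping an entire topic when one of its words is missing from the dictionary, the out-of-vocabulary words could be filtered out beforehand. `topics` and `dictionary` refer to the objects built earlier in the notebook; note that each entry of `topics` wraps its word list in a list, matching how the notebook constructs it.

    # Illustrative only: drop topic words that did not survive preprocessing,
    # so CoherenceModel never hits a KeyError for an out-of-vocabulary word.
    filtered_topics = []
    for topic in topics:
        kept = [word for word in topic[0] if word in dictionary.token2id]
        if kept:  # skip topics that lose all of their words
            filtered_topics.append([kept])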
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 615e4efacc..69f7ba8238 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -31,8 +31,38 @@ import numpy as np +from collections import namedtuple + logger = logging.getLogger(__name__) +boolean_document_based = ['u_mass'] +sliding_window_based = ['c_v', 'c_uci', 'c_npmi'] +make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr') + +coherence_dict = { + 'u_mass': make_pipeline(segmentation.s_one_pre, + probability_estimation.p_boolean_document, + direct_confirmation_measure.log_conditional_probability, + aggregation.arithmetic_mean), + 'c_v': make_pipeline(segmentation.s_one_set, + probability_estimation.p_boolean_sliding_window, + indirect_confirmation_measure.cosine_similarity, + aggregation.arithmetic_mean), + 'c_uci': make_pipeline(segmentation.s_one_one, + probability_estimation.p_boolean_sliding_window, + direct_confirmation_measure.log_ratio_measure, + aggregation.arithmetic_mean), + 'c_npmi': make_pipeline(segmentation.s_one_one, + probability_estimation.p_boolean_sliding_window, + direct_confirmation_measure.log_ratio_measure, + aggregation.arithmetic_mean), +} + +sliding_windows_dict = { + 'c_v': 110, + 'c_uci': 10, + 'c_npmi': 10 +} class CoherenceModel(interfaces.TransformationABC): """ @@ -57,7 +87,7 @@ class CoherenceModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ - def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, coherence='c_v'): + def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, coherence='c_v', topn=10): """ Args: ---- @@ -69,11 +99,20 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= corpus : Gensim document corpus. dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed. If both are provided, dictionary will be used. + window_size : Is the size of the window to be used for coherence measures using boolean sliding window as their + probability estimator. For 'u_mass' this doesn't matter. + If left 'None' the default window sizes are used which are: + 'c_v' : 110 + 'c_uci' : 10 + 'c_npmi' : 10 coherence : Coherence measure to be used. Supported values are: 'u_mass' 'c_v' + 'c_uci' also popularly known as c_pmi + 'c_npmi' For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary. - For 'c_v' texts should be provided. Corpus is not needed. + For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. Corpus is not needed. + topn : Integer corresponding to the number of top words to be extracted from each topic. """ if model is None and topics is None: raise ValueError("One of model or topics has to be provided.") @@ -91,7 +130,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= else: self.dictionary = dictionary # Check for correct inputs for u_mass coherence measure. 
- if coherence == 'u_mass': + if coherence in boolean_document_based: if is_corpus(corpus)[0]: self.corpus = corpus elif texts is not None: @@ -100,14 +139,15 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= else: raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence) # Check for correct inputs for c_v coherence measure. - elif coherence == 'c_v': + elif coherence in sliding_window_based: + self.window_size = window_size if texts is None: raise ValueError("'texts' should be provided for %s coherence." % coherence) else: self.texts = texts else: raise ValueError("%s coherence is not currently supported." % coherence) - + self.topn = topn self.model = model if model is not None: self.topics = self._get_topics() @@ -115,41 +155,28 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= self.topics = [] for topic in topics: t_i = [] - for t in range(len(topic)): - t_i.append(dictionary.token2id[topic[t]]) + for n, _ in enumerate(topic): + t_i.append(dictionary.token2id[topic[n]]) self.topics.append(np.array(t_i)) self.coherence = coherence - # Set pipeline parameters: - if self.coherence == 'u_mass': - self.seg = segmentation.s_one_pre - self.prob = probability_estimation.p_boolean_document - self.conf = direct_confirmation_measure.log_conditional_probability - self.aggr = aggregation.arithmetic_mean - - elif self.coherence == 'c_v': - self.seg = segmentation.s_one_set - self.prob = probability_estimation.p_boolean_sliding_window - self.conf = indirect_confirmation_measure.cosine_similarity - self.aggr = aggregation.arithmetic_mean def __str__(self): - return "CoherenceModel(segmentation=%s, probability estimation=%s, confirmation measure=%s, aggregation=%s)" % ( - self.seg, self.prob, self.conf, self.aggr) + return coherence_dict[self.coherence].__str__() def _get_topics(self): """Internal helper function to return topics from a trained topic model.""" topics = [] if isinstance(self.model, LdaModel): for topic in self.model.state.get_lambda(): - bestn = argsort(topic, topn=10, reverse=True) + bestn = argsort(topic, topn=self.topn, reverse=True) topics.append(bestn) elif isinstance(self.model, LdaVowpalWabbit): for topic in self.model._get_topics(): - bestn = argsort(topic, topn=10, reverse=True) + bestn = argsort(topic, topn=self.topn, reverse=True) topics.append(bestn) elif isinstance(self.model, LdaMallet): for topic in self.model.word_topics: - bestn = argsort(topic, topn=10, reverse=True) + bestn = argsort(topic, topn=self.topn, reverse=True) topics.append(bestn) else: raise ValueError("This topic model is not currently supported. Supported topic models are" @@ -157,15 +184,26 @@ def _get_topics(self): return topics def get_coherence(self): - if self.coherence == 'u_mass': - segmented_topics = self.seg(self.topics) - per_topic_postings, num_docs = self.prob(self.corpus, segmented_topics) - confirmed_measures = self.conf(segmented_topics, per_topic_postings, num_docs) - return self.aggr(confirmed_measures) - - elif self.coherence == 'c_v': - segmented_topics = self.seg(self.topics) - per_topic_postings, num_windows = self.prob(texts=self.texts, segmented_topics=segmented_topics, - dictionary=self.dictionary, window_size=2) # FIXME : Change window size to 110 finally. - confirmed_measures = self.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows) - return self.aggr(confirmed_measures) + """ + Return coherence value based on pipeline parameters. 
+ """ + measure = coherence_dict[self.coherence] + segmented_topics = measure.seg(self.topics) + if self.coherence in boolean_document_based: + per_topic_postings, num_docs = measure.prob(self.corpus, segmented_topics) + confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_docs) + elif self.coherence in sliding_window_based: + if self.window_size is not None: + self.window_size = sliding_windows_dict[self.coherence] + per_topic_postings, num_windows = measure.prob(texts=self.texts, segmented_topics=segmented_topics, + dictionary=self.dictionary, window_size=self.window_size) + if self.coherence == 'c_v': + confirmed_measures = measure.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows) + else: + if self.coherence == 'c_npmi': + normalize = True + else: + # For c_uci + normalize = False + confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_windows, normalize=normalize) + return measure.aggr(confirmed_measures) diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 057f73d01d..3961f67180 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -35,12 +35,24 @@ ['graph', 'minors', 'survey']] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] +boolean_document_based = ['u_mass'] +sliding_window_based = ['c_v', 'c_uci', 'c_npmi'] def testfile(): # temporary data will be stored to this file return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') +def checkCoherenceMeasure(topics1, topics2, coherence): + """Check provided topic coherence algorithm on given topics""" + if coherence in boolean_document_based: + cm1 = CoherenceModel(topics=topics1, corpus=corpus, dictionary=dictionary, coherence=coherence) + cm2 = CoherenceModel(topics=topics2, corpus=corpus, dictionary=dictionary, coherence=coherence) + else: + cm1 = CoherenceModel(topics=topics1, texts=texts, dictionary=dictionary, coherence=coherence) + cm2 = CoherenceModel(topics=topics2, texts=texts, dictionary=dictionary, coherence=coherence) + return cm1.get_coherence() > cm2.get_coherence() + class TestCoherenceModel(unittest.TestCase): def setUp(self): # Suppose given below are the topics which two different LdaModels come up with. 
@@ -67,18 +79,25 @@ def setUp(self): def testUMass(self): """Test U_Mass topic coherence algorithm on given topics""" - cm1 = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass') - cm2 = CoherenceModel(topics=self.topics2, corpus=corpus, dictionary=dictionary, coherence='u_mass') - self.assertTrue(cm1.get_coherence() > cm2.get_coherence()) + self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'u_mass')) def testCv(self): """Test C_v topic coherence algorithm on given topics""" - cm1 = CoherenceModel(topics=self.topics1, texts=texts, dictionary=dictionary, coherence='c_v') - cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_v') - self.assertTrue(cm1.get_coherence() > cm2.get_coherence()) + self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_v')) + + def testCuci(self): + """Test C_uci topic coherence algorithm on given topics""" + self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_uci')) + + def testCnpmi(self): + """Test C_npmi topic coherence algorithm on given topics""" + self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_npmi')) def testUMassLdaModel(self): """Perform sanity check to see if u_mass coherence works with LDA Model""" + # Note that this is just a sanity check because LDA does not guarantee a better coherence + # value on the topics if iterations are increased. This can be seen here: + # https://gist.github.com/dsquareindia/60fd9ab65b673711c3fa00509287ddde try: cm = CoherenceModel(model=self.ldamodel, corpus=corpus, coherence='u_mass') except: @@ -91,6 +110,20 @@ def testCvLdaModel(self): except: raise + def testCuciLdaModel(self): + """Perform sanity check to see if c_uci coherence works with LDA Model""" + try: + cm = CoherenceModel(model=self.ldamodel, texts=texts, coherence='c_uci') + except: + raise + + def testCnpmiLdaModel(self): + """Perform sanity check to see if c_npmi coherence works with LDA Model""" + try: + cm = CoherenceModel(model=self.ldamodel, texts=texts, coherence='c_npmi') + except: + raise + def testUMassMalletModel(self): """Perform sanity check to see if u_mass coherence works with LDA Mallet gensim wrapper""" if not self.mallet_path: @@ -109,6 +142,24 @@ def testCvMalletModel(self): except: raise + def testCuciMalletModel(self): + """Perform sanity check to see if c_uci coherence works with LDA Mallet gensim wrapper""" + if not self.mallet_path: + return + try: + cm = CoherenceModel(model=self.malletmodel, texts=texts, coherence='c_uci') + except: + raise + + def testCnpmiMalletModel(self): + """Perform sanity check to see if c_npmi coherence works with LDA Mallet gensim wrapper""" + if not self.mallet_path: + return + try: + cm = CoherenceModel(model=self.malletmodel, texts=texts, coherence='c_npmi') + except: + raise + def testUMassVWModel(self): """Perform sanity check to see if u_mass coherence works with LDA VW gensim wrapper""" if not self.vw_path: @@ -127,6 +178,24 @@ def testCvVWModel(self): except: raise + def testCuciVWModel(self): + """Perform sanity check to see if c_uci coherence works with LDA VW gensim wrapper""" + if not self.vw_path: + return + try: + cm = CoherenceModel(model=self.vwmodel, texts=texts, coherence='c_uci') + except: + raise + + def testCnpmiVWModel(self): + """Perform sanity check to see if c_npmi coherence works with LDA VW gensim wrapper""" + if not self.vw_path: + return + try: + cm = CoherenceModel(model=self.vwmodel, texts=texts, coherence='c_npmi') + except: + raise 
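Outside the test suite, the parameters added by this change are exercised roughly as follows. This is a minimal illustrative sketch, not part of the patch; the toy texts and topic word lists are made up in the spirit of the fixture used by these tests.

    from gensim.corpora.dictionary import Dictionary
    from gensim.models import CoherenceModel

    texts = [['human', 'interface', 'computer'],
             ['survey', 'user', 'computer', 'system', 'response', 'time'],
             ['eps', 'user', 'interface', 'system'],
             ['graph', 'minors', 'trees'],
             ['graph', 'minors', 'survey']]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    topics = [['human', 'computer', 'interface'], ['graph', 'minors', 'trees']]

    # Boolean-document measure: works from the bag-of-words corpus.
    cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    u_mass = cm.get_coherence()

    # Sliding-window measures: need the tokenized texts. Leaving window_size=None
    # falls back to the per-measure defaults (c_v: 110, c_uci/c_npmi: 10); topn only
    # matters when a trained model is passed via model= instead of explicit topics.
    cm = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary,
                        coherence='c_npmi', window_size=10)
    c_npmi = cm.get_coherence()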
+ def testErrors(self): """Test if errors are raised on bad input""" # not providing dictionary diff --git a/gensim/test/test_direct_confirmation.py b/gensim/test/test_direct_confirmation.py index c3c57dd0fd..cb35f0acc4 100644 --- a/gensim/test/test_direct_confirmation.py +++ b/gensim/test/test_direct_confirmation.py @@ -37,9 +37,9 @@ def testLogRatioMeasure(self): def testNormalizedLogRatioMeasure(self): """Test normalized_log_ratio_measure()""" - obtained = direct_confirmation_measure.normalized_log_ratio_measure(self.segmentation, self.posting_list, self.num_docs)[0] - # Answer should be ~ -0.182321557 / ln(1 / 5) = 0.113282753 - expected = 0.113282753 + obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.posting_list, self.num_docs, normalize=True)[0] + # Answer should be ~ -0.182321557 / -ln(1 / 5) = -0.113282753 + expected = -0.113282753 self.assertAlmostEqual(obtained, expected) if __name__ == '__main__': diff --git a/gensim/test/test_segmentation.py b/gensim/test/test_segmentation.py index d44fce350d..c437ca8fa8 100644 --- a/gensim/test/test_segmentation.py +++ b/gensim/test/test_segmentation.py @@ -30,6 +30,14 @@ def testSOnePre(self): [(2, 5), (7, 5), (7, 2)]] self.assertTrue(np.allclose(actual, expected)) + def testSOneOne(self): + """Test s_one_one segmentation.""" + actual = segmentation.s_one_one(self.topics) + expected = [[(9, 4), (9, 6), (4, 9), (4, 6), (6, 9), (6, 4)], + [(9, 10), (9, 7), (10, 9), (10, 7), (7, 9), (7, 10)], + [(5, 2), (5, 7), (2, 5), (2, 7), (7, 5), (7, 2)]] + self.assertTrue(np.allclose(actual, expected)) + def testSOneSet(self): """Test s_one_set segmentation.""" actual = segmentation.s_one_set(self.topics) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 9a783a472a..83227822e9 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -42,12 +42,18 @@ def log_conditional_probability(segmented_topics, per_topic_postings, num_docs): return m_lc -def log_ratio_measure(segmented_topics, per_topic_postings, num_docs): +def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize=False): """ - Popularly known as PMI. - This function calculates the log-ratio-measure which is used by - coherence measures such as c_v. - This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))] + If normalize=False: + Popularly known as PMI. + This function calculates the log-ratio-measure which is used by + coherence measures such as c_v. + This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))] + + If normalize=True: + This function calculates the normalized-log-ratio-measure, popularly knowns as + NPMI which is used by coherence measures such as c_v. 
+ This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e] Args: ---- @@ -65,38 +71,16 @@ def log_ratio_measure(segmented_topics, per_topic_postings, num_docs): w_prime_docs = per_topic_postings[w_prime] w_star_docs = per_topic_postings[w_star] co_docs = w_prime_docs.intersection(w_star_docs) - numerator = (len(co_docs) / float(num_docs)) + EPSILON - denominator = (len(w_prime_docs) / float(num_docs)) * (len(w_star_docs) / float(num_docs)) - m_lr_i = np.log(numerator / denominator) + if normalize: + # For normalized log ratio measure + numerator = log_ratio_measure([[(w_prime, w_star)]], per_topic_postings, num_docs)[0] + co_doc_prob = len(co_docs) / float(num_docs) + m_lr_i = numerator / (-np.log(co_doc_prob + EPSILON)) + else: + # For log ratio measure without normalization + numerator = (len(co_docs) / float(num_docs)) + EPSILON + denominator = (len(w_prime_docs) / float(num_docs)) * (len(w_star_docs) / float(num_docs)) + m_lr_i = np.log(numerator / denominator) m_lr.append(m_lr_i) return m_lr - -def normalized_log_ratio_measure(segmented_topics, per_topic_postings, num_docs): - """ - This function calculates the normalized-log-ratio-measure, popularly knowns as - NPMI which is used by coherence measures such as c_v. - This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e] - - Args: - ---- - segmented topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. - per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics - num_docs : Total number of documents in corpus. Used for calculating probability. - - Returns: - ------- - m_nlr : List of log ratio measures on each set in segmented topics. - """ - m_nlr = [] - for s_i in segmented_topics: - for w_prime, w_star in s_i: - numerator = log_ratio_measure([[(w_prime, w_star)]], per_topic_postings, num_docs)[0] - w_prime_docs = per_topic_postings[w_prime] - w_star_docs = per_topic_postings[w_star] - co_docs = w_prime_docs.intersection(w_star_docs) - co_doc_prob = len(co_docs) / float(num_docs) - m_nlr_i = numerator / np.log(co_doc_prob + EPSILON) - m_nlr.append(m_nlr_i) - - return m_nlr diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 1af0dae8e8..c68206a372 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -5,15 +5,20 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """ -This module contains functions to compute confirmation on a pair of words or word subsets. +This module contains functions to compute confirmation on a pair of words or word subsets. The advantage of indirect +confirmation measure is that it computes similarity of words in W' and W* with respect to direct confirmations to all words. +Eg. Suppose x and z are both competing brands of cars, which semantically support each other. However, both brands are +seldom mentioned together in documents in the reference corpus. But their confirmations to other words like “road” +or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure. Thus, indirect confirmation +measures may capture semantic support that direct measures would miss. 
The formula used to compute indirect confirmation measure is: - _ _ -m_sim(m, gamma)(W', W*) = s_sim(V_m,gamma(W'), V_m,gamma(W*)) + +m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) where s_sim can be cosine, dice or jaccard similarity and -_ -V_m,gamma(W') = {sigma(w' belonging to W') m(w_i, w_j) ^ gamma} where j = 1, ...., |W| + +\vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} Here 'm' is the direct confirmation measure used. """ @@ -26,6 +31,23 @@ logger = logging.getLogger(__name__) + +def _present(w_prime_star, w, w_backtrack): + """ + Internal helper function to return index of (w_prime_star, w) in w_backtrack. + Return -1 if not present. + """ + index = -1 + flag = 0 + for arr in w_backtrack: + index += 1 + if np.all(w_prime_star == arr[0]) and np.all(w == arr[1]): + flag += 1 + break + if not flag: + return -1 + return index + def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_docs): """ Internal helper function to return context vectors for segmentations. @@ -35,7 +57,7 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_doc for w_j in w: for w_i in w_prime: if (w_i, w_j) not in backtrack: - backtrack[(w_i, w_j)] = measure([[(w_i, w_j)]], per_topic_postings, num_docs)[0] + backtrack[(w_i, w_j)] = measure[0]([[(w_i, w_j)]], per_topic_postings, num_docs, measure[1])[0] if w_j not in context_vectors: context_vectors[w_j] = backtrack[(w_i, w_j)] ** gamma else: @@ -43,7 +65,7 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_doc else: for w_j in w: if (w_prime, w_j) not in backtrack: - backtrack[(w_prime, w_j)] = measure([[(w_prime, w_j)]], per_topic_postings, num_docs)[0] + backtrack[(w_prime, w_j)] = measure[0]([[(w_prime, w_j)]], per_topic_postings, num_docs, measure[1])[0] context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma return (context_vectors, backtrack) @@ -53,7 +75,11 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam _ _ _ _ u = V(W') and w = V(W*) for the word sets of a pair S_i = (W', W*) indirect _ _ - cosine measure is computed as the cosine similarity between u and w. + cosine measure is computed as the cosine similarity between u and w. The formula used is: + + m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) + + where each vector \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} Args: ---- @@ -69,17 +95,45 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam s_cos_sim : array of cosine similarity of the context vectors for each segmentation """ if measure == 'nlr': - measure = direct_confirmation_measure.normalized_log_ratio_measure + # make normalized log ratio measure tuple + measure = (direct_confirmation_measure.log_ratio_measure, True) else: raise ValueError("The direct confirmation measure you entered is not currently supported.") - backtrack = {} + backtrack = {} # Backtracking dictionary for storing measure values of topic id tuples eg. (1, 2). + """ + For backtracking context vectors, we will create a list called w_backtrack to store (w_prime, w) or + (w_star, w) tuples and a corresponding list context_vector_backtrack which will create a + mapping of (w_prime or w_star, w) ---> context_vector. 
+ """ + w_backtrack = [] + context_vector_backtrack = [] s_cos_sim = [] for top_words, s_i in zip(topics, segmented_topics): for w_prime, w_star in s_i: - w_prime_context_vectors, backtrack_i = _make_seg(w_prime, top_words, per_topic_postings, measure, gamma, backtrack, num_docs) - backtrack.update(backtrack_i) - w_star_context_vectors, backtrack_i = _make_seg(w_star, top_words, per_topic_postings, measure, gamma, backtrack, num_docs) - backtrack.update(backtrack_i) + # Step 1. Check if (w_prime, top_words) tuple in w_backtrack. + # Step 2. If yes, return corresponding context vector + w_prime_index = _present(w_prime, top_words, w_backtrack) + if w_backtrack and w_prime_index != -1: + w_prime_context_vectors = context_vector_backtrack[w_prime_index] + else: + w_prime_context_vectors, backtrack_i = _make_seg(w_prime, top_words, per_topic_postings, measure, gamma, backtrack, num_docs) + backtrack.update(backtrack_i) + # Update backtracking lists + w_backtrack.append((w_prime, top_words)) + context_vector_backtrack.append(w_prime_context_vectors) + + # Step 1. Check if (w_star, top_words) tuple in w_backtrack. + # Step 2. If yes, check if corresponding w is the same + w_star_index = _present(w_star, top_words, w_backtrack) + if w_backtrack and w_star_index != -1: + w_star_context_vectors = context_vector_backtrack[w_star_index] + else: + w_star_context_vectors, backtrack_i = _make_seg(w_star, top_words, per_topic_postings, measure, gamma, backtrack, num_docs) + backtrack.update(backtrack_i) + # Update all backtracking lists + w_backtrack.append((w_star, top_words)) + context_vector_backtrack.append(w_star_context_vectors) + s_cos_sim_i = cossim(w_prime_context_vectors.items(), w_star_context_vectors.items()) s_cos_sim.append(s_cos_sim_i) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index e7e931b9ac..a76f40db4c 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -78,27 +78,26 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): window_id[0] : Total no of windows """ top_ids = _ret_top_ids(segmented_topics) - window_id = [0] # Each window assigned a window id. + window_id = 0 # Each window assigned a window id. per_topic_postings = {} token2id_dict = dictionary.token2id - def add_topic_posting(): + def add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict): for word in window: word_id = token2id_dict[word] if word_id in top_ids: if word_id in per_topic_postings: - per_topic_postings[word_id].add(window_id[0]) + per_topic_postings[word_id].add(window_id) else: - per_topic_postings[word_id] = set([window_id[0]]) - window_id[0] += 1 + per_topic_postings[word_id] = set([window_id]) + window_id += 1 + return (window_id, per_topic_postings) # Apply boolean sliding window to each document in texts. 
for document in texts: it = iter(document) window = tuple(islice(it, window_size)) - add_topic_posting() - if len(window) <= window_size: - pass # FIXME : Handle case when window size is bigger than length of document + window_id, per_topic_postings = add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict) for elem in it: window = window[1:] + (elem,) - add_topic_posting() + window_id, per_topic_postings = add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict) - return (per_topic_postings, window_id[0]) + return per_topic_postings, window_id diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index de5be5ea91..9a2a58b060 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -36,14 +36,46 @@ def s_one_pre(topics): for top_words in topics: s_one_pre_t = [] - for w_prime in top_words[1:]: - w_prime_index = np.where(top_words == w_prime)[0] # To get index of w_star in top_words - for w_star in top_words[:w_prime_index]: + for w_prime_index, w_prime in enumerate(top_words[1:]): + for w_star in top_words[:w_prime_index + 1]: s_one_pre_t.append((w_prime, w_star)) s_one_pre.append(s_one_pre_t) return s_one_pre +def s_one_one(topics): + """ + This function performs s_one_one segmentation on a list of topics. + s_one_one segmentation is defined as: s_one_one = {(W', W*) | W' = {w_i}; + W* = {w_j}; w_i, w_j belong to W; i != j} + Example: + + >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] + >>> s_one_one(topics) + [[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)], [(4, 5), (4, 6), (5, 4), (5, 6), (6, 4), (6, 5)]] + + Args: + ---- + topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + + Returns: + ------- + s_one_one : list of list of (W', W*) tuples for all unique topic ids + """ + s_one_one = [] + + for top_words in topics: + s_one_one_t = [] + for w_prime_index, w_prime in enumerate(top_words): + for w_star_index, w_star in enumerate(top_words): + if w_prime_index == w_star_index: + continue + else: + s_one_one_t.append((w_prime, w_star)) + s_one_one.append(s_one_one_t) + + return s_one_one + def s_one_set(topics): """ This function performs s_one_set segmentation on a list of topics.
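To see how the new s_one_one segmentation feeds the reworked log_ratio_measure, here is a small standalone sketch. It is not part of the patch; the posting lists and document count are invented for illustration, and only functions touched by this change are used.

    import numpy as np
    from gensim.topic_coherence import segmentation, direct_confirmation_measure

    topics = [np.array([9, 4, 6])]

    # s_one_one pairs every word of a topic with every other word of the same topic.
    segmented = segmentation.s_one_one(topics)
    # -> [[(9, 4), (9, 6), (4, 9), (4, 6), (6, 9), (6, 4)]]

    # Invented posting lists: word id -> set of documents/windows containing that word.
    per_topic_postings = {9: set([1, 2, 3]), 4: set([2, 3]), 6: set([3])}
    num_docs = 5

    # normalize=False yields PMI (used by c_uci), normalize=True yields NPMI (used by c_npmi).
    pmi = direct_confirmation_measure.log_ratio_measure(segmented, per_topic_postings, num_docs)
    npmi = direct_confirmation_measure.log_ratio_measure(segmented, per_topic_postings, num_docs, normalize=True)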