From c8b0e7b7fea6efe06306e7a32852a193ad531774 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Sat, 17 Jun 2017 20:23:06 -0400 Subject: [PATCH 1/4] update topic coherence tutorial notebook --- docs/notebooks/topic_coherence_tutorial.ipynb | 492 +++++------------- 1 file changed, 136 insertions(+), 356 deletions(-) diff --git a/docs/notebooks/topic_coherence_tutorial.ipynb b/docs/notebooks/topic_coherence_tutorial.ipynb index ea2cf4ef7e..33c57e728b 100644 --- a/docs/notebooks/topic_coherence_tutorial.ipynb +++ b/docs/notebooks/topic_coherence_tutorial.ipynb @@ -4,14 +4,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Demonstration of the topic coherence pipeline in Gensim" + "# Demonstration of the topic coherence pipeline in Gensim" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Introduction" + "## Introduction" ] }, { @@ -23,49 +23,45 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/vru959/anaconda2/lib/python2.7/site-packages/scipy/sparse/sparsetools.py:20: DeprecationWarning: `scipy.sparse.sparsetools` is deprecated!\n", + "scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.\n", + " _deprecated()\n" + ] + } + ], "source": [ - "import numpy as np\n", + "from __future__ import print_function\n", + "\n", + "import os\n", "import logging\n", + "import json\n", + "import warnings\n", + "\n", "try:\n", " import pyLDAvis.gensim\n", + " CAN_VISUALIZE = True\n", + " pyLDAvis.enable_notebook()\n", + " from IPython.display import display\n", "except ImportError:\n", " ValueError(\"SKIP: please install pyLDAvis\")\n", - " \n", - "import json\n", - "import warnings\n", - "warnings.filterwarnings('ignore') # To ignore all warnings that arise here to enhance clarity\n", + " CAN_VISUALIZE = False\n", "\n", - "from gensim.models.coherencemodel import CoherenceModel\n", - "from gensim.models.ldamodel import LdaModel\n", - "from gensim.models.hdpmodel import HdpModel\n", + "import numpy as np\n", + "\n", + "from gensim.models import CoherenceModel, LdaModel, HdpModel\n", "from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet\n", - "from gensim.corpora.dictionary import Dictionary\n", - "from numpy import array" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set up logging" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "logger = logging.getLogger()\n", - "logger.setLevel(logging.DEBUG)\n", - "logging.debug(\"test\")" + "from gensim.corpora import Dictionary\n", + "\n", + "warnings.filterwarnings('ignore') # To ignore all warnings that arise here to enhance clarity" ] }, { @@ -84,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -103,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "collapsed": false }, @@ -129,9 +125,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -148,23 +144,13 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ - "goodcm = CoherenceModel(model=goodLdaModel, corpus=corpus, dictionary=dictionary, 
coherence='u_mass')" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ + "goodcm = CoherenceModel(model=goodLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass')\n", "badcm = CoherenceModel(model=badLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass')" ] }, @@ -184,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "metadata": { "collapsed": false }, @@ -193,12 +179,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "CoherenceModel(segmentation=, probability estimation=, confirmation measure=, aggregation=)\n" + "Coherence_Measure(seg=, prob=, conf=, aggr=)\n" ] } ], "source": [ - "print goodcm" + "print(goodcm)" ] }, { @@ -232,18 +218,7 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "pyLDAvis.enable_notebook()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, + "execution_count": 7, "metadata": { "collapsed": false }, @@ -255,10 +230,10 @@ "\n", "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ - "PreparedData(topic_coordinates= Freq cluster topics x y\n", - "topic \n", - "1 60.467874 1 1 -0.02178 -0.0\n", - "0 39.532126 1 2 0.02178 -0.0, topic_info= Category Freq Term Total loglift logprob\n", - "term \n", - "1 Default 2.000000 graph 2.000000 12.0000 12.0000\n", - "6 Default 2.000000 survey 2.000000 11.0000 11.0000\n", - "3 Default 2.000000 trees 2.000000 10.0000 10.0000\n", - "0 Default 2.000000 minors 2.000000 9.0000 9.0000\n", - "5 Default 2.000000 computer 2.000000 8.0000 8.0000\n", - "4 Default 2.000000 eps 2.000000 7.0000 7.0000\n", - "9 Default 2.000000 time 2.000000 6.0000 6.0000\n", - "11 Default 2.000000 response 2.000000 5.0000 5.0000\n", - "2 Default 3.000000 system 3.000000 4.0000 4.0000\n", - "7 Default 2.000000 user 2.000000 3.0000 3.0000\n", - "8 Default 2.000000 human 2.000000 2.0000 2.0000\n", - "10 Default 2.000000 interface 2.000000 1.0000 1.0000\n", - "4 Topic1 1.754656 eps 2.192159 0.2804 -2.3020\n", - "2 Topic1 2.765990 system 3.630010 0.2312 -1.8468\n", - "7 Topic1 2.132646 user 2.892076 0.1984 -2.1069\n", - "10 Topic1 1.511120 interface 2.155900 0.1477 -2.4514\n", - "8 Topic1 1.448214 human 2.146535 0.1095 -2.4939\n", - "11 Topic1 1.300499 response 2.124542 0.0122 -2.6015\n", - "9 Topic1 1.292999 time 2.123425 0.0070 -2.6073\n", - "3 Topic1 1.420436 trees 2.786037 -0.1706 -2.5133\n", - "5 Topic1 1.064564 computer 2.089414 -0.1713 -2.8017\n", - "0 Topic1 1.037844 minors 2.085436 -0.1948 -2.8271\n", - "6 Topic1 0.818827 survey 2.052828 -0.4160 -3.0641\n", - "1 Topic1 0.987888 graph 2.721637 -0.5104 -2.8764\n", - "1 Topic2 1.733749 graph 2.721637 0.4771 -1.8890\n", - "6 Topic2 1.234000 survey 2.052828 0.4191 -2.2290\n", - "0 Topic2 1.047592 minors 2.085436 0.2396 -2.3927\n", - "5 Topic2 1.024850 computer 2.089414 0.2157 -2.4147\n", - "3 Topic2 1.365602 trees 2.786037 0.2150 -2.1276\n", - "9 Topic2 0.830426 time 2.123425 -0.0108 -2.6251\n", - "11 Topic2 0.824043 response 2.124542 -0.0190 -2.6328\n", - "8 Topic2 0.698320 human 2.146535 -0.1949 -2.7983\n", - "10 Topic2 0.644780 interface 2.155900 -0.2790 -2.8781\n", - "7 Topic2 0.759429 user 2.892076 -0.4091 -2.7144\n", - "2 Topic2 0.864020 system 3.630010 -0.5073 -2.5854\n", - "4 Topic2 0.437504 eps 2.192159 -0.6835 -3.2659, token_table= Topic Freq Term\n", - "term \n", - "5 1 0.478603 computer\n", - "5 2 0.478603 computer\n", - "4 1 0.912342 eps\n", - "1 1 0.367426 graph\n", - "1 2 0.734852 graph\n", - "8 1 0.465867 human\n", - "8 2 0.465867 human\n", - "10 1 0.927687 interface\n", - "10 2 0.463843 interface\n", - "0 1 0.479516 minors\n", - "0 2 0.479516 minors\n", - "11 1 0.470690 response\n", - "11 2 0.470690 response\n", - "6 1 0.487133 survey\n", - "6 2 0.487133 survey\n", - "2 1 0.826444 system\n", - "2 2 0.275481 system\n", - "9 1 0.470937 time\n", - "9 2 0.470937 time\n", - "3 1 0.358933 trees\n", - "3 2 0.358933 trees\n", - "7 1 0.691545 user\n", - "7 2 0.345772 user, R=12, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[2, 1])" + "" ] }, - "execution_count": 18, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "pyLDAvis.gensim.prepare(goodLdaModel, corpus, dictionary)" + "if CAN_VISUALIZE:\n", + " prepared = pyLDAvis.gensim.prepare(goodLdaModel, corpus, dictionary)\n", + " display(pyLDAvis.display(prepared))" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "metadata": { "collapsed": false }, @@ -384,10 +296,10 @@ "\n", "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ - "PreparedData(topic_coordinates= Freq cluster topics x y\n", - "topic \n", - "1 52.514671 1 1 -0.002455 -0.0\n", - "0 47.485329 1 2 0.002455 -0.0, topic_info= Category Freq Term Total loglift logprob\n", - "term \n", - "8 Default 2.000000 human 2.000000 12.0000 12.0000\n", - "4 Default 2.000000 eps 2.000000 11.0000 11.0000\n", - "1 Default 2.000000 graph 2.000000 10.0000 10.0000\n", - "9 Default 2.000000 time 2.000000 9.0000 9.0000\n", - "5 Default 2.000000 computer 2.000000 8.0000 8.0000\n", - "3 Default 2.000000 trees 2.000000 7.0000 7.0000\n", - "6 Default 2.000000 survey 2.000000 6.0000 6.0000\n", - "10 Default 2.000000 interface 2.000000 5.0000 5.0000\n", - "0 Default 2.000000 minors 2.000000 4.0000 4.0000\n", - "2 Default 3.000000 system 3.000000 3.0000 3.0000\n", - "7 Default 2.000000 user 2.000000 2.0000 2.0000\n", - "11 Default 2.000000 response 2.000000 1.0000 1.0000\n", - "9 Topic1 1.315907 time 2.123095 0.1657 -2.4487\n", - "6 Topic1 1.228044 survey 2.122596 0.0969 -2.5178\n", - "0 Topic1 1.189171 minors 2.122376 0.0648 -2.5500\n", - "11 Topic1 1.156021 response 2.122188 0.0366 -2.5782\n", - "2 Topic1 1.926266 system 3.536977 0.0364 -2.0676\n", - "7 Topic1 1.540934 user 2.829581 0.0363 -2.2908\n", - "10 Topic1 1.134199 interface 2.122064 0.0176 -2.5973\n", - "3 Topic1 1.477609 trees 2.829222 -0.0055 -2.3328\n", - "5 Topic1 1.032319 computer 2.121486 -0.0762 -2.6914\n", - "1 Topic1 1.347614 graph 2.828485 -0.0973 -2.4249\n", - "4 Topic1 0.977820 eps 2.121177 -0.1303 -2.7456\n", - "8 Topic1 0.903351 human 2.120755 -0.2093 -2.8249\n", - "8 Topic2 1.217404 human 2.120755 0.1897 -2.4258\n", - "4 Topic2 1.143357 eps 2.121177 0.1267 -2.4886\n", - "1 Topic2 1.480871 graph 2.828485 0.0976 -2.2299\n", - "5 Topic2 1.089167 computer 2.121486 0.0780 -2.5371\n", - "3 Topic2 1.351613 trees 2.829222 0.0060 -2.3212\n", - "10 Topic2 0.987865 interface 2.122064 -0.0198 -2.6348\n", - "7 Topic2 1.288647 user 2.829581 -0.0418 -2.3690\n", - "2 Topic2 1.610711 system 3.536977 -0.0418 -2.1459\n", - "11 Topic2 0.966167 response 2.122188 -0.0421 -2.6570\n", - "0 Topic2 0.933205 minors 2.122376 -0.0769 -2.6917\n", - "6 Topic2 0.894553 survey 2.122596 -0.1193 -2.7340\n", - "9 Topic2 0.807188 time 2.123095 -0.2223 -2.8367, token_table= Topic Freq Term\n", - "term \n", - "5 1 0.471368 computer\n", - "5 2 0.471368 computer\n", - "4 1 0.471436 eps\n", - "4 2 0.471436 eps\n", - "1 1 0.353546 graph\n", - "1 2 0.353546 graph\n", - "8 1 0.471530 human\n", - "8 2 0.471530 human\n", - "10 1 0.471239 interface\n", - "10 2 0.471239 interface\n", - "0 1 0.471170 minors\n", - "0 2 0.471170 minors\n", - "11 1 0.471212 response\n", - "11 2 0.471212 response\n", - "6 1 0.471121 survey\n", - "6 2 0.471121 survey\n", - "2 1 0.565455 system\n", - "2 2 0.565455 system\n", - "9 1 0.471011 time\n", - "9 2 0.471011 time\n", - "3 1 0.353454 trees\n", - "3 2 0.353454 trees\n", - "7 1 0.706818 user\n", - "7 2 0.353409 user, R=12, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[2, 1])" + "" ] }, - "execution_count": 19, "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pyLDAvis.gensim.prepare(badLdaModel, corpus, dictionary)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-14.0842451581\n" - ] + "output_type": "display_data" } ], "source": [ - "print goodcm.get_coherence()" + "if CAN_VISUALIZE:\n", + " prepared = 
pyLDAvis.gensim.prepare(badLdaModel, corpus, dictionary)\n", + " display(pyLDAvis.display(prepared))" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 9, "metadata": { "collapsed": false }, @@ -530,12 +359,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "-14.4434307511\n" + "-13.8029561191\n", + "-14.1531313765\n" ] } ], "source": [ - "print badcm.get_coherence()" + "print(goodcm.get_coherence())\n", + "print(badcm.get_coherence())" ] }, { @@ -547,23 +378,13 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "goodcm = CoherenceModel(model=goodLdaModel, texts=texts, dictionary=dictionary, coherence='c_v')" - ] - }, - { - "cell_type": "code", - "execution_count": 26, + "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ + "goodcm = CoherenceModel(model=goodLdaModel, texts=texts, dictionary=dictionary, coherence='c_v')\n", "badcm = CoherenceModel(model=badLdaModel, texts=texts, dictionary=dictionary, coherence='c_v')" ] }, @@ -576,7 +397,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 11, "metadata": { "collapsed": false }, @@ -585,12 +406,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "CoherenceModel(segmentation=, probability estimation=, confirmation measure=, aggregation=)\n" + "Coherence_Measure(seg=, prob=, conf=, aggr=)\n" ] } ], "source": [ - "print goodcm" + "print(goodcm)" ] }, { @@ -602,26 +423,7 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.552164532134\n" - ] - } - ], - "source": [ - "print goodcm.get_coherence()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, + "execution_count": 12, "metadata": { "collapsed": false }, @@ -630,12 +432,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.5269189184\n" + "0.379532110157\n", + "0.385963126348\n" ] } ], "source": [ - "print badcm.get_coherence()" + "print(goodcm.get_coherence())\n", + "print(badcm.get_coherence())" ] }, { @@ -654,31 +458,35 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 13, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ - "model1 = LdaVowpalWabbit('/home/devashish/vw-8', corpus=corpus, num_topics=2, id2word=dictionary, passes=50)\n", - "model2 = LdaVowpalWabbit('/home/devashish/vw-8', corpus=corpus, num_topics=2, id2word=dictionary, passes=1)" + "# Replace with path to your Vowpal Wabbit installation\n", + "vw_path = '/usr/local/bin/vw'\n", + "\n", + "# Replace with path to your Mallet installation\n", + "home = os.path.expanduser('~')\n", + "mallet_path = os.path.join(home, 'mallet-2.0.8', 'bin', 'mallet')" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "cm1 = CoherenceModel(model=model1, corpus=corpus, coherence='u_mass')\n", - "cm2 = CoherenceModel(model=model2, corpus=corpus, coherence='u_mass')" + "model1 = LdaVowpalWabbit(vw_path, corpus=corpus, num_topics=2, id2word=dictionary, passes=50)\n", + "model2 = LdaVowpalWabbit(vw_path, corpus=corpus, num_topics=2, id2word=dictionary, passes=1)" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 15, "metadata": { "collapsed": false }, @@ -687,43 +495,33 @@ "name": "stdout", "output_type": "stream", "text": [ - 
"-14.075813889\n", - "-15.1740896045\n" + "-13.226132904\n", + "-14.3236789858\n" ] } ], "source": [ - "print cm1.get_coherence()\n", - "print cm2.get_coherence()" + "cm1 = CoherenceModel(model=model1, corpus=corpus, coherence='u_mass')\n", + "cm2 = CoherenceModel(model=model2, corpus=corpus, coherence='u_mass')\n", + "print(cm1.get_coherence())\n", + "print(cm2.get_coherence())" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ - "model1 = LdaMallet('/home/devashish/mallet-2.0.8RC3/bin/mallet',corpus=corpus , num_topics=2, id2word=dictionary, iterations=50)\n", - "model2 = LdaMallet('/home/devashish/mallet-2.0.8RC3/bin/mallet',corpus=corpus , num_topics=2, id2word=dictionary, iterations=1)" + "model1 = LdaMallet(mallet_path, corpus=corpus, num_topics=2, id2word=dictionary, iterations=50)\n", + "model2 = LdaMallet(mallet_path, corpus=corpus, num_topics=2, id2word=dictionary, iterations=1)" ] }, { "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "cm1 = CoherenceModel(model=model1, texts=texts, coherence='c_v')\n", - "cm2 = CoherenceModel(model=model2, texts=texts, coherence='c_v')" - ] - }, - { - "cell_type": "code", - "execution_count": 22, + "execution_count": 17, "metadata": { "collapsed": false }, @@ -732,14 +530,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.581114877802\n", - "0.549865328265\n" + "0.37605697523\n", + "0.393714418809\n" ] } ], "source": [ - "print cm1.get_coherence()\n", - "print cm2.get_coherence()" + "cm1 = CoherenceModel(model=model1, texts=texts, coherence='c_v')\n", + "cm2 = CoherenceModel(model=model2, texts=texts, coherence='c_v')\n", + "print(cm1.get_coherence())\n", + "print(cm2.get_coherence())" ] }, { @@ -752,9 +552,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 18, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -763,22 +563,7 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# To get the topic words from the model\n", - "topics = []\n", - "for topic_id, topic in hm.show_topics(num_topics=10, formatted=False):\n", - " topic = [word for word, _ in topic]\n", - " topics.append(topic)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, + "execution_count": 19, "metadata": { "collapsed": false }, @@ -787,55 +572,48 @@ "data": { "text/plain": [ "[[u'minors',\n", - " u'system',\n", - " u'graph',\n", - " u'human',\n", - " u'interface',\n", - " u'eps',\n", - " u'trees',\n", - " u'computer',\n", " u'user',\n", - " u'response',\n", + " u'interface',\n", + " u'system',\n", " u'survey',\n", - " u'time'],\n", - " [u'minors',\n", + " u'response',\n", " u'trees',\n", + " u'computer',\n", + " u'human',\n", " u'time',\n", - " u'interface',\n", + " u'graph',\n", + " u'eps'],\n", + " [u'response',\n", + " u'trees',\n", + " u'human',\n", + " u'graph',\n", " u'user',\n", + " u'computer',\n", + " u'interface',\n", + " u'eps',\n", " u'survey',\n", " u'system',\n", - " u'response',\n", - " u'human',\n", - " u'computer',\n", - " u'graph',\n", - " u'eps']]" + " u'minors',\n", + " u'time']]" ] }, - "execution_count": 9, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# To get the topic words from the model\n", + "topics = []\n", + "for topic_id, topic in hm.show_topics(num_topics=10, formatted=False):\n", + " 
topic = [word for word, _ in topic]\n", + " topics.append(topic)\n", "topics[:2]" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Initialize CoherenceModel using `topics` parameter\n", - "cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, + "execution_count": 20, "metadata": { "collapsed": false }, @@ -843,15 +621,17 @@ { "data": { "text/plain": [ - "-14.640667699204982" + "-14.611179327706207" ] }, - "execution_count": 11, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# Initialize CoherenceModel using `topics` parameter\n", + "cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')\n", "cm.get_coherence()" ] }, @@ -886,7 +666,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.11" + "version": "2.7.13" } }, "nbformat": 4, From 96493eae1fdc3d8a70da69b1a32f33624bc111b7 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Sat, 17 Jun 2017 22:51:20 -0400 Subject: [PATCH 2/4] update topic coherence movies benchmark notebook to reflect the recent coherence optimizations --- docs/notebooks/topic_coherence-movies.ipynb | 344 ++++++++++---------- 1 file changed, 169 insertions(+), 175 deletions(-) diff --git a/docs/notebooks/topic_coherence-movies.ipynb b/docs/notebooks/topic_coherence-movies.ipynb index 885d4e8e68..25978147a4 100644 --- a/docs/notebooks/topic_coherence-movies.ipynb +++ b/docs/notebooks/topic_coherence-movies.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Benchmark testing of coherence pipeline on Movies dataset:\n", + "# Benchmark testing of coherence pipeline on Movies dataset\n", "## How to find how well coherence measure matches your manual annotators" ] }, @@ -15,29 +15,22 @@ "__Introduction__: For the validation of any model adapted from a paper, it is of utmost importance that the results of benchmark testing on the datasets listed in the paper match between the actual implementation (palmetto) and gensim. This coherence pipeline has been implemented from the work done by Roeder et al. The paper can be found [here](http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf).\n", "\n", "__Approach__ :\n", - "1. We will use the Movies dataset first. This dataset along with the topics on which the coherence is calculated and the gold (human) ratings on these topics can be found [here](http://139.18.2.164/mroeder/palmetto/datasets/).\n", + "1. In this notebook, we'll use the Movies dataset mentioned in the paper. This dataset along with the topics on which the coherence is calculated and the gold (human) ratings on these topics can be found [here](http://139.18.2.164/mroeder/palmetto/datasets/).\n", "2. We will then calculate the coherence on these topics using the pipeline implemented in gensim.\n", - "3. Once we have got all our coherence values on these topics we will calculate the correlation with the human ratings using pearson's r.\n", + "3. Once we have all our coherence values on these topics we will calculate the correlation with the human ratings using pearson's r.\n", "4. We will compare this final correlation value with the values listed in the paper and see if the pipeline is working as expected." 
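+    "\n",
+    "As a rough sketch of steps 3 and 4 (`coherence_values` and `human_scores` here are placeholder names for equal-length lists of floats), the correlation boils down to:\n",
+    "\n",
+    "```python\n",
+    "from scipy.stats import pearsonr\n",
+    "\n",
+    "r, p_value = pearsonr(coherence_values, human_scores)\n",
+    "print('correlation with human ratings: %.3f' % r)\n",
+    "```"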
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 1, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The line_profiler extension is already loaded. To reload it, use:\n", - " %reload_ext line_profiler\n" - ] - } - ], + "outputs": [], "source": [ + "from __future__ import print_function\n", + "\n", "import re\n", "import os\n", "\n", @@ -45,31 +38,37 @@ "from datetime import datetime\n", "\n", "from gensim.models import CoherenceModel\n", - "from gensim.corpora.dictionary import Dictionary\n", - "# %load_ext line_profiler # This was used for finding out which line was taking maximum time for indirect confirmation measure" + "from gensim.corpora.dictionary import Dictionary" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Download the dataset from the link and plug in the location here" + "Download the dataset (`movie.zip`) and gold standard data (`topicsMovie.txt` and `goldMovie.txt`) from the link and plug in the locations below." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "prefix = \"/home/devashish/datasets/Movies/movie/\"" + "base_dir = os.path.join(os.path.expanduser('~'), \"workshop/nlp/data/\")\n", + "data_dir = os.path.join(base_dir, 'wiki-movie-subset')\n", + "if not os.path.exists(data_dir):\n", + " raise ValueError(\"SKIP: Please download the movie corpus.\")\n", + "\n", + "ref_dir = os.path.join(base_dir, 'reference')\n", + "topics_path = os.path.join(ref_dir, 'topicsMovie.txt')\n", + "human_scores_path = os.path.join(ref_dir, 'goldMovie.txt')" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "metadata": { "collapsed": false }, @@ -78,31 +77,60 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time taken: 0:10:23.956500\n" + "PROGRESS: 10000/125384, preprocessed 9916, discarded 84\n", + "PROGRESS: 20000/125384, preprocessed 19734, discarded 266\n", + "PROGRESS: 30000/125384, preprocessed 29648, discarded 352\n", + "PROGRESS: 50000/125384, preprocessed 37074, discarded 12926\n", + "PROGRESS: 60000/125384, preprocessed 47003, discarded 12997\n", + "PROGRESS: 70000/125384, preprocessed 56961, discarded 13039\n", + "PROGRESS: 80000/125384, preprocessed 66891, discarded 13109\n", + "PROGRESS: 90000/125384, preprocessed 76784, discarded 13216\n", + "PROGRESS: 100000/125384, preprocessed 86692, discarded 13308\n", + "PROGRESS: 110000/125384, preprocessed 96593, discarded 13407\n", + "PROGRESS: 120000/125384, preprocessed 106522, discarded 13478\n", + "CPU times: user 19.8 s, sys: 9.55 s, total: 29.4 s\n", + "Wall time: 44.9 s\n" ] } ], "source": [ - "import os\n", - "if not os.path.exists(prefix):\n", - " raise ValueError(\"SKIP: Please download the movie corpus.\")\n", + "%%time\n", "\n", - "start = datetime.now()\n", "texts = []\n", - "for fil in os.listdir(prefix):\n", - " for line in open(prefix + fil):\n", - " # lower case all words\n", - " lowered = line.lower()\n", - " #remove punctuation and split into seperate words\n", - " words = re.findall(r'\\w+', lowered, flags = re.UNICODE | re.LOCALE)\n", - " texts.append(words)\n", - "end = datetime.now()\n", - "print(\"Time taken: %s\" % (end - start))" + "file_num = 0\n", + "preprocessed = 0\n", + "listing = os.listdir(data_dir)\n", + "\n", + "for fname in listing:\n", + " file_num += 1\n", + " if 'disambiguation' in fname:\n", + " continue # discard disambiguation and 
redirect pages\n",
+    "    elif fname.startswith('File_'):\n",
+    "        continue  # discard images, gifs, etc.\n",
+    "    elif fname.startswith('Category_'):\n",
+    "        continue  # discard category articles\n",
+    "    \n",
+    "    # Not sure how to identify portal and redirect pages,\n",
+    "    # as well as pages about a single year.\n",
+    "    # As a result, this preprocessing differs from the paper.\n",
+    "    \n",
+    "    with open(os.path.join(data_dir, fname)) as f:\n",
+    "        for line in f:\n",
+    "            # lower-case all words\n",
+    "            lowered = line.lower()\n",
+    "            # remove punctuation and split into separate words\n",
+    "            words = re.findall(r'\\w+', lowered, flags = re.UNICODE | re.LOCALE)\n",
+    "            texts.append(words)\n",
+    "    \n",
+    "    preprocessed += 1\n",
+    "    if file_num % 10000 == 0:\n",
+    "        print('PROGRESS: %d/%d, preprocessed %d, discarded %d' % (\n",
+    "            file_num, len(listing), preprocessed, (file_num - preprocessed)))"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Time taken: 0:01:44.047829\n"
+      "CPU times: user 1min 26s, sys: 1.1 s, total: 1min 27s\n",
+      "Wall time: 1min 27s\n"
     ]
    }
   ],
   "source": [
-    "start = datetime.now()\n",
+    "%%time\n",
+    "\n",
    "dictionary = Dictionary(texts)\n",
-    "corpus = [dictionary.doc2bow(text) for text in texts]\n",
-    "end = datetime.now()\n",
-    "print \"Time taken: %s\" % (end - start)"
+    "corpus = [dictionary.doc2bow(text) for text in texts]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "According to the paper the number of documents should be 108952 with a vocabulary of 1625124. The difference is because of a difference in preprocessing. However the results obtained are still very similar."
+    "According to the paper the number of documents should be 108,952 with a vocabulary of 1,625,124. The difference is due to differences in preprocessing; however, the results obtained are still very similar."
] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 5, "metadata": { "collapsed": false }, @@ -148,19 +176,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "124234\n", - "Dictionary(758123 unique tokens: [u'schelberger', u'mdbg', u'shatzky', u'bhetan', u'verplank']...)\n" + "111637\n", + "Dictionary(756837 unique tokens: [u'verplank', u'mdbg', u'shatzky', u'duelcity', u'dulcitone']...)\n" ] } ], "source": [ - "print len(corpus)\n", - "print dictionary" + "print(len(corpus))\n", + "print(dictionary)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 6, "metadata": { "collapsed": false }, @@ -168,44 +196,57 @@ { "data": { "text/plain": [ - "[[]]" + "100" ] }, - "execution_count": 15, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "topics = [] # list of 100 topics\n", - "for l in open('/home/devashish/datasets/Movies/topicsMovie.txt'):\n", - " topics.append([l.split()])\n", - "topics.pop(100)" + "with open(topics_path) as f:\n", + " topics = [line.split() for line in f if line]\n", + "len(topics)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 7, "metadata": { - "collapsed": true + "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "100" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "human_scores = []\n", - "for l in open('/home/devashish/datasets/Movies/goldMovie.txt'):\n", - " human_scores.append(float(l.strip()))" + "with open(human_scores_path) as f:\n", + " for line in f:\n", + " human_scores.append(float(line.strip()))\n", + "len(human_scores)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Start off with u_mass coherence measure." + "### Deal with any vocabulary mismatch." ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 8, "metadata": { "collapsed": false }, @@ -214,35 +255,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time taken: 0:20:44.833342\n" + "Topics with out-of-vocab terms: 72\n" ] } ], "source": [ - "start = datetime.now()\n", - "u_mass = []\n", - "flags = []\n", - "for n, topic in enumerate(topics):\n", - " try:\n", - " cm = CoherenceModel(topics=topic, corpus=corpus, dictionary=dictionary, coherence='u_mass')\n", - " u_mass.append(cm.get_coherence())\n", - " except KeyError:\n", - " flags.append(n)\n", - "end = datetime.now()\n", - "print \"Time taken: %s\" % (end - start)" + "# We first need to filter out any topics that contain terms not in our dictionary\n", + "# These may occur as a result of preprocessing steps differing from those used to\n", + "# produce the reference topics. In this case, this only occurs in one topic.\n", + "invalid_topic_indices = set(\n", + " i for i, topic in enumerate(topics)\n", + " if any(t not in dictionary.token2id for t in topic)\n", + ")\n", + "print(\"Topics with out-of-vocab terms: %s\" % ', '.join(map(str, invalid_topic_indices)))\n", + "usable_topics = [topic for i, topic in enumerate(topics) if i not in invalid_topic_indices]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Start c_v coherence measure\n", - "This is expected to take much more time since `c_v` uses a sliding window to perform probability estimation and uses the cosine similarity indirect confirmation measure." + "### Start off with u_mass coherence measure." 
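+    "\n",
+    "As a reminder (glossing over the pipeline's exact segmentation and aggregation details), `u_mass` scores a pair of topic words by their document co-occurrence counts:\n",
+    "\n",
+    "$$\\mathrm{score}(w_i, w_j) = \\log \\frac{D(w_i, w_j) + 1}{D(w_j)}$$\n",
+    "\n",
+    "where $D(\\cdot)$ counts the documents containing the given word(s). It therefore needs only the bag-of-words `corpus`, with no sliding-window pass over `texts`."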
] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": { "collapsed": false }, @@ -251,132 +289,101 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time taken: 19:50:11.214341\n" + "Calculated u_mass coherence for 99 topics\n", + "CPU times: user 7.22 s, sys: 141 ms, total: 7.36 s\n", + "Wall time: 7.38 s\n" ] } ], "source": [ - "start = datetime.now()\n", - "c_v = []\n", - "for n, topic in enumerate(topics):\n", - " try:\n", - " cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary, coherence='c_v')\n", - " c_v.append(cm.get_coherence())\n", - " except KeyError:\n", - " pass\n", - "end = datetime.now()\n", - "print \"Time taken: %s\" % (end - start)" + "%%time\n", + "\n", + "cm = CoherenceModel(topics=usable_topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')\n", + "u_mass = cm.get_coherence_per_topic()\n", + "print(\"Calculated u_mass coherence for %d topics\" % len(u_mass))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Start c_uci and c_npmi coherence measures\n", - "They should be taking lesser time than c_v but should have a higher correlation than u_mass" + "### Start c_v coherence measure\n", + "This is expected to take much more time since `c_v` uses a sliding window to perform probability estimation and uses the cosine similarity indirect confirmation measure." ] }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Time taken: 2:55:36.044760\n" - ] - } - ], + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], "source": [ - "start = datetime.now()\n", - "c_uci = []\n", - "flags = []\n", - "for n, topic in enumerate(topics):\n", - " try:\n", - " cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary, coherence='c_uci')\n", - " c_uci.append(cm.get_coherence())\n", - " except KeyError:\n", - " flags.append(n)\n", - "end = datetime.now()\n", - "print \"Time taken: %s\" % (end - start)" + "%%time\n", + "\n", + "cm = CoherenceModel(topics=usable_topics, texts=texts, dictionary=dictionary, coherence='c_v')\n", + "c_v = cm.get_coherence_per_topic()\n", + "print(\"Calculated c_v coherence for %d topics\" % len(c_v))" ] }, { - "cell_type": "code", - "execution_count": 20, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Time taken: 2:53:55.424213\n" - ] - } - ], "source": [ - "start = datetime.now()\n", - "c_npmi = []\n", - "for n, topic in enumerate(topics):\n", - " print n\n", - " try:\n", - " cm = CoherenceModel(topics=topic, texts=texts, dictionary=dictionary, coherence='c_npmi')\n", - " c_npmi.append(cm.get_coherence())\n", - " except KeyError:\n", - " pass\n", - "end = datetime.now()\n", - "print \"Time taken: %s\" % (end - start)" + "### Start c_uci and c_npmi coherence measures\n", + "c_v and c_uci and c_npmi all use the boolean sliding window approach of estimating probabilities. Since the `CoherenceModel` caches the accumulated statistics, calculation of c_uci and c_npmi are practically free after calculating c_v coherence. These two methods are simpler and were shown to correlate less with human judgements than c_v but more so than u_mass." 
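+    "\n",
+    "For reference (exact smoothing details vary by implementation), `c_uci` averages pairwise PMI over the top words, while `c_npmi` uses the normalized variant:\n",
+    "\n",
+    "$$\\mathrm{NPMI}(w_i, w_j) = \\frac{\\log \\frac{P(w_i, w_j) + \\epsilon}{P(w_i)\\,P(w_j)}}{-\\log (P(w_i, w_j) + \\epsilon)}$$\n",
+    "\n",
+    "with probabilities estimated from the boolean sliding windows."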
] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "final_scores = []\n", - "for n, score in enumerate(human_scores):\n", - " if n not in flags:\n", - " final_scores.append(score)" + "%%time\n", + "\n", + "cm.coherence = 'c_uci'\n", + "c_uci = cm.get_coherence_per_topic()\n", + "print(\"Calculated c_uci coherence for %d topics\" % len(c_uci))" ] }, { - "cell_type": "markdown", - "metadata": {}, + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], "source": [ - "One topic encountered a KeyError. This was because of a difference in preprocessing due to which one topic word wasn't found in the dictionary" + "%%time\n", + "\n", + "cm.coherence = 'c_npmi'\n", + "c_npmi = cm.get_coherence_per_topic()\n", + "print(\"Calculated c_npmi coherence for %d topics\" % len(c_npmi))" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "99 99 99 99 99\n" - ] - } - ], + "outputs": [], "source": [ - "print len(u_mass), len(c_v), len(c_uci), len(c_npmi), len(final_scores)\n", - "# 1 topic has word(s) that is not in the dictionary. Probably some difference\n", - "# in preprocessing" + "final_scores = [\n", + " score for i, score in enumerate(human_scores)\n", + " if i not in invalid_topic_indices\n", + "]\n", + "len(final_scores)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The values in the paper were:\n", + "The [values in the paper](http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf) were:\n", "\n", "__`u_mass` correlation__ : 0.093\n", "\n", @@ -391,27 +398,14 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.133916622716\n", - "0.555948711374\n", - "0.414722858726\n", - "0.39935634517\n" - ] - } - ], + "outputs": [], "source": [ - "print pearsonr(u_mass, final_scores)[0]\n", - "print pearsonr(c_v, final_scores)[0]\n", - "print pearsonr(c_uci, final_scores)[0]\n", - "print pearsonr(c_npmi, final_scores)[0]" + "for our_scores in (u_mass, c_v, c_uci, c_npmi):\n", + " print(pearsonr(our_scores, final_scores)[0])" ] }, { @@ -420,8 +414,8 @@ "source": [ "### Where do we go now?\n", "\n", - "- Preprocessing can be improved for this notebook by following the exact process mentioned in [this](http://arxiv.org/pdf/1403.6397v1.pdf) paper.\n", - "- The time required for completing all of these operations can be improved a lot by cythonising the operations." + "- Preprocessing can be improved for this notebook by following the exact process mentioned in the reference paper. Specifically: _All corpora as well as the complete Wikipedia used as reference corpus are preprocessed using lemmatization and stop word removal. Additionally, we removed portal and category articles, redirection and disambiguation pages as well as articles about single years._ Right now it differs only in not removing redirect and portal pages and pages about single years.\n", + "- The time required for completing all of these operations can be improved a lot by cythonising them." 
] } ], @@ -441,7 +435,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.11" + "version": "2.7.13" } }, "nbformat": 4, From b41968dd76826f3eedd16ceffb7c88add93816c5 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Mon, 19 Jun 2017 07:23:37 -0400 Subject: [PATCH 3/4] a few minor updates to the text of the topic coherence benchmark on the movies dataset --- docs/notebooks/topic_coherence-movies.ipynb | 74 ++++++++++++++++++--- 1 file changed, 63 insertions(+), 11 deletions(-) diff --git a/docs/notebooks/topic_coherence-movies.ipynb b/docs/notebooks/topic_coherence-movies.ipynb index 25978147a4..0c0587c96e 100644 --- a/docs/notebooks/topic_coherence-movies.ipynb +++ b/docs/notebooks/topic_coherence-movies.ipynb @@ -280,7 +280,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "collapsed": false }, @@ -313,11 +313,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Calculated c_v coherence for 99 topics\n", + "CPU times: user 38.5 s, sys: 5.52 s, total: 44 s\n", + "Wall time: 13min 8s\n" + ] + } + ], "source": [ "%%time\n", "\n", @@ -336,11 +346,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Calculated c_uci coherence for 99 topics\n", + "CPU times: user 95 ms, sys: 8.87 ms, total: 104 ms\n", + "Wall time: 97.2 ms\n" + ] + } + ], "source": [ "%%time\n", "\n", @@ -351,11 +371,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Calculated c_npmi coherence for 99 topics\n", + "CPU times: user 192 ms, sys: 6.38 ms, total: 198 ms\n", + "Wall time: 194 ms\n" + ] + } + ], "source": [ "%%time\n", "\n", @@ -366,11 +396,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "99" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "final_scores = [\n", " score for i, score in enumerate(human_scores)\n", @@ -398,11 +439,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.158529392277\n", + "0.530450687702\n", + "0.406162050908\n", + "0.46002144316\n" + ] + } + ], "source": [ "for our_scores in (u_mass, c_v, c_uci, c_npmi):\n", " print(pearsonr(our_scores, final_scores)[0])" From b18b09d3a586b180493e1b3060dd48a793f0d543 Mon Sep 17 00:00:00 2001 From: "Sweeney, Mack" Date: Mon, 19 Jun 2017 10:19:23 -0400 Subject: [PATCH 4/4] add new notebook demonstrating use of the CoherenceModel for model selection --- docs/notebooks/topic_coherence-movies.ipynb | 15 +- .../topic_coherence_model_selection.ipynb | 523 ++++++++++++++++++ 2 files changed, 535 insertions(+), 3 deletions(-) create mode 100644 docs/notebooks/topic_coherence_model_selection.ipynb diff --git a/docs/notebooks/topic_coherence-movies.ipynb b/docs/notebooks/topic_coherence-movies.ipynb index 
0c0587c96e..983905b31e 100644 --- a/docs/notebooks/topic_coherence-movies.ipynb +++ b/docs/notebooks/topic_coherence-movies.ipynb @@ -434,7 +434,7 @@ "\n", "__`c_npmi` correlation__ : 0.438\n", "\n", - "Our values are also very similar to these values which is good. This validates the correctness of our pipeline." + "Our values are also very similar to these values which is good. This validates the correctness of our pipeline, as we can reasonably attribute the differences to differences in preprocessing." ] }, { @@ -466,9 +466,18 @@ "source": [ "### Where do we go now?\n", "\n", - "- Preprocessing can be improved for this notebook by following the exact process mentioned in the reference paper. Specifically: _All corpora as well as the complete Wikipedia used as reference corpus are preprocessed using lemmatization and stop word removal. Additionally, we removed portal and category articles, redirection and disambiguation pages as well as articles about single years._ Right now it differs only in not removing redirect and portal pages and pages about single years.\n", - "- The time required for completing all of these operations can be improved a lot by cythonising them." + "- The time required for completing all of these operations can be improved a lot by cythonising them.\n", + "- Preprocessing can be improved for this notebook by following the exact process mentioned in the reference paper. Specifically: _All corpora as well as the complete Wikipedia used as reference corpus are preprocessed using lemmatization and stop word removal. Additionally, we removed portal and category articles, redirection and disambiguation pages as well as articles about single years._ *Note*: we tried lemmatizing and found that significantly more of the reference topics had out-of-vocabulary terms." ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs/notebooks/topic_coherence_model_selection.ipynb b/docs/notebooks/topic_coherence_model_selection.ipynb new file mode 100644 index 0000000000..e0c0efbd4f --- /dev/null +++ b/docs/notebooks/topic_coherence_model_selection.ipynb @@ -0,0 +1,523 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Performing Model Selection Using Topic Coherence\n", + "\n", + "This notebook will perform topic modeling on the 20 Newsgroups corpus using LDA. We will perform model selection (over the number of topics) using topic coherence as our evaluation metric. This will showcase some of the features of the topic coherence pipeline implemented in `gensim`. In particular, we will see several features of the `CoherenceModel`." + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import print_function\n", + "\n", + "import os\n", + "import re\n", + "\n", + "from gensim.corpora import TextCorpus, MmCorpus\n", + "from gensim import utils, models\n", + "from gensim.parsing.preprocessing import STOPWORDS\n", + "from gensim.utils import deaccent" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Parsing the Dataset\n", + "\n", + "The 20 Newsgroups dataset uses a hierarchical directory structure to store the articles. 
The structure looks something like this:\n",
+    "```\n",
+    "20news-18828/\n",
+    "|-- alt.atheism\n",
+    "|   |-- 49960\n",
+    "|   |-- 51060\n",
+    "|   |-- 51119\n",
+    "|-- comp.graphics\n",
+    "|   |-- 37261\n",
+    "|   |-- 37913\n",
+    "|   |-- 37914\n",
+    "|-- comp.os.ms-windows.misc\n",
+    "|   |-- 10000\n",
+    "|   |-- 10001\n",
+    "|   |-- 10002\n",
+    "```\n",
+    "\n",
+    "The files are in the newsgroup markup format, which includes some headers, quoting of previous messages in the thread, and possibly PGP signature blocks. The message body itself is raw text, which requires preprocessing. The code immediately below is an adaptation of [an active PR](https://github.com/RaRe-Technologies/gensim/pull/1388) for parsing hierarchical directory structures into corpora. The code just below that builds on this basic corpus parser to handle the newsgroup-specific text parsing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "class TextDirectoryCorpus(TextCorpus):\n",
+    "    \"\"\"Read documents recursively from a directory,\n",
+    "    where each file is interpreted as a plain text document.\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    def iter_filepaths(self):\n",
+    "        \"\"\"Lazily yield the path to each file in the directory tree\n",
+    "        rooted at `self.input`.\n",
+    "        \"\"\"\n",
+    "        for dirpath, dirnames, filenames in os.walk(self.input):\n",
+    "            for name in filenames:\n",
+    "                yield os.path.join(dirpath, name)\n",
+    "    \n",
+    "    def getstream(self):\n",
+    "        # Yield the full raw contents of each file, one document per file.\n",
+    "        for path in self.iter_filepaths():\n",
+    "            with utils.smart_open(path) as f:\n",
+    "                doc_content = f.read()\n",
+    "            yield doc_content\n",
+    "    \n",
+    "    def preprocess_text(self, text):\n",
+    "        # Normalize (lowercase, deaccent, collapse whitespace), tokenize,\n",
+    "        # then drop stopwords and very short tokens.\n",
+    "        text = deaccent(\n",
+    "            lower_to_unicode(\n",
+    "                strip_multiple_whitespaces(text)))\n",
+    "        tokens = simple_tokenize(text)\n",
+    "        return remove_short(\n",
+    "            remove_stopwords(tokens))\n",
+    "    \n",
+    "    def get_texts(self):\n",
+    "        \"\"\"Iterate over the collection, yielding one document at a time. A document\n",
+    "        is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.\n",
+    "        Override this function to match your input (parse input files, do any\n",
+    "        text preprocessing, lowercasing, tokenizing etc.). 
There will be no further\n", + " preprocessing of the words coming out of this function.\n", + " \"\"\"\n", + " lines = self.getstream()\n", + " if self.metadata:\n", + " for lineno, line in enumerate(lines):\n", + " yield self.preprocess_text(line), (lineno,)\n", + " else:\n", + " for line in lines:\n", + " yield self.preprocess_text(line)\n", + "\n", + " \n", + "def remove_stopwords(tokens, stopwords=STOPWORDS):\n", + " return [token for token in tokens if token not in stopwords]\n", + "\n", + "def remove_short(tokens, minsize=3):\n", + " return [token for token in tokens if len(token) >= minsize]\n", + "\n", + "def lower_to_unicode(text):\n", + " return utils.to_unicode(text.lower(), 'ascii', 'ignore')\n", + "\n", + "RE_WHITESPACE = re.compile(r\"(\\s)+\", re.UNICODE)\n", + "def strip_multiple_whitespaces(text):\n", + " return RE_WHITESPACE.sub(\" \", text)\n", + "\n", + "PAT_ALPHABETIC = re.compile('(((?![\\d])\\w)+)', re.UNICODE)\n", + "def simple_tokenize(text):\n", + " for match in PAT_ALPHABETIC.finditer(text):\n", + " yield match.group()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "class NewsgroupCorpus(TextDirectoryCorpus):\n", + " \"\"\"Parse 20 Newsgroups dataset.\"\"\"\n", + "\n", + " def extract_body(self, text):\n", + " return strip_newsgroup_header(\n", + " strip_newsgroup_footer(\n", + " strip_newsgroup_quoting(text)))\n", + "\n", + " def preprocess_text(self, text):\n", + " body = self.extract_body(text)\n", + " return super(NewsgroupCorpus, self).preprocess_text(body)\n", + "\n", + "\n", + "def strip_newsgroup_header(text):\n", + " \"\"\"Given text in \"news\" format, strip the headers, by removing everything\n", + " before the first blank line.\n", + " \"\"\"\n", + " _before, _blankline, after = text.partition('\\n\\n')\n", + " return after\n", + "\n", + "\n", + "_QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'\n", + " r'|^In article|^Quoted from|^\\||^>)')\n", + "def strip_newsgroup_quoting(text):\n", + " \"\"\"Given text in \"news\" format, strip lines beginning with the quote\n", + " characters > or |, plus lines that often introduce a quoted section\n", + " (for example, because they contain the string 'writes:'.)\n", + " \"\"\"\n", + " good_lines = [line for line in text.split('\\n')\n", + " if not _QUOTE_RE.search(line)]\n", + " return '\\n'.join(good_lines)\n", + "\n", + "\n", + "_PGP_SIG_BEGIN = \"-----BEGIN PGP SIGNATURE-----\"\n", + "def strip_newsgroup_footer(text):\n", + " \"\"\"Given text in \"news\" format, attempt to remove a signature block.\"\"\"\n", + " try:\n", + " return text[:text.index(_PGP_SIG_BEGIN)]\n", + " except ValueError:\n", + " return text" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading the Dataset\n", + "\n", + "Now that we have defined the necessary code for parsing the dataset, let's load it up and serialize it into Matrix Market format. We'll do this because we want to train LDA on it with several different parameter settings, and this will allow us to avoid repeating the preprocessing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Replace data_path with path to your own copy of the corpus.\n", + "# You can download it from here: http://qwone.com/~jason/20Newsgroups/\n", + "# I'm using the original, called: 20news-19997.tar.gz\n", + "\n", + "home = os.path.expanduser('~')\n", + "data_dir = os.path.join(home, 'workshop', 'nlp', 'data')\n", + "data_path = os.path.join(data_dir, '20_newsgroups')" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "19998\n", + "Dictionary(107980 unique tokens: [u'jbwn', u'porkification', u'sowell', u'sonja', u'luanch']...)\n", + "CPU times: user 38.3 s, sys: 2.43 s, total: 40.7 s\n", + "Wall time: 43.7 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "corpus = NewsgroupCorpus(data_path)\n", + "dictionary = corpus.dictionary\n", + "print(len(corpus))\n", + "print(dictionary)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 25.9 s, sys: 2.76 s, total: 28.7 s\n", + "Wall time: 34 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "mm_path = os.path.join(data_dir, '20_newsgroups.mm')\n", + "MmCorpus.serialize(mm_path, corpus, id2word=dictionary)\n", + "mm_corpus = MmCorpus(mm_path) # load back in to use for LDA training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training the Models\n", + "\n", + "Our goal is to determine which number of topics produces the most coherent topics for the 20 Newsgroups corpus. The corpus is roughly 20,000 documents. If we used 100 topics and the documents were evenly distributed among topics, we'd have clusters of 200 documents. This seems like a reasonable upper bound. In this case, the corpus actually has categories, defined by the first-level directory structure. This can be seen in the directory structure shown above, and three examples are: `alt.atheism`, `comp.graphics`, and `comp.os.ms-windows.misc`. There are 20 of these (hence the name of the dataset), so we'll use 20 as our lower bound for the number of topics.\n", + "\n", + "One could argue that we already know the model should have 20 topics. I'll argue there may be additional categorizations within each newsgroup and we might hope to capture those by using more topics. We'll step by increments of 10 from 20 to 100." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training LDA(k=20)\n", + "Training LDA(k=30)\n", + "Training LDA(k=40)\n", + "Training LDA(k=50)\n", + "Training LDA(k=60)\n", + "Training LDA(k=70)\n", + "Training LDA(k=80)\n", + "Training LDA(k=90)\n", + "Training LDA(k=100)\n", + "CPU times: user 1h 27min 7s, sys: 7min 54s, total: 1h 35min 2s\n", + "Wall time: 1h 3min 27s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "trained_models = {}\n", + "for num_topics in range(20, 101, 10):\n", + " print(\"Training LDA(k=%d)\" % num_topics)\n", + " lda = models.LdaMulticore(\n", + " mm_corpus, id2word=dictionary, num_topics=num_topics, workers=4,\n", + " passes=10, iterations=200, random_state=42,\n", + " alpha='asymmetric', # shown to be better than symmetric in most cases\n", + " decay=0.5, offset=64 # best params from Hoffman paper\n", + " )\n", + " trained_models[num_topics] = lda" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation Using Coherence\n", + "\n", + "Now we get to the heart of this notebook. In this section, we'll evaluate each of our LDA models using topic coherence. Coherence is a measure of how interpretable the topics are to humans. It is based on the representation of topics as the top-N most probable words for a particular topic. More specifically, given the topic-term matrix for LDA, we sort each topic from highest to lowest term weights and then select the first N terms.\n", + "\n", + "Coherence essentially measures how similar these words are to each other. There are various methods for doing this, most of which have been explored in the paper [\"Exploring the Space of Topic Coherence Measures\"](https://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf). The authors performed a comparative analysis of various methods, correlating them to human judgements. The method named \"c_v\" coherence was found to be the most highly correlated. This and several of the other methods have been implemented in `gensim.models.CoherenceModel`. We will use this to perform our evaluations.\n", + "\n", + "The \"c_v\" coherence method makes an expensive pass over the corpus, accumulating term occurrence and co-occurrence counts. It only accumulates counts for the terms in the lists of top-N terms for each topic. In order to ensure we only need to make one pass, we'll construct a \"super topic\" from the top-N lists of each of the models. This will consist of a single topic with all the relevant terms from all the models. We choose 20 as N." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of relevant terms: 3517\n" + ] + } + ], + "source": [ + "# Build topic listings from each model.\n", + "import itertools\n", + "from gensim import matutils\n", + "\n", + "\n", + "def top_topics(lda, num_words=20):\n", + " str_topics = []\n", + " for topic in lda.state.get_lambda():\n", + " topic = topic / topic.sum() # normalize to probability distribution\n", + " bestn = matutils.argsort(topic, topn=num_words, reverse=True)\n", + " beststr = [lda.id2word[_id] for _id in bestn]\n", + " str_topics.append(beststr)\n", + " return str_topics\n", + "\n", + "\n", + "model_topics = {}\n", + "super_topic = set()\n", + "for num_topics, model in trained_models.items():\n", + " topics_as_topn_terms = top_topics(model)\n", + " model_topics[num_topics] = topics_as_topn_terms\n", + " super_topic.update(itertools.chain.from_iterable(topics_as_topn_terms))\n", + " \n", + "print(\"Number of relevant terms: %d\" % len(super_topic))" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 34 s, sys: 3.1 s, total: 37.1 s\n", + "Wall time: 56.9 s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Now estimate the probabilities for the CoherenceModel\n", + "\n", + "cm = models.CoherenceModel(\n", + " topics=[super_topic], texts=corpus.get_texts(),\n", + " dictionary=dictionary, coherence='c_v')\n", + "cm.estimate_probabilities()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Avg coherence for num_topics=100: 0.48958\n", + "Avg coherence for num_topics=70: 0.50393\n", + "Avg coherence for num_topics=40: 0.51029\n", + "Avg coherence for num_topics=80: 0.51147\n", + "Avg coherence for num_topics=50: 0.51582\n", + "Avg coherence for num_topics=20: 0.49602\n", + "Avg coherence for num_topics=90: 0.47067\n", + "Avg coherence for num_topics=60: 0.48913\n", + "Avg coherence for num_topics=30: 0.48709\n", + "CPU times: user 2min 39s, sys: 524 ms, total: 2min 39s\n", + "Wall time: 2min 40s\n" + ] + } + ], + "source": [ + "%%time\n", + "import numpy as np\n", + "# Next we perform the coherence evaluation for each of the models.\n", + "# Since we have already precomputed the probabilities, this simply\n", + "# involves using the accumulated stats in the `CoherenceModel` to\n", + "# perform the evaluations, which should be pretty quick.\n", + "\n", + "coherences = {}\n", + "for num_topics, topics in model_topics.items():\n", + " cm.topics = topics\n", + "\n", + " # We evaluate at various values of N and average them. 
This is more robust,\n",
+    "    # according to: http://people.eng.unimelb.edu.au/tbaldwin/pubs/naacl2016.pdf\n",
+    "    coherence_at_n = {}\n",
+    "    for n in (20, 15, 10, 5):\n",
+    "        cm.topn = n\n",
+    "        topic_coherences = cm.get_coherence_per_topic()\n",
+    "        \n",
+    "        # Let's record the coherences for each topic, as well as the aggregated\n",
+    "        # coherence across all of the topics.\n",
+    "        coherence_at_n[n] = (topic_coherences, cm.aggregate_measures(topic_coherences))\n",
+    "        \n",
+    "    topic_coherences, avg_coherences = zip(*coherence_at_n.values())\n",
+    "    avg_topic_coherences = np.vstack(topic_coherences).mean(0)\n",
+    "    avg_coherence = np.mean(avg_coherences)\n",
+    "    print(\"Avg coherence for num_topics=%d: %.5f\" % (num_topics, avg_coherence))\n",
+    "    coherences[num_topics] = (avg_topic_coherences, avg_coherence)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Ranked by average 'c_v' coherence:\n",
+      "\n",
+      "num_topics=50:\t0.5158\n",
+      "num_topics=80:\t0.5115\n",
+      "num_topics=40:\t0.5103\n",
+      "num_topics=70:\t0.5039\n",
+      "num_topics=20:\t0.4960\n",
+      "num_topics=100:\t0.4896\n",
+      "num_topics=60:\t0.4891\n",
+      "num_topics=30:\t0.4871\n",
+      "num_topics=90:\t0.4707\n",
+      "\n",
+      "Best: 50\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Print the coherence rankings\n",
+    "\n",
+    "avg_coherence = \\\n",
+    "    [(num_topics, avg_coherence)\n",
+    "     for num_topics, (_, avg_coherence) in coherences.items()]\n",
+    "ranked = sorted(avg_coherence, key=lambda tup: tup[1], reverse=True)\n",
+    "print(\"Ranked by average '%s' coherence:\\n\" % cm.coherence)\n",
+    "for item in ranked:\n",
+    "    print(\"num_topics=%d:\\t%.4f\" % item)\n",
+    "print(\"\\nBest: %d\" % ranked[0][0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Conclusion\n",
+    "\n",
+    "In this notebook, we used `gensim`'s `CoherenceModel` to perform model selection over the number of topics for LDA. We found that for the 20 Newsgroups corpus, 50 topics is best. We showcased the ability of the coherence pipeline to evaluate individual topic coherence as well as aggregated model coherence. We also demonstrated how to avoid repeated passes over the corpus, estimating the term similarity probabilities for all relevant terms just once. Topic coherence is a powerful alternative to evaluation using perplexity on a held-out document set. It is appropriate whenever the objective of the topic modeling is to present the topics as top-N lists for human consumption.\n",
+    "\n",
+    "Note that coherence calculations are generally much more accurate when a larger reference corpus is used to estimate the probabilities. In this case, we used the same corpus as for our modeling, which is relatively small at roughly 20,000 documents. A better reference corpus is the full Wikipedia corpus. The motivated explorer of this notebook is encouraged to download that corpus (see [Experiments on the English Wikipedia](https://radimrehurek.com/gensim/wiki.html)) and use it for probability estimation.\n",
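+    "\n",
+    "A hypothetical sketch of that swap, reusing this notebook's own API calls (`wiki_texts` is a placeholder for an iterable of tokenized Wikipedia documents):\n",
+    "\n",
+    "```python\n",
+    "cm = models.CoherenceModel(\n",
+    "    topics=[super_topic], texts=wiki_texts,\n",
+    "    dictionary=dictionary, coherence='c_v')\n",
+    "cm.estimate_probabilities()\n",
+    "```"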
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}