
Commit 690251e (merge, 2 parents: 2eef0b6 + a49aa9b)
…into fix1294
manneshiva committed May 23, 2017
Showing 35 changed files with 1,956 additions and 450 deletions.
96 changes: 57 additions & 39 deletions docs/notebooks/Corpora_and_Vector_Spaces.ipynb
@@ -17,7 +17,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"collapsed": true
},
@@ -27,6 +27,20 @@
"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import tempfile\n",
+ "TEMP_FOLDER = tempfile.gettempdir()\n",
+ "print('Folder \"{}\" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER))"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -40,7 +54,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"collapsed": false
},
@@ -145,13 +159,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary(12 unique tokens: ['response', 'survey', 'computer', 'user', 'minors']...)\n"
"Dictionary(12 unique tokens: ['human', 'interface', 'computer', 'survey', 'user']...)\n"
]
}
],
"source": [
"dictionary = corpora.Dictionary(texts)\n",
"dictionary.save('/tmp/deerwester.dict') # store the dictionary, for future reference\n",
"dictionary.save(os.path.join(TEMP_FOLDER, 'deerwester.dict')) # store the dictionary, for future reference\n",
"print(dictionary)"
]
},
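For reference, a dictionary saved this way can be loaded back in a later session. A minimal sketch, re-deriving TEMP_FOLDER the same way as the cell added above:

import os
import tempfile
from gensim import corpora

TEMP_FOLDER = tempfile.gettempdir()
# load the mapping persisted by dictionary.save() above
dictionary = corpora.Dictionary.load(os.path.join(TEMP_FOLDER, 'deerwester.dict'))
print(dictionary.token2id)  # token -> integer id mapping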
@@ -173,7 +187,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'response': 3, 'survey': 4, 'computer': 2, 'user': 5, 'minors': 11, 'time': 6, 'system': 7, 'graph': 10, 'interface': 1, 'human': 0, 'eps': 8, 'trees': 9}\n"
"{'human': 0, 'interface': 1, 'computer': 2, 'survey': 3, 'user': 4, 'system': 5, 'response': 6, 'time': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}\n"
]
}
],
@@ -213,7 +227,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The function `doc2bow()` simply counts the number of occurrences of each distinct word, converts the word to its integer word id and returns the result as a sparse vector. The sparse vector `[(word_id, 1), (word_id, 1)]` therefore reads: in the document *“Human computer interaction”*, the words *\"computer\"* and *\"human\"*, identified by an integer id given by the built dictionary, appear once; the other ten dictionary words appear (implicitly) zero times. Check their id at the dictionary displayed in the previous cell and see that they match."
"The function `doc2bow()` simply counts the number of occurrences of each distinct word, converts the word to its integer word id and returns the result as a sparse vector, in the form of `[(word_id, word_count), ...]`. \n",
"\n",
"As the token_id is 0 for *\"human\"* and 2 for *\"computer\"*, the new document *“Human computer interaction”* will be transformed to [(0, 1), (2, 1)]. The words *\"computer\"* and *\"human\"* exist in the dictionary and appear once. Thus, they become (0, 1), (2, 1) respectively in the sparse vector. The word *\"interaction\"* doesn't exist in the dictionary and, thus, will not show up in the sparse vector. The other ten dictionary words, that appear (implicitly) zero times, will not show up in the sparse vector and , ,there will never be a element in the sparse vector like (3, 0).\n",
"\n",
"For people familiar with scikit learn, `doc2bow()` has similar behaviors as calling `transform()` on [`CountVectorizer`](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html). `doc2bow()` can behave like `fit_transform()` as well. For more details, please look at [gensim API Doc](https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2bow)."
]
},
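As a concrete sketch of the behaviour described above (assuming the tokenized nine-document `texts` list built earlier in this notebook):

from gensim import corpora

dictionary = corpora.Dictionary(texts)  # same ids as printed in the previous cell
new_vec = dictionary.doc2bow("Human computer interaction".lower().split())
print(new_vec)  # [(0, 1), (2, 1)]; "interaction" is unseen, so it is simply omitted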
{
@@ -229,19 +247,19 @@
"text": [
"[(0, 1), (1, 1), (2, 1)]\n",
"[(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]\n",
"[(1, 1), (5, 1), (7, 1), (8, 1)]\n",
"[(0, 1), (7, 2), (8, 1)]\n",
"[(3, 1), (5, 1), (6, 1)]\n",
"[(1, 1), (4, 1), (5, 1), (8, 1)]\n",
"[(0, 1), (5, 2), (8, 1)]\n",
"[(4, 1), (6, 1), (7, 1)]\n",
"[(9, 1)]\n",
"[(9, 1), (10, 1)]\n",
"[(9, 1), (10, 1), (11, 1)]\n",
"[(4, 1), (10, 1), (11, 1)]\n"
"[(3, 1), (10, 1), (11, 1)]\n"
]
}
],
"source": [
"corpus = [dictionary.doc2bow(text) for text in texts]\n",
"corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus) # store to disk, for later use\n",
"corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'deerwester.mm'), corpus) # store to disk, for later use\n",
"for c in corpus:\n",
" print(c)"
]
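Since `MmCorpus.serialize()` also writes an index file alongside the .mm file, the stored documents can later be fetched by position as well as streamed. A short sketch, under the same TEMP_FOLDER assumption:

import os
import tempfile
from gensim import corpora

TEMP_FOLDER = tempfile.gettempdir()
corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, 'deerwester.mm'))
print(corpus[2])  # random access to the third document's sparse vector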
@@ -290,7 +308,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"<__main__.MyCorpus object at 0x7f4ad14856a0>\n"
"<__main__.MyCorpus object at 0x000002520A52E0B8>\n"
]
}
],
@@ -308,7 +326,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {
"collapsed": false
},
@@ -319,13 +337,13 @@
"text": [
"[(0, 1), (1, 1), (2, 1)]\n",
"[(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]\n",
"[(1, 1), (5, 1), (7, 1), (8, 1)]\n",
"[(0, 1), (7, 2), (8, 1)]\n",
"[(3, 1), (5, 1), (6, 1)]\n",
"[(1, 1), (4, 1), (5, 1), (8, 1)]\n",
"[(0, 1), (5, 2), (8, 1)]\n",
"[(4, 1), (6, 1), (7, 1)]\n",
"[(9, 1)]\n",
"[(9, 1), (10, 1)]\n",
"[(9, 1), (10, 1), (11, 1)]\n",
"[(4, 1), (10, 1), (11, 1)]\n"
"[(3, 1), (10, 1), (11, 1)]\n"
]
}
],
@@ -340,12 +358,12 @@
"source": [
"Although the output is the same as for the plain Python list, the corpus is now much more memory friendly, because at most one vector resides in RAM at a time. Your corpus can now be as large as you want.\n",
"\n",
"Similarly, to construct the dictionary without loading all texts into memory:"
"We are going to create the dictionary from the mycorpus.txt file without loading the entire file into memory. Then, we will generate the list of token ids to remove from this dictionary by querying the dictionary for the token ids of the stop words, and by querying the document frequencies dictionary (dictionary.dfs) for token ids that only appear once. Finally, we will filter these token ids out of our dictionary and call dictionary.compactify() to remove the gaps in the token id series."
]
},
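A sketch of the construction just described, assuming mycorpus.txt sits in the working directory and using a small illustrative stop word list:

from gensim import corpora

stoplist = set('for a of the and to in'.split())  # illustrative stop words
# collect statistics in one streamed pass; the full file is never held in memory
dictionary = corpora.Dictionary(line.lower().split() for line in open('mycorpus.txt'))
stop_ids = [dictionary.token2id[w] for w in stoplist if w in dictionary.token2id]
once_ids = [tid for tid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)  # drop stop words and words seen only once
dictionary.compactify()  # re-number ids to close the gaps left by filtering
print(dictionary)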
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {
"collapsed": false
},
@@ -354,7 +372,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary(12 unique tokens: ['response', 'computer', 'survey', 'user', 'minors']...)\n"
"Dictionary(12 unique tokens: ['human', 'interface', 'computer', 'survey', 'user']...)\n"
]
}
],
@@ -392,36 +410,36 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"# create a toy corpus of 2 documents, as a plain Python list\n",
"corpus = [[(1, 0.5)], []] # make one document empty, for the heck of it\n",
"\n",
"corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)"
"corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.mm'), corpus)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Other formats include [Joachim’s SVMlight format](http://svmlight.joachims.org/), [Blei’s LDA-C format](http://www.cs.princeton.edu/~blei/lda-c/) and [GibbsLDA++ format](http://gibbslda.sourceforge.net/)."
"Other formats include [Joachim’s SVMlight format](http://svmlight.joachims.org/), [Blei’s LDA-C format](http://www.cs.columbia.edu/~blei/lda-c/) and [GibbsLDA++ format](http://gibbslda.sourceforge.net/)."
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)\n",
"corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)\n",
"corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)"
"corpora.SvmLightCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.svmlight'), corpus)\n",
"corpora.BleiCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.lda-c'), corpus)\n",
"corpora.LowCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.low'), corpus)"
]
},
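Each serialized format has a matching reader class, so the files written above can be streamed back the same way. A sketch; the round-trip output shown is an assumption:

import os
import tempfile
from gensim import corpora

TEMP_FOLDER = tempfile.gettempdir()
svm_corpus = corpora.SvmLightCorpus(os.path.join(TEMP_FOLDER, 'corpus.svmlight'))
print(list(svm_corpus))  # expected: [[(1, 0.5)], []]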
{
@@ -433,13 +451,13 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"corpus = corpora.MmCorpus('/tmp/corpus.mm')"
"corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, 'corpus.mm'))"
]
},
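Corpus objects are streams: printing the object shows only a summary, while iterating yields one sparse document vector at a time. A quick sketch:

print(corpus)       # e.g. MmCorpus(2 documents, 2 features, 1 non-zero entries)
for doc in corpus:
    print(doc)      # [(1, 0.5)], then [] for the empty document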
{
@@ -451,7 +469,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 16,
"metadata": {
"collapsed": false
},
@@ -477,7 +495,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 17,
"metadata": {
"collapsed": false
},
@@ -504,7 +522,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 18,
"metadata": {
"collapsed": false
},
@@ -535,13 +553,13 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 19,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)"
"corpora.BleiCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.lda-c'), corpus)"
]
},
{
@@ -557,7 +575,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 20,
"metadata": {
"collapsed": false
},
@@ -579,7 +597,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 21,
"metadata": {
"collapsed": false
},
