From f0ca7e53a7551df880e24cda9675f3770871dd89 Mon Sep 17 00:00:00 2001 From: "Jonathan B. Coe" Date: Sun, 7 May 2017 14:23:52 +0100 Subject: [PATCH] add explanation of `size` minor typo fixes. --- docs/notebooks/word2vec.ipynb | 553 ++++++++++++++++++---------------- 1 file changed, 294 insertions(+), 259 deletions(-) diff --git a/docs/notebooks/word2vec.ipynb b/docs/notebooks/word2vec.ipynb index 1f490950fa..d1af8c15e7 100644 --- a/docs/notebooks/word2vec.ipynb +++ b/docs/notebooks/word2vec.ipynb @@ -2,10 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Word2Vec Tutorial\n", "\n", @@ -22,10 +19,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Preparing the Input\n", "Starting from the beginning, gensim’s `word2vec` expects a sequence of sentences as its input. Each sentence a list of words (utf8 strings):" @@ -34,11 +28,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# import modules & set up logging\n", @@ -49,17 +39,29 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "WARNING:gensim.models.word2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" + "2017-05-07 14:19:35,470 : INFO : collecting all words and their counts\n", + "2017-05-07 14:19:35,473 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2017-05-07 14:19:35,474 : INFO : collected 3 word types from a corpus of 4 raw words and 2 sentences\n", + "2017-05-07 14:19:35,476 : INFO : Loading a fresh vocabulary\n", + "2017-05-07 14:19:35,477 : INFO : min_count=1 retains 
3 unique words (100% of original 3, drops 0)\n", + "2017-05-07 14:19:35,478 : INFO : min_count=1 leaves 4 word corpus (100% of original 4, drops 0)\n", + "2017-05-07 14:19:35,480 : INFO : deleting the raw counts dictionary of 3 items\n", + "2017-05-07 14:19:35,481 : INFO : sample=0.001 downsamples 3 most-common words\n", + "2017-05-07 14:19:35,483 : INFO : downsampling leaves estimated 0 word corpus (5.7% of prior 4)\n", + "2017-05-07 14:19:35,484 : INFO : estimated required memory for 3 words and 100 dimensions: 3900 bytes\n", + "2017-05-07 14:19:35,485 : INFO : resetting layer weights\n", + "2017-05-07 14:19:35,487 : INFO : training model with 3 workers on 3 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2017-05-07 14:19:35,490 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2017-05-07 14:19:35,490 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2017-05-07 14:19:35,492 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2017-05-07 14:19:35,494 : INFO : training on 20 raw words (0 effective words) took 0.0s, 0 effective words/s\n", + "2017-05-07 14:19:35,497 : WARNING : under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" ] } ], @@ -71,10 +73,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Keeping the input as a Python built-in list is convenient, but can use up a lot of RAM when the input is large.\n", "\n", @@ -87,9 +86,7 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -111,9 +108,7 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -130,17 +125,13 @@ { "cell_type": "code", 
"execution_count": 5, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[['second'], ['sentence'], ['first'], ['sentence']]\n" + "[['first'], ['sentence'], ['second'], ['sentence']]\n" ] } ], @@ -152,17 +143,29 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "WARNING:gensim.models.word2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" + "2017-05-07 14:19:35,568 : INFO : collecting all words and their counts\n", + "2017-05-07 14:19:35,574 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2017-05-07 14:19:35,578 : INFO : collected 3 word types from a corpus of 4 raw words and 4 sentences\n", + "2017-05-07 14:19:35,579 : INFO : Loading a fresh vocabulary\n", + "2017-05-07 14:19:35,582 : INFO : min_count=1 retains 3 unique words (100% of original 3, drops 0)\n", + "2017-05-07 14:19:35,587 : INFO : min_count=1 leaves 4 word corpus (100% of original 4, drops 0)\n", + "2017-05-07 14:19:35,588 : INFO : deleting the raw counts dictionary of 3 items\n", + "2017-05-07 14:19:35,589 : INFO : sample=0.001 downsamples 3 most-common words\n", + "2017-05-07 14:19:35,590 : INFO : downsampling leaves estimated 0 word corpus (5.7% of prior 4)\n", + "2017-05-07 14:19:35,594 : INFO : estimated required memory for 3 words and 100 dimensions: 3900 bytes\n", + "2017-05-07 14:19:35,595 : INFO : resetting layer weights\n", + "2017-05-07 14:19:35,598 : INFO : training model with 3 workers on 3 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2017-05-07 14:19:35,603 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2017-05-07 14:19:35,605 : INFO : worker thread 
finished; awaiting finish of 1 more threads\n", + "2017-05-07 14:19:35,606 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2017-05-07 14:19:35,607 : INFO : training on 20 raw words (0 effective words) took 0.0s, 0 effective words/s\n", + "2017-05-07 14:19:35,609 : WARNING : under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" ] } ], @@ -174,18 +177,14 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Word2Vec(vocab=3, size=100, alpha=0.025)\n", - "{'second': , 'sentence': , 'first': }\n" + "{'second': , 'first': , 'sentence': }\n" ] } ], @@ -196,10 +195,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Say we want to further preprocess the words from the files — convert to unicode, lowercase, remove numbers, extract named entities… All of this can be done inside the `MySentences` iterator and `word2vec` doesn’t need to know. 
All that is required is that the input yields one sentence (list of utf8 words) after another.\n", "\n", @@ -213,17 +209,29 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "WARNING:gensim.models.word2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" + "2017-05-07 14:19:35,636 : INFO : collecting all words and their counts\n", + "2017-05-07 14:19:35,638 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2017-05-07 14:19:35,640 : INFO : collected 3 word types from a corpus of 4 raw words and 4 sentences\n", + "2017-05-07 14:19:35,641 : INFO : Loading a fresh vocabulary\n", + "2017-05-07 14:19:35,644 : INFO : min_count=1 retains 3 unique words (100% of original 3, drops 0)\n", + "2017-05-07 14:19:35,645 : INFO : min_count=1 leaves 4 word corpus (100% of original 4, drops 0)\n", + "2017-05-07 14:19:35,646 : INFO : deleting the raw counts dictionary of 3 items\n", + "2017-05-07 14:19:35,647 : INFO : sample=0.001 downsamples 3 most-common words\n", + "2017-05-07 14:19:35,649 : INFO : downsampling leaves estimated 0 word corpus (5.7% of prior 4)\n", + "2017-05-07 14:19:35,650 : INFO : estimated required memory for 3 words and 100 dimensions: 3900 bytes\n", + "2017-05-07 14:19:35,651 : INFO : resetting layer weights\n", + "2017-05-07 14:19:35,653 : INFO : training model with 3 workers on 3 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2017-05-07 14:19:35,656 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2017-05-07 14:19:35,657 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2017-05-07 14:19:35,658 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2017-05-07 14:19:35,660 : INFO : training on 20 raw words (0 
effective words) took 0.0s, 0 effective words/s\n", + "2017-05-07 14:19:35,662 : WARNING : under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" ] }, { @@ -248,18 +256,14 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Word2Vec(vocab=3, size=100, alpha=0.025)\n", - "{'second': , 'sentence': , 'first': }\n" + "{'second': , 'first': , 'sentence': }\n" ] } ], @@ -270,10 +274,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## More data would be nice\n", "For the following examples, we'll use the [Lee Corpus](https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/test/test_data/lee_background.cor) (which you already have if you've installed gensim):" @@ -283,9 +284,7 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -297,17 +296,13 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "<__main__.MyText object at 0x7f5edcd03b90>\n" + "<__main__.MyText object at 0x106c65b50>\n" ] } ], @@ -325,10 +320,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Training\n", "`Word2Vec` accepts several parameters that affect both training speed and quality.\n", @@ -339,26 +331,73 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"2017-05-07 14:19:35,718 : INFO : collecting all words and their counts\n", + "2017-05-07 14:19:35,721 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2017-05-07 14:19:35,765 : INFO : collected 10186 word types from a corpus of 59890 raw words and 300 sentences\n", + "2017-05-07 14:19:35,766 : INFO : Loading a fresh vocabulary\n", + "2017-05-07 14:19:35,787 : INFO : min_count=10 retains 806 unique words (7% of original 10186, drops 9380)\n", + "2017-05-07 14:19:35,789 : INFO : min_count=10 leaves 40964 word corpus (68% of original 59890, drops 18926)\n", + "2017-05-07 14:19:35,795 : INFO : deleting the raw counts dictionary of 10186 items\n", + "2017-05-07 14:19:35,799 : INFO : sample=0.001 downsamples 54 most-common words\n", + "2017-05-07 14:19:35,802 : INFO : downsampling leaves estimated 26224 word corpus (64.0% of prior 40964)\n", + "2017-05-07 14:19:35,804 : INFO : estimated required memory for 806 words and 100 dimensions: 1047800 bytes\n", + "2017-05-07 14:19:35,812 : INFO : resetting layer weights\n", + "2017-05-07 14:19:35,834 : INFO : training model with 3 workers on 806 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2017-05-07 14:19:36,106 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2017-05-07 14:19:36,110 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2017-05-07 14:19:36,112 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2017-05-07 14:19:36,113 : INFO : training on 299450 raw words (131202 effective words) took 0.3s, 478707 effective words/s\n" + ] + } + ], "source": [ "# default value of min_count=5\n", "model = gensim.models.Word2Vec(sentences, min_count=10)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`size` is the number of dimensions (N) of the N-dimensional space that gensim Word2Vec maps the words onto.\n", + "\n", + "Bigger size values require more training data, 
but can lead to better (more accurate) models. Reasonable values are in the tens to hundreds." + ] + }, { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-05-07 14:19:36,122 : INFO : collecting all words and their counts\n", + "2017-05-07 14:19:36,125 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2017-05-07 14:19:36,159 : INFO : collected 10186 word types from a corpus of 59890 raw words and 300 sentences\n", + "2017-05-07 14:19:36,161 : INFO : Loading a fresh vocabulary\n", + "2017-05-07 14:19:36,173 : INFO : min_count=5 retains 1723 unique words (16% of original 10186, drops 8463)\n", + "2017-05-07 14:19:36,175 : INFO : min_count=5 leaves 46858 word corpus (78% of original 59890, drops 13032)\n", + "2017-05-07 14:19:36,186 : INFO : deleting the raw counts dictionary of 10186 items\n", + "2017-05-07 14:19:36,188 : INFO : sample=0.001 downsamples 49 most-common words\n", + "2017-05-07 14:19:36,190 : INFO : downsampling leaves estimated 32849 word corpus (70.1% of prior 46858)\n", + "2017-05-07 14:19:36,193 : INFO : estimated required memory for 1723 words and 200 dimensions: 3618300 bytes\n", + "2017-05-07 14:19:36,207 : INFO : resetting layer weights\n", + "2017-05-07 14:19:36,246 : INFO : training model with 3 workers on 1723 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2017-05-07 14:19:36,485 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2017-05-07 14:19:36,486 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2017-05-07 14:19:36,490 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2017-05-07 14:19:36,491 : INFO : training on 299450 raw words (164316 effective words) took 0.2s, 686188 effective words/s\n" + ] + } + ], 
"source": [ "# default value of size=100\n", "model = gensim.models.Word2Vec(sentences, size=200)" @@ -366,30 +405,38 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ - "Bigger size values require more training data, but can lead to better (more accurate) models. Reasonable values are in the tens to hundreds.\n", - "\n", "The last of the major parameters (full list [here](http://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec)) is for training parallelization, to speed up training:" ] }, { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "WARNING:gensim.models.word2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" + "2017-05-07 14:19:36,501 : INFO : collecting all words and their counts\n", + "2017-05-07 14:19:36,503 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2017-05-07 14:19:36,542 : INFO : collected 10186 word types from a corpus of 59890 raw words and 300 sentences\n", + "2017-05-07 14:19:36,545 : INFO : Loading a fresh vocabulary\n", + "2017-05-07 14:19:36,561 : INFO : min_count=5 retains 1723 unique words (16% of original 10186, drops 8463)\n", + "2017-05-07 14:19:36,564 : INFO : min_count=5 leaves 46858 word corpus (78% of original 59890, drops 13032)\n", + "2017-05-07 14:19:36,574 : INFO : deleting the raw counts dictionary of 10186 items\n", + "2017-05-07 14:19:36,580 : INFO : sample=0.001 downsamples 49 most-common words\n", + "2017-05-07 14:19:36,582 : INFO : downsampling leaves estimated 32849 word corpus (70.1% of prior 46858)\n", + "2017-05-07 14:19:36,583 : INFO : estimated required memory for 1723 words and 100 dimensions: 2239900 bytes\n", + "2017-05-07 14:19:36,598 : INFO : resetting layer 
weights\n", + "2017-05-07 14:19:36,631 : INFO : training model with 4 workers on 1723 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2017-05-07 14:19:36,792 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2017-05-07 14:19:36,794 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2017-05-07 14:19:36,795 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2017-05-07 14:19:36,801 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2017-05-07 14:19:36,802 : INFO : training on 299450 raw words (164316 effective words) took 0.2s, 979062 effective words/s\n", + "2017-05-07 14:19:36,805 : WARNING : under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" ] } ], @@ -400,20 +447,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The `workers` parameter only has an effect if you have [Cython](http://cython.org/) installed. Without Cython, you’ll only be able to use one core because of the [GIL](https://wiki.python.org/moin/GlobalInterpreterLock) (and `word2vec` training will be [miserably slow](http://rare-technologies.com/word2vec-in-python-part-two-optimizing/))." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Memory\n", "At its core, `word2vec` model parameters are stored as matrices (NumPy arrays). Each array is **#vocabulary** (controlled by min_count parameter) times **#size** (size parameter) of floats (single precision aka 4 bytes).\n", @@ -425,10 +466,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Evaluating\n", "`Word2Vec` training is an unsupervised task, there’s no good way to objectively evaluate the result. 
Evaluation depends on your end application.\n", @@ -442,23 +480,31 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ - "Gensim support the same evaluation set, in exactly the same format:" + "Gensim supports the same evaluation set, in exactly the same format:" ] }, { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-05-07 14:19:36,892 : INFO : precomputing L2-norms of word weight vectors\n", + "2017-05-07 14:19:36,896 : INFO : family: 0.0% (0/2)\n", + "2017-05-07 14:19:36,924 : INFO : gram3-comparative: 0.0% (0/12)\n", + "2017-05-07 14:19:36,935 : INFO : gram4-superlative: 0.0% (0/12)\n", + "2017-05-07 14:19:36,949 : INFO : gram5-present-participle: 5.0% (1/20)\n", + "2017-05-07 14:19:36,967 : INFO : gram6-nationality-adjective: 0.0% (0/20)\n", + "2017-05-07 14:19:36,983 : INFO : gram7-past-tense: 0.0% (0/20)\n", + "2017-05-07 14:19:36,998 : INFO : gram8-plural: 0.0% (0/12)\n", + "2017-05-07 14:19:37,006 : INFO : total: 1.0% (1/98)\n" + ] + }, { "data": { "text/plain": [ @@ -500,14 +546,13 @@ " (u'LARGE', u'LARGEST', u'GOOD', u'BEST'),\n", " (u'LARGE', u'LARGEST', u'GREAT', u'GREATEST')],\n", " 'section': u'gram4-superlative'},\n", - " {'correct': [],\n", + " {'correct': [(u'LOOK', u'LOOKING', u'SAY', u'SAYING')],\n", " 'incorrect': [(u'GO', u'GOING', u'LOOK', u'LOOKING'),\n", " (u'GO', u'GOING', u'PLAY', u'PLAYING'),\n", " (u'GO', u'GOING', u'RUN', u'RUNNING'),\n", " (u'GO', u'GOING', u'SAY', u'SAYING'),\n", " (u'LOOK', u'LOOKING', u'PLAY', u'PLAYING'),\n", " (u'LOOK', u'LOOKING', u'RUN', u'RUNNING'),\n", - " (u'LOOK', u'LOOKING', u'SAY', u'SAYING'),\n", " (u'LOOK', u'LOOKING', u'GO', u'GOING'),\n", " (u'PLAY', u'PLAYING', u'RUN', u'RUNNING'),\n", " (u'PLAY', u'PLAYING', u'SAY', u'SAYING'),\n", @@ -581,7 +626,7 @@ 
" (u'MAN', u'MEN', u'CHILD', u'CHILDREN')],\n", " 'section': u'gram8-plural'},\n", " {'correct': [], 'incorrect': [], 'section': u'gram9-plural-verbs'},\n", - " {'correct': [],\n", + " {'correct': [(u'LOOK', u'LOOKING', u'SAY', u'SAYING')],\n", " 'incorrect': [(u'HE', u'SHE', u'HIS', u'HER'),\n", " (u'HIS', u'HER', u'HE', u'SHE'),\n", " (u'GOOD', u'BETTER', u'GREAT', u'GREATER'),\n", @@ -614,7 +659,6 @@ " (u'GO', u'GOING', u'SAY', u'SAYING'),\n", " (u'LOOK', u'LOOKING', u'PLAY', u'PLAYING'),\n", " (u'LOOK', u'LOOKING', u'RUN', u'RUNNING'),\n", - " (u'LOOK', u'LOOKING', u'SAY', u'SAYING'),\n", " (u'LOOK', u'LOOKING', u'GO', u'GOING'),\n", " (u'PLAY', u'PLAYING', u'RUN', u'RUNNING'),\n", " (u'PLAY', u'PLAYING', u'SAY', u'SAYING'),\n", @@ -694,10 +738,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "This `accuracy` takes an \n", "[optional parameter](http://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec.accuracy) `restrict_vocab` \n", @@ -707,30 +748,32 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "In the December 2016 release of Gensim we added a better way to evaluate semantic similarity.\n", "\n", - "By default it uses an academic dataset WS-353 but one can create a dataset specific to your business based on it. It contain word pairs together with human-assigned similarity judgments. It measures the relatedness or co-occurrence of two words. For example, coast and shore are very similar as they appear in the same context. At the same time clothes and closet are less similar because they are related but not interchangeable." + "By default it uses the academic dataset WS-353, but you can create a dataset specific to your business based on it. It contains word pairs together with human-assigned similarity judgments. It measures the relatedness or co-occurrence of two words. 
For example, 'coast' and 'shore' are very similar as they appear in the same context. At the same time 'clothes' and 'closet' are less similar because they are related but not interchangeable." ] }, { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-05-07 14:19:37,057 : INFO : Pearson correlation coefficient against /usr/local/lib/python2.7/site-packages/gensim/test/test_data/wordsim353.tsv: 0.0819\n", + "2017-05-07 14:19:37,058 : INFO : Spearman rank-order correlation coefficient against /usr/local/lib/python2.7/site-packages/gensim/test/test_data/wordsim353.tsv: 0.0825\n", + "2017-05-07 14:19:37,060 : INFO : Pairs with unknown words ratio: 85.6%\n" + ] + }, { "data": { "text/plain": [ - "((0.064272459590938968, 0.65409410348547958),\n", - " (0.041316891146214431, 0.77344654164156579),\n", + "((0.081883159986411394, 0.5678461885290379),\n", + " SpearmanrResult(correlation=0.082498020328092989, pvalue=0.56493264964360379),\n", " 85.55240793201133)" ] }, @@ -745,20 +788,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Once again, **good performance on Google's or WS-353 test set doesn’t mean word2vec will work well in your application, or vice versa**. It’s always best to evaluate directly on your intended task. For an example of how to use word2vec in a classifier pipeline, see this [tutorial](https://github.com/RaRe-Technologies/movie-plots-by-genre)." 
] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Storing and loading models\n", "You can store/load models using the standard gensim methods:" @@ -767,12 +804,19 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-05-07 14:19:37,075 : INFO : saving Word2Vec object under /var/folders/4t/xx08nfg15lj77zlfjz69314r0000gn/T/tmpZEHE9Wgensim_temp, separately None\n", + "2017-05-07 14:19:37,078 : INFO : not storing attribute syn0norm\n", + "2017-05-07 14:19:37,079 : INFO : not storing attribute cum_table\n", + "2017-05-07 14:19:37,101 : INFO : saved /var/folders/4t/xx08nfg15lj77zlfjz69314r0000gn/T/tmpZEHE9Wgensim_temp\n" + ] + } + ], "source": [ "from tempfile import mkstemp\n", "\n", @@ -784,38 +828,41 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-05-07 14:19:37,107 : INFO : loading Word2Vec object from /var/folders/4t/xx08nfg15lj77zlfjz69314r0000gn/T/tmpZEHE9Wgensim_temp\n", + "2017-05-07 14:19:37,118 : INFO : loading wv recursively from /var/folders/4t/xx08nfg15lj77zlfjz69314r0000gn/T/tmpZEHE9Wgensim_temp.wv.* with mmap=None\n", + "2017-05-07 14:19:37,119 : INFO : setting ignored attribute syn0norm to None\n", + "2017-05-07 14:19:37,120 : INFO : setting ignored attribute cum_table to None\n", + "2017-05-07 14:19:37,121 : INFO : loaded /var/folders/4t/xx08nfg15lj77zlfjz69314r0000gn/T/tmpZEHE9Wgensim_temp\n" + ] + } + ], "source": [ "new_model = gensim.models.Word2Vec.load(temp_path) # open the model" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + 
"metadata": {}, "source": [ "which uses pickle internally, optionally `mmap`‘ing the model’s internal large NumPy matrices into virtual memory directly from disk files, for inter-process memory sharing.\n", "\n", "In addition, you can load models created by the original C tool, both using its text and binary formats:\n", - "\n", - " model = gensim.models.KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False)\n", - " # using gzipped/bz2 input works too, no need to unzip:\n", - " model = gensim.models.KeyedVectors.load_word2vec_format('/tmp/vectors.bin.gz', binary=True)" + "```\n", + " model = gensim.models.KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False)\n", + " # using gzipped/bz2 input works too, no need to unzip:\n", + " model = gensim.models.KeyedVectors.load_word2vec_format('/tmp/vectors.bin.gz', binary=True)\n", + "```" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Online training / Resuming training\n", "Advanced users can load a model and continue training it with more sentences and [new vocabulary words](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/online_w2v_tutorial.ipynb):" @@ -824,17 +871,35 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "WARNING:gensim.models.word2vec:under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" + "2017-05-07 14:19:37,137 : INFO : loading Word2Vec object from /var/folders/4t/xx08nfg15lj77zlfjz69314r0000gn/T/tmpZEHE9Wgensim_temp\n", + "2017-05-07 14:19:37,146 : INFO : loading wv recursively from /var/folders/4t/xx08nfg15lj77zlfjz69314r0000gn/T/tmpZEHE9Wgensim_temp.wv.* with mmap=None\n", + "2017-05-07 14:19:37,147 : INFO : setting ignored attribute syn0norm to None\n", + 
"2017-05-07 14:19:37,149 : INFO : setting ignored attribute cum_table to None\n", + "2017-05-07 14:19:37,150 : INFO : loaded /var/folders/4t/xx08nfg15lj77zlfjz69314r0000gn/T/tmpZEHE9Wgensim_temp\n", + "2017-05-07 14:19:37,155 : INFO : collecting all words and their counts\n", + "2017-05-07 14:19:37,156 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2017-05-07 14:19:37,157 : INFO : collected 13 word types from a corpus of 13 raw words and 1 sentences\n", + "2017-05-07 14:19:37,158 : INFO : Updating model with new vocabulary\n", + "2017-05-07 14:19:37,159 : INFO : New added 0 unique words (0% of original 13)\n", + " and increased the count of 0 pre-existing words (0% of original 13)\n", + "2017-05-07 14:19:37,161 : INFO : deleting the raw counts dictionary of 13 items\n", + "2017-05-07 14:19:37,162 : INFO : sample=0.001 downsamples 0 most-common words\n", + "2017-05-07 14:19:37,163 : INFO : downsampling leaves estimated 0 word corpus (0.0% of prior 0)\n", + "2017-05-07 14:19:37,164 : INFO : estimated required memory for 1723 words and 100 dimensions: 2239900 bytes\n", + "2017-05-07 14:19:37,170 : INFO : updating layer weights\n", + "2017-05-07 14:19:37,172 : INFO : training model with 4 workers on 1723 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2017-05-07 14:19:37,174 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2017-05-07 14:19:37,176 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2017-05-07 14:19:37,178 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2017-05-07 14:19:37,179 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2017-05-07 14:19:37,180 : INFO : training on 65 raw words (28 effective words) took 0.0s, 4209 effective words/s\n", + "2017-05-07 14:19:37,182 : WARNING : under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" ] } ], @@ 
-852,10 +917,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "You may need to tweak the `total_words` parameter to `train()`, depending on what learning rate decay you want to simulate.\n", "\n", @@ -868,16 +930,19 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-05-07 14:19:37,190 : INFO : precomputing L2-norms of word weight vectors\n" + ] + }, { "data": { "text/plain": [ - "[('ensure', 0.9916089773178101)]" + "[('longer', 0.9884582161903381)]" ] }, "execution_count": 20, @@ -892,17 +957,13 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "WARNING:gensim.models.keyedvectors:vectors for words set(['lunch', 'input', 'cat']) are not present in the model, ignoring these words\n" + "2017-05-07 14:19:37,202 : WARNING : vectors for words set(['lunch', 'input', 'cat']) are not present in the model, ignoring these words\n" ] }, { @@ -923,18 +984,14 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.999128693496\n", - "0.995598721362\n" + "0.999186470298\n", + "0.995724529077\n" ] } ], @@ -945,10 +1002,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "You can get the probability distribution for the center word given the context words as input:" ] @@ -956,17 +1010,13 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[('more', 0.0010214881), ('training', 0.0009804588), ('continue', 0.00094650878), ('can', 0.00092195231), ('it', 0.00089841458), ('australia', 0.00077773805), ('government', 0.00076788972), ('us', 0.00076459395), ('there', 0.00075191096), ('killed', 0.00074792351)]\n" + "[('more', 0.001048518), ('continue', 0.00090946292), ('can', 0.00090134487), ('training', 0.00088478095), ('it', 0.00077986595), ('australia', 0.0007500046), ('there', 0.00074296352), ('government', 0.00074113585), ('could', 0.00073843176), ('or', 0.00073749834)]\n" ] } ], @@ -976,20 +1026,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The results here don't look good because the training corpus is very small. To get meaningful results one needs to train on 500k+ words." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "If you need the raw output vectors in your application, you can access these either on a word-by-word basis:" ] @@ -997,35 +1041,31 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([ 0.00506193, 0.0226855 , -0.02943243, -0.00850953, -0.03299763,\n", - " -0.03874256, 0.00795013, -0.09169962, -0.01347002, -0.02357206,\n", - " 0.02472948, -0.02463134, -0.06745216, -0.02074538, -0.02165207,\n", - " 0.04777974, -0.02944389, -0.00209709, 0.0225853 , -0.02756712,\n", - " -0.06757693, -0.0062337 , 0.06952298, 0.0505537 , 0.02458209,\n", - " 0.0140616 , -0.00495757, 0.0187903 , -0.0156572 , 0.00059901,\n", - " 0.00026355, 0.07304576, 0.00949389, -0.00331612, 0.02460947,\n", - " 0.02132211, -0.04548595, 0.01761133, 0.01257058, -0.06949953,\n", - " -0.07925285, 0.00565318, -0.04476747, -0.02920126, 
0.03141577,\n", - " -0.05677001, 0.0391206 , 0.0042906 , -0.01415944, 0.04051396,\n", - " 0.01597693, 0.00671787, -0.03740353, 0.00665488, 0.01475888,\n", - " -0.01941732, 0.05768431, -0.02920702, 0.02015296, -0.03559965,\n", - " -0.02955742, -0.04996177, 0.01774862, -0.031699 , -0.01097541,\n", - " -0.06637666, -0.07993821, 0.03876927, 0.05615626, -0.00116237,\n", - " -0.01270938, 0.00813914, -0.05149486, 0.01389496, -0.04919665,\n", - " -0.05647518, 0.03727042, -0.00600072, 0.04672569, 0.04398456,\n", - " -0.02320013, 0.03545921, -0.01651819, 0.00087945, 0.0174842 ,\n", - " 0.00950102, -0.09364804, -0.08258698, 0.06699577, -0.03158378,\n", - " -0.06168535, -0.04525115, -0.04849502, -0.00481538, -0.02783764,\n", - " -0.02939486, -0.02511807, 0.0215294 , -0.05088007, -0.00214653], dtype=float32)" + "array([ 0.00349002, 0.02440139, -0.02936695, -0.00849617, -0.03318483,\n", + " -0.0382478 , 0.00597728, -0.09292595, -0.01093712, -0.02097394,\n", + " 0.02088499, -0.0280605 , -0.07108893, -0.02044513, -0.02337479,\n", + " 0.04878484, -0.03198365, -0.00347298, 0.02429976, -0.02761379,\n", + " -0.06878174, -0.00695439, 0.06986855, 0.05134906, 0.03044886,\n", + " 0.01195826, -0.00513146, 0.02122262, -0.01519287, 0.00502698,\n", + " 0.00088907, 0.07702309, 0.01296635, -0.00185401, 0.02448723,\n", + " 0.02151101, -0.04088883, 0.01947908, 0.01428026, -0.07242644,\n", + " -0.08013999, 0.00214788, -0.04682875, -0.02618166, 0.03343621,\n", + " -0.05884593, 0.03833489, 0.00581573, -0.01099163, 0.04513358,\n", + " 0.01407813, 0.00823141, -0.03918071, 0.0107606 , 0.01743653,\n", + " -0.01885621, 0.06017725, -0.03312737, 0.02473382, -0.03686444,\n", + " -0.03306546, -0.05434534, 0.01816491, -0.0386038 , -0.01055549,\n", + " -0.06602577, -0.08695736, 0.04147927, 0.05510609, -0.00292372,\n", + " -0.00839636, 0.00660775, -0.04910387, 0.01182455, -0.05183903,\n", + " -0.05662465, 0.03827399, -0.01096484, 0.05027501, 0.04410599,\n", + " -0.02027577, 0.03782682, -0.01756338, 0.00167882, 
0.01706443,\n", + " 0.00842514, -0.09443056, -0.0869148 , 0.06825797, -0.02385623,\n", + " -0.06005816, -0.04784475, -0.05084028, -0.00288582, -0.02646183,\n", + " -0.0288031 , -0.0257737 , 0.02252337, -0.05444728, 0.00016777], dtype=float32)" ] }, "execution_count": 24, @@ -1039,10 +1079,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "…or en-masse as a 2D NumPy matrix from `model.wv.syn0`.\n", "\n", @@ -1060,9 +1097,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [] @@ -1085,9 +1120,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.6" + "version": "2.7.13" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 }