
Commit 690251e (merge, 2 parents: 2eef0b6 + a49aa9b)
…into fix1294
manneshiva committed May 23, 2017
Showing 35 changed files with 1,956 additions and 450 deletions.
96 changes: 57 additions & 39 deletions docs/notebooks/Corpora_and_Vector_Spaces.ipynb
@@ -17,7 +17,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"collapsed": true
},
@@ -27,6 +27,20 @@
"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import tempfile\n",
+ "TEMP_FOLDER = tempfile.gettempdir()\n",
+ "print('Folder \"{}\" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER))"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -40,7 +54,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"collapsed": false
},
@@ -145,13 +159,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary(12 unique tokens: ['response', 'survey', 'computer', 'user', 'minors']...)\n"
"Dictionary(12 unique tokens: ['human', 'interface', 'computer', 'survey', 'user']...)\n"
]
}
],
"source": [
"dictionary = corpora.Dictionary(texts)\n",
"dictionary.save('/tmp/deerwester.dict') # store the dictionary, for future reference\n",
"dictionary.save(os.path.join(TEMP_FOLDER, 'deerwester.dict')) # store the dictionary, for future reference\n",
"print(dictionary)"
]
},
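For reference, a dictionary saved this way can be loaded back in a later session. A minimal sketch, re-deriving TEMP_FOLDER the same way as the cell added above:

import os
import tempfile
from gensim import corpora

TEMP_FOLDER = tempfile.gettempdir()
# load the mapping persisted by dictionary.save() above
dictionary = corpora.Dictionary.load(os.path.join(TEMP_FOLDER, 'deerwester.dict'))
print(dictionary.token2id)  # token -> integer id mapping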
@@ -173,7 +187,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'response': 3, 'survey': 4, 'computer': 2, 'user': 5, 'minors': 11, 'time': 6, 'system': 7, 'graph': 10, 'interface': 1, 'human': 0, 'eps': 8, 'trees': 9}\n"
"{'human': 0, 'interface': 1, 'computer': 2, 'survey': 3, 'user': 4, 'system': 5, 'response': 6, 'time': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}\n"
]
}
],
@@ -213,7 +227,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The function `doc2bow()` simply counts the number of occurrences of each distinct word, converts the word to its integer word id and returns the result as a sparse vector. The sparse vector `[(word_id, 1), (word_id, 1)]` therefore reads: in the document *“Human computer interaction”*, the words *\"computer\"* and *\"human\"*, identified by an integer id given by the built dictionary, appear once; the other ten dictionary words appear (implicitly) zero times. Check their id at the dictionary displayed in the previous cell and see that they match."
"The function `doc2bow()` simply counts the number of occurrences of each distinct word, converts the word to its integer word id and returns the result as a sparse vector, in the form of `[(word_id, word_count), ...]`. \n",
"\n",
"As the token_id is 0 for *\"human\"* and 2 for *\"computer\"*, the new document *“Human computer interaction”* will be transformed to [(0, 1), (2, 1)]. The words *\"computer\"* and *\"human\"* exist in the dictionary and appear once. Thus, they become (0, 1), (2, 1) respectively in the sparse vector. The word *\"interaction\"* doesn't exist in the dictionary and, thus, will not show up in the sparse vector. The other ten dictionary words, that appear (implicitly) zero times, will not show up in the sparse vector and , ,there will never be a element in the sparse vector like (3, 0).\n",
"\n",
"For people familiar with scikit learn, `doc2bow()` has similar behaviors as calling `transform()` on [`CountVectorizer`](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html). `doc2bow()` can behave like `fit_transform()` as well. For more details, please look at [gensim API Doc](https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2bow)."
]
},
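As a concrete sketch of the behaviour described above (assuming the tokenized nine-document `texts` list built earlier in this notebook):

from gensim import corpora

dictionary = corpora.Dictionary(texts)  # same ids as printed in the previous cell
new_vec = dictionary.doc2bow("Human computer interaction".lower().split())
print(new_vec)  # [(0, 1), (2, 1)]; "interaction" is unseen, so it is simply omitted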
{
@@ -229,19 +247,19 @@
"text": [
"[(0, 1), (1, 1), (2, 1)]\n",
"[(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]\n",
"[(1, 1), (5, 1), (7, 1), (8, 1)]\n",
"[(0, 1), (7, 2), (8, 1)]\n",
"[(3, 1), (5, 1), (6, 1)]\n",
"[(1, 1), (4, 1), (5, 1), (8, 1)]\n",
"[(0, 1), (5, 2), (8, 1)]\n",
"[(4, 1), (6, 1), (7, 1)]\n",
"[(9, 1)]\n",
"[(9, 1), (10, 1)]\n",
"[(9, 1), (10, 1), (11, 1)]\n",
"[(4, 1), (10, 1), (11, 1)]\n"
"[(3, 1), (10, 1), (11, 1)]\n"
]
}
],
"source": [
"corpus = [dictionary.doc2bow(text) for text in texts]\n",
"corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus) # store to disk, for later use\n",
"corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'deerwester.mm'), corpus) # store to disk, for later use\n",
"for c in corpus:\n",
" print(c)"
]
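Since `MmCorpus.serialize()` also writes an index file alongside the .mm file, the stored documents can later be fetched by position as well as streamed. A short sketch, under the same TEMP_FOLDER assumption:

import os
import tempfile
from gensim import corpora

TEMP_FOLDER = tempfile.gettempdir()
corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, 'deerwester.mm'))
print(corpus[2])  # random access to the third document's sparse vector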
@@ -290,7 +308,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"<__main__.MyCorpus object at 0x7f4ad14856a0>\n"
"<__main__.MyCorpus object at 0x000002520A52E0B8>\n"
]
}
],
@@ -308,7 +326,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {
"collapsed": false
},
@@ -319,13 +337,13 @@
"text": [
"[(0, 1), (1, 1), (2, 1)]\n",
"[(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]\n",
"[(1, 1), (5, 1), (7, 1), (8, 1)]\n",
"[(0, 1), (7, 2), (8, 1)]\n",
"[(3, 1), (5, 1), (6, 1)]\n",
"[(1, 1), (4, 1), (5, 1), (8, 1)]\n",
"[(0, 1), (5, 2), (8, 1)]\n",
"[(4, 1), (6, 1), (7, 1)]\n",
"[(9, 1)]\n",
"[(9, 1), (10, 1)]\n",
"[(9, 1), (10, 1), (11, 1)]\n",
"[(4, 1), (10, 1), (11, 1)]\n"
"[(3, 1), (10, 1), (11, 1)]\n"
]
}
],
@@ -340,12 +358,12 @@
"source": [
"Although the output is the same as for the plain Python list, the corpus is now much more memory friendly, because at most one vector resides in RAM at a time. Your corpus can now be as large as you want.\n",
"\n",
"Similarly, to construct the dictionary without loading all texts into memory:"
"We are going to create the dictionary from the mycorpus.txt file without loading the entire file into memory. Then, we will generate the list of token ids to remove from this dictionary by querying the dictionary for the token ids of the stop words, and by querying the document frequencies dictionary (dictionary.dfs) for token ids that only appear once. Finally, we will filter these token ids out of our dictionary and call dictionary.compactify() to remove the gaps in the token id series."
]
},
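A sketch of the construction just described, assuming mycorpus.txt sits in the working directory and using a small illustrative stop word list:

from gensim import corpora

stoplist = set('for a of the and to in'.split())  # illustrative stop words
# collect statistics in one streamed pass; the full file is never held in memory
dictionary = corpora.Dictionary(line.lower().split() for line in open('mycorpus.txt'))
stop_ids = [dictionary.token2id[w] for w in stoplist if w in dictionary.token2id]
once_ids = [tid for tid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)  # drop stop words and words seen only once
dictionary.compactify()  # re-number ids to close the gaps left by filtering
print(dictionary)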
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {
"collapsed": false
},
@@ -354,7 +372,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary(12 unique tokens: ['response', 'computer', 'survey', 'user', 'minors']...)\n"
"Dictionary(12 unique tokens: ['human', 'interface', 'computer', 'survey', 'user']...)\n"
]
}
],
@@ -392,36 +410,36 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"# create a toy corpus of 2 documents, as a plain Python list\n",
"corpus = [[(1, 0.5)], []] # make one document empty, for the heck of it\n",
"\n",
"corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)"
"corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.mm'), corpus)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Other formats include [Joachim’s SVMlight format](http://svmlight.joachims.org/), [Blei’s LDA-C format](http://www.cs.princeton.edu/~blei/lda-c/) and [GibbsLDA++ format](http://gibbslda.sourceforge.net/)."
"Other formats include [Joachim’s SVMlight format](http://svmlight.joachims.org/), [Blei’s LDA-C format](http://www.cs.columbia.edu/~blei/lda-c/) and [GibbsLDA++ format](http://gibbslda.sourceforge.net/)."
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)\n",
"corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)\n",
"corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)"
"corpora.SvmLightCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.svmlight'), corpus)\n",
"corpora.BleiCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.lda-c'), corpus)\n",
"corpora.LowCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.low'), corpus)"
]
},
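Each serialized format has a matching reader class, so the files written above can be streamed back the same way. A sketch; the round-trip output shown is an assumption:

import os
import tempfile
from gensim import corpora

TEMP_FOLDER = tempfile.gettempdir()
svm_corpus = corpora.SvmLightCorpus(os.path.join(TEMP_FOLDER, 'corpus.svmlight'))
print(list(svm_corpus))  # expected: [[(1, 0.5)], []]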
{
@@ -433,13 +451,13 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"corpus = corpora.MmCorpus('/tmp/corpus.mm')"
"corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, 'corpus.mm'))"
]
},
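Corpus objects are streams: printing the object shows only a summary, while iterating yields one sparse document vector at a time. A quick sketch:

print(corpus)       # e.g. MmCorpus(2 documents, 2 features, 1 non-zero entries)
for doc in corpus:
    print(doc)      # [(1, 0.5)], then [] for the empty document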
{
@@ -451,7 +469,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 16,
"metadata": {
"collapsed": false
},
@@ -477,7 +495,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 17,
"metadata": {
"collapsed": false
},
@@ -504,7 +522,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 18,
"metadata": {
"collapsed": false
},
@@ -535,13 +553,13 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 19,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)"
"corpora.BleiCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.lda-c'), corpus)"
]
},
{
@@ -557,7 +575,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 20,
"metadata": {
"collapsed": false
},
@@ -579,7 +597,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 21,
"metadata": {
"collapsed": false
},
