From 2dcaaf80f4fb8023acc2f118b0966d92fca9500e Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Sat, 3 Oct 2020 23:47:21 +0900
Subject: [PATCH] remove gensim.summarization subpackage, docs and test data
 (#2958)

* remove gensim.summarization subpackage, docs and test data

* Update changelog

* remove old import

* Remove distance metrics and pivoted normalization tutorials
---
 CHANGELOG.md                                  |    4 +
 docs/src/apiref.rst                           |    9 -
 docs/src/auto_examples/index.rst              |  152 +-
 .../tutorials/run_distance_metrics.ipynb      |  269 --
 .../tutorials/run_distance_metrics.py         |  337 ---
 .../tutorials/run_distance_metrics.py.md5     |    1 -
 .../tutorials/run_distance_metrics.rst        |  577 ----
 .../tutorials/run_pivoted_doc_norm.ipynb      |  194 --
 .../tutorials/run_pivoted_doc_norm.py         |  243 --
 .../tutorials/run_pivoted_doc_norm.py.md5     |    1 -
 .../tutorials/run_pivoted_doc_norm.rst        |  439 ---
 .../tutorials/run_summarization.ipynb         |  331 ---
 .../tutorials/run_summarization.py            |  243 --
 .../tutorials/run_summarization.py.md5        |    1 -
 .../tutorials/run_summarization.rst           | 2359 -----------------
 .../tutorials/sg_execution_times.rst          |   32 +-
 docs/src/conf.py                              |    2 -
 docs/src/gallery/other/README.txt             |    1 -
 .../gallery/tutorials/run_distance_metrics.py |  337 ---
 .../gallery/tutorials/run_pivoted_doc_norm.py |  243 --
 .../gallery/tutorials/run_summarization.py    |  243 --
 docs/src/summarization/bm25.rst               |    9 -
 docs/src/summarization/commons.rst            |    8 -
 docs/src/summarization/graph.rst              |    8 -
 docs/src/summarization/keywords.rst           |    9 -
 docs/src/summarization/mz_entropy.rst         |    9 -
 docs/src/summarization/pagerank_weighted.rst  |    9 -
 docs/src/summarization/summariser.rst         |    9 -
 docs/src/summarization/syntactic_unit.rst     |    9 -
 docs/src/summarization/textcleaner.rst        |   10 -
 gensim/__init__.py                            |    2 +-
 gensim/summarization/__init__.py              |    5 -
 gensim/summarization/bm25.py                  |  375 ---
 gensim/summarization/commons.py               |   68 -
 gensim/summarization/graph.py                 |  401 ---
 gensim/summarization/keywords.py              |  547 ----
 gensim/summarization/mz_entropy.py            |  151 --
 gensim/summarization/pagerank_weighted.py     |  190 --
 gensim/summarization/summarizer.py            |  443 ----
 gensim/summarization/syntactic_unit.py        |   53 -
 gensim/summarization/textcleaner.py           |  316 ---
 gensim/test/test_BM25.py                      |  156 --
 gensim/test/test_data/mihalcea_tarau.kw.txt   |   21 -
 .../test/test_data/mihalcea_tarau.kwpos.txt   |   30 -
 gensim/test/test_data/mihalcea_tarau.summ.txt |    4 -
 gensim/test/test_data/mihalcea_tarau.txt      |   24 -
 .../test/test_data/testlowdistinctwords.txt   |   10 -
 .../test/test_data/testrepeatedkeywords.txt   |    1 -
 .../test_data/testsummarization_unrelated.txt |   20 -
 gensim/test/test_keywords.py                  |  113 -
 gensim/test/test_summarization.py             |  309 ---
 51 files changed, 55 insertions(+), 9282 deletions(-)
 delete mode 100644 docs/src/auto_examples/tutorials/run_distance_metrics.ipynb
 delete mode 100644 docs/src/auto_examples/tutorials/run_distance_metrics.py
 delete mode 100644 docs/src/auto_examples/tutorials/run_distance_metrics.py.md5
 delete mode 100644 docs/src/auto_examples/tutorials/run_distance_metrics.rst
 delete mode 100644 docs/src/auto_examples/tutorials/run_pivoted_doc_norm.ipynb
 delete mode 100644 docs/src/auto_examples/tutorials/run_pivoted_doc_norm.py
 delete mode 100644 docs/src/auto_examples/tutorials/run_pivoted_doc_norm.py.md5
 delete mode 100644 docs/src/auto_examples/tutorials/run_pivoted_doc_norm.rst
 delete mode 100644 docs/src/auto_examples/tutorials/run_summarization.ipynb
 delete mode 100644 docs/src/auto_examples/tutorials/run_summarization.py
 delete mode 100644 docs/src/auto_examples/tutorials/run_summarization.py.md5
 delete mode 100644 docs/src/auto_examples/tutorials/run_summarization.rst
 delete mode 100644 docs/src/gallery/tutorials/run_distance_metrics.py
 delete mode 100644 docs/src/gallery/tutorials/run_pivoted_doc_norm.py
 delete mode 100644 docs/src/gallery/tutorials/run_summarization.py
 delete mode 100644 docs/src/summarization/bm25.rst
 delete mode 100644 docs/src/summarization/commons.rst
 delete mode 100644 docs/src/summarization/graph.rst
 delete mode 100644 docs/src/summarization/keywords.rst
 delete mode 100644 docs/src/summarization/mz_entropy.rst
 delete mode 100644 docs/src/summarization/pagerank_weighted.rst
 delete mode 100644 docs/src/summarization/summariser.rst
 delete mode 100644 docs/src/summarization/syntactic_unit.rst
 delete mode 100644 docs/src/summarization/textcleaner.rst
 delete mode 100644 gensim/summarization/__init__.py
 delete mode 100644 gensim/summarization/bm25.py
 delete mode 100644 gensim/summarization/commons.py
 delete mode 100644 gensim/summarization/graph.py
 delete mode 100644 gensim/summarization/keywords.py
 delete mode 100644 gensim/summarization/mz_entropy.py
 delete mode 100644 gensim/summarization/pagerank_weighted.py
 delete mode 100644 gensim/summarization/summarizer.py
 delete mode 100644 gensim/summarization/syntactic_unit.py
 delete mode 100644 gensim/summarization/textcleaner.py
 delete mode 100644 gensim/test/test_BM25.py
 delete mode 100644 gensim/test/test_data/mihalcea_tarau.kw.txt
 delete mode 100644 gensim/test/test_data/mihalcea_tarau.kwpos.txt
 delete mode 100644 gensim/test/test_data/mihalcea_tarau.summ.txt
 delete mode 100644 gensim/test/test_data/mihalcea_tarau.txt
 delete mode 100644 gensim/test/test_data/testlowdistinctwords.txt
 delete mode 100644 gensim/test/test_data/testrepeatedkeywords.txt
 delete mode 100644 gensim/test/test_data/testsummarization_unrelated.txt
 delete mode 100644 gensim/test/test_keywords.py
 delete mode 100644 gensim/test/test_summarization.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4fe9c060e1..52bf5a989f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,10 @@ This release contains a major refactoring.
 * Clear up LdaModel documentation - remove claim that it accepts CSC matrix as input (PR [#2832](https://github.com/RaRe-Technologies/gensim/pull/2832), [@FyzHsn](https://github.com/FyzHsn))
 * Fix "generator" language in word2vec docs (PR [#2935](https://github.com/RaRe-Technologies/gensim/pull/2935), __[@polm](https://github.com/polm)__)
 
+### :warning: Removed functionality
+
+ * Remove gensim.summarization subpackage, docs and test data (PR [#2958](https://github.com/RaRe-Technologies/gensim/pull/2958), __[@mpenkov](https://github.com/mpenkov)__)
+
 ## :warning: 3.8.x will be the last gensim version to support Py2.7.
 Starting with 4.0.0, gensim will only support Py3.5 and above
 ## 3.8.3, 2020-05-03
diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst
index 1cc190e677..8f5e8fc61e 100644
--- a/docs/src/apiref.rst
+++ b/docs/src/apiref.rst
@@ -95,12 +95,3 @@ Modules:
    scripts/segment_wiki
    parsing/porter
    parsing/preprocessing
-   summarization/bm25
-   summarization/commons
-   summarization/graph
-   summarization/keywords
-   summarization/mz_entropy
-   summarization/pagerank_weighted
-   summarization/summariser
-   summarization/syntactic_unit
-   summarization/textcleaner
diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst
index bedfe02ea3..cbe06b6fb5 100644
--- a/docs/src/auto_examples/index.rst
+++ b/docs/src/auto_examples/index.rst
@@ -13,7 +13,7 @@ If you're thinking about contributing documentation, please see :ref:`sphx_glr_a
 
 .. raw:: html
 
-
+
@@ -33,10 +33,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png - :alt: Core Concepts + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png - :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` + :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` .. raw:: html @@ -54,10 +53,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png - :alt: Corpora and Vector Spaces + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png - :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` + :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` .. raw:: html @@ -75,10 +73,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png - :alt: Topics and Transformations + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png - :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` + :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` .. raw:: html @@ -96,10 +93,9 @@ Understanding this functionality is vital for using gensim effectively. .. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png - :alt: Similarity Queries + .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png - :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` + :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` .. raw:: html @@ -112,7 +108,7 @@ Understanding this functionality is vital for using gensim effectively. /auto_examples/core/run_similarity_queries .. raw:: html -
+
@@ -131,10 +127,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png - :alt: Word2Vec Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` + :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` .. raw:: html @@ -152,10 +147,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png - :alt: Doc2Vec Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` + :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` .. raw:: html @@ -173,10 +167,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png - :alt: FastText Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` + :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` .. raw:: html @@ -194,10 +187,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png - :alt: Fast Similarity Queries with Annoy and Word2Vec + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` + :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` .. raw:: html @@ -215,10 +207,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png - :alt: LDA Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` .. raw:: html @@ -230,37 +221,15 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod /auto_examples/tutorials/run_lda -.. raw:: html - -
- -.. only:: html - - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_distance_metrics_thumb.png - :alt: Distance Metrics - - :ref:`sphx_glr_auto_examples_tutorials_run_distance_metrics.py` - -.. raw:: html - -
- - -.. toctree:: - :hidden: - - /auto_examples/tutorials/run_distance_metrics - .. raw:: html
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png - :alt: Word Mover's Distance + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png - :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` + :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` .. raw:: html @@ -271,51 +240,9 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod :hidden: /auto_examples/tutorials/run_wmd - -.. raw:: html - -
- -.. only:: html - - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_summarization_thumb.png - :alt: Text Summarization - - :ref:`sphx_glr_auto_examples_tutorials_run_summarization.py` - -.. raw:: html - -
- - -.. toctree:: - :hidden: - - /auto_examples/tutorials/run_summarization - -.. raw:: html - -
- -.. only:: html - - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_pivoted_doc_norm_thumb.png - :alt: Pivoted Document Length Normalization - - :ref:`sphx_glr_auto_examples_tutorials_run_pivoted_doc_norm.py` - -.. raw:: html - -
- - -.. toctree:: - :hidden: - - /auto_examples/tutorials/run_pivoted_doc_norm .. raw:: html -
+
@@ -334,10 +261,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png - :alt: How to download pre-trained models and corpora + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` + :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` .. raw:: html @@ -355,10 +281,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png - :alt: How to Author Gensim Documentation + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_doc.py` + :ref:`sphx_glr_auto_examples_howtos_run_doc.py` .. raw:: html @@ -376,10 +301,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png - :alt: How to reproduce the doc2vec 'Paragraph Vector' paper + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` + :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` .. raw:: html @@ -397,10 +321,9 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png - :alt: How to Compare LDA Models + .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png - :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` + :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` .. raw:: html @@ -413,7 +336,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u /auto_examples/howtos/run_compare_lda .. raw:: html -
+
@@ -440,7 +363,6 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from - ? `Colouring words by topic in a document, print words in a topics `__ - ? `Topic Coherence, a metric that correlates that human judgement on topic quality. `__ - - ? `Compare topics and documents using Jaccard, Kullback-Leibler and Hellinger similarities `__ - ? `America's Next Topic Model slides `__ - How to choose your next topic model, presented at Pydata Berlin 10 August 2016 by Lev Konstantinovsky - ? `Dynamic Topic Modeling and Dynamic Influence Model Tutorial `__ @@ -457,7 +379,7 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from .. raw:: html -
+
@@ -467,15 +389,15 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from :class: sphx-glr-footer-gallery - .. container:: sphx-glr-download sphx-glr-download-python + .. container:: sphx-glr-download - :download:`Download all examples in Python source code: auto_examples_python.zip ` + :download:`Download all examples in Python source code: auto_examples_python.zip ` - .. container:: sphx-glr-download sphx-glr-download-jupyter + .. container:: sphx-glr-download - :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` + :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` .. only:: html diff --git a/docs/src/auto_examples/tutorials/run_distance_metrics.ipynb b/docs/src/auto_examples/tutorials/run_distance_metrics.ipynb deleted file mode 100644 index 994d38cb3f..0000000000 --- a/docs/src/auto_examples/tutorials/run_distance_metrics.ipynb +++ /dev/null @@ -1,269 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\nDistance Metrics\n================\n\nIntroduces the concept of distance between document representations, and demonstrates its calculation using Gensim.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import logging\nlogging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you simply want to calculate the similarity between documents, then you\nmay want to check out the `Similarity Queries Tutorial\n`_ and the `API reference for similarities\n`_. The current\ntutorial shows the building block of these larger methods, which are a small\nsuite of distance metrics.\n\nHere's a brief summary of this tutorial:\n\n1. Set up a small corpus consisting of documents belonging to one of two topics\n2. Train an LDA model to distinguish between the two topics\n3. Use the model to obtain distributions for some sample words\n4. Compare the distributions to each other using a variety of distance metrics:\n\n * Hellinger distance\n * Jaccard coefficient\n\n5. 
Discuss the concept of distance metrics in slightly more detail\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from gensim.corpora import Dictionary\n\n# you can use any corpus, this is just illustratory\ntexts = [\n ['bank', 'river', 'shore', 'water'],\n ['river', 'water', 'flow', 'fast', 'tree'],\n ['bank', 'water', 'fall', 'flow'],\n ['bank', 'bank', 'water', 'rain', 'river'],\n ['river', 'water', 'mud', 'tree'],\n ['money', 'transaction', 'bank', 'finance'],\n ['bank', 'borrow', 'money'],\n ['bank', 'finance'],\n ['finance', 'money', 'sell', 'bank'],\n ['borrow', 'sell'],\n ['bank', 'loan', 'sell'],\n]\n\ndictionary = Dictionary(texts)\ncorpus = [dictionary.doc2bow(text) for text in texts]\n\nimport numpy\nnumpy.random.seed(1) # setting random seed to get the same results each time.\n\nfrom gensim.models import ldamodel\nmodel = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2, minimum_probability=1e-8)\nmodel.show_topics()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's call the 1st topic the **water** topic and the second topic the **finance** topic.\n\nLet's take a few sample documents and get them ready to test our distance functions.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "doc_water = ['river', 'water', 'shore']\ndoc_finance = ['finance', 'money', 'sell']\ndoc_bank = ['finance', 'bank', 'tree', 'water']\n\n# Now let's transform these into a bag of words format.\nbow_water = model.id2word.doc2bow(doc_water)\nbow_finance = model.id2word.doc2bow(doc_finance)\nbow_bank = model.id2word.doc2bow(doc_bank)\n\n# We can now get the LDA topic distributions for these.\nlda_bow_water = model[bow_water]\nlda_bow_finance = model[bow_finance]\nlda_bow_bank = model[bow_bank]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Hellinger\n---------\n\nWe're now ready to apply our distance metrics.\nThese metrics return a value between 0 and 1, where values closer to 0 indicate a\nsmaller distance and therefore a larger similarity.\n\nLet's start with the popular Hellinger distance.\n\nThe Hellinger distance metric is symmetric and gives an output in the range [0,1]\nfor two probability distributions. Values closer to 0 mean \"more similar\".\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from gensim.matutils import hellinger\nprint(hellinger(lda_bow_water, lda_bow_finance))\nprint(hellinger(lda_bow_finance, lda_bow_bank))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Makes sense, right? 
In the first example, Document 1 and Document 2 are hardly similar, so we get a value of roughly 0.5.\n\nIn the second case, the documents are a lot more semantically similar, so their distance is lower.\n\n\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In our previous examples we saw that there were lower distance values between\n``bank`` and ``finance`` than for ``bank`` and ``water``, even if it wasn't by a huge margin.\nWhat does this mean?\n\nThe ``bank`` document is a combination of both water and finance related\nterms - but as bank in this context is likely to belong to the finance topic,\nthe distance values are less between the finance and bank bows.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# just to confirm our suspicion that the bank bow is more to do with finance:\nmodel.get_document_topics(bow_bank)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It's evident that while it isn't too skewed, it it more towards the finance topic.\n\n\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Jaccard coefficient\n-------------------\n\nLet's now look at the `Jaccard Distance\n`_ (also Jaccard index, Jaccard coefficient)\nfor calculating the similarity between two documents represented as two bags-of-words vectors.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from gensim.matutils import jaccard\n\nprint(jaccard(bow_water, bow_bank))\nprint(jaccard(doc_water, doc_bank))\nprint(jaccard(['word'], ['word']))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The three examples above feature 2 different input methods.\n\nIn the first case, we present document vectors already in bag of\nwords format. 
The distance can be defined as 1 minus the size of the\nintersection upon the size of the union of the vectors.\n\nWe can see (on manual inspection as well), that the distance is likely to be\nhigh - and it is.\n\nThe last two examples illustrate the ability for Jaccard distance to accept even lists\nof words (i.e, documents) as inputs.\n\nIn the last case, because they are the same vectors, so the value returned is 0\n- this means the distance is 0 and the two documents are identical.\n\n\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Distance Metrics for Topic Distributions\n----------------------------------------\n\nWhile there are already standard methods to identify similarity of documents,\nour distance metrics has one more interesting use-case: topic distributions.\n\nLet's say we want to find out how similar our two topics are, ``water`` and ``finance``.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "topic_water, topic_finance = model.show_topics()\n\n# Preprocess to get the topics in a format accepted by our distance metric functions.\n\ndef parse_topic_string(topic):\n \"\"\"Split a string returned by model.show_topics() into topics and their probabilities.\"\"\"\n topic = topic.split('+')\n topic_bow = []\n for word in topic:\n # split the probability from word\n prob, word = word.split('*')\n # get rid of spaces and quote marks\n word = word.replace(\" \", \"\").replace('\"', '')\n # convert the word (string) to its dictionary index (int)\n word = model.id2word.token2id[word]\n topic_bow.append((word, float(prob)))\n return topic_bow\n\nfinance_distribution = parse_topic_string(topic_finance[1])\nwater_distribution = parse_topic_string(topic_water[1])\n\n# the finance topic in the bag-of-words format looks like this:\nprint(finance_distribution)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we've got our topics in a format acceptable by our functions,\nlet's use a Distance metric to see how similar the word distributions in the\ntopics are.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print(hellinger(water_distribution, finance_distribution))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our value of roughly 0.36 means that the topics are not TOO distant with\nrespect to their word distributions.\n\nThis makes sense again, because of overlapping words like ``bank`` and a\nsmall size dictionary.\n\n\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "What are Distance Metrics?\n--------------------------\n\nHaving seen the practical usages of these measures (i.e, to find similarity),\nlet's learn a little about what exactly Distance Measures and Metrics are.\n\nThere\nare 4 conditons for for a distance measure to be a metric:\n\n1. d(x,y) >= 0\n2. d(x,y) = 0 <=> x = y\n3. d(x,y) = d(y,x)\n4. 
d(x,z) <= d(x,y) + d(y,z)\n\nThat is: it must be non-negative; if x and y are the same, distance must be\nzero; it must be symmetric; and it must obey the triangle inequality law.\n\nSimple enough, right?\n\nLet's test these out for our measures.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# ormal Hellinger distance.\na = hellinger(water_distribution, finance_distribution)\nb = hellinger(finance_distribution, water_distribution)\nprint(a)\nprint(b)\nprint(a == b)\n\n# If we pass the same values, it is zero.\nprint(hellinger(water_distribution, water_distribution))\n\n# For triangle inequality let's use LDA document distributions.\nprint(hellinger(lda_bow_finance, lda_bow_bank))\n\n# Triangle inequality works too!\nprint(hellinger(lda_bow_finance, lda_bow_water) + hellinger(lda_bow_water, lda_bow_bank))\n\n\n# For a nice review of the mathematical differences between the Hellinger distance and\n# Kullback-Leibler divergence, see for example `here\n# `__.\n#" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Visualizing Distance Metrics\n----------------------------\n\nLet's plot a graph of our toy dataset using the popular `networkx\n`_ library.\n\nEach node will be a document, where the color of the node will be its topic\naccording to the LDA model. Edges will connect documents to each other, where\nthe *weight* of the edge will be inversely proportional to the Jaccard\nsimilarity between two documents. We will also annotate the edges to further\naid visualization: **strong** edges will connect similar documents, and\n**weak (dashed)** edges will connect dissimilar documents.\n\nIn summary, similar documents will be closer together, different documents\nwill be further apart.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import itertools\nimport networkx as nx\n\ndef get_most_likely_topic(doc):\n bow = model.id2word.doc2bow(doc)\n topics, probabilities = zip(*model.get_document_topics(bow))\n max_p = max(probabilities)\n topic = topics[probabilities.index(max_p)]\n return topic\n\ndef get_node_color(i):\n return 'skyblue' if get_most_likely_topic(texts[i]) == 0 else 'pink'\n\nG = nx.Graph()\nfor i, _ in enumerate(texts):\n G.add_node(i)\n\nfor (i1, i2) in itertools.combinations(range(len(texts)), 2):\n bow1, bow2 = texts[i1], texts[i2]\n distance = jaccard(bow1, bow2)\n G.add_edge(i1, i2, weight=1/distance)\n\n#\n# https://networkx.github.io/documentation/networkx-1.9/examples/drawing/weighted_graph.html\n#\npos = nx.spring_layout(G)\n\nthreshold = 1.25\nelarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > threshold]\nesmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= threshold]\n\nnode_colors = [get_node_color(i) for (i, _) in enumerate(texts)]\nnx.draw_networkx_nodes(G, pos, node_size=700, node_color=node_colors)\nnx.draw_networkx_edges(G, pos, edgelist=elarge, width=2)\nnx.draw_networkx_edges(G, pos, edgelist=esmall, width=2, alpha=0.2, edge_color='b', style='dashed')\nnx.draw_networkx_labels(G, pos, font_size=20, font_family='sans-serif')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can make several observations from this graph.\n\nFirst, the graph consists of two connected components (if you ignore the weak edges).\nNodes 0, 1, 2, 3, 4 (which all belong to the water topic) form the first 
connected component.\nThe other nodes, which all belong to the finance topic, form the second connected component.\n\nSecond, the LDA model didn't do a very good job of classifying our documents into topics.\nThere were many misclassifications, as you can confirm in the summary below:\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print('id\\ttopic\\tdoc')\nfor i, t in enumerate(texts):\n print(f'{i}\\t{get_most_likely_topic(t)}\\t{\" \".join(t)}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is mostly because the corpus used to train the LDA model is so small.\nUsing a larger corpus should hopefully give better results, but that is beyond\nthe scope of this tutorial.\n\nConclusion\n----------\n\nThat brings us to the end of this small tutorial.\nTo recap, here's what we covered:\n\n1. Set up a small corpus consisting of documents belonging to one of two topics\n2. Train an LDA model to distinguish between the two topics\n3. Use the model to obtain distributions for some sample words\n4. Compare the distributions to each other using the distance metrics of Hellinger distance and Jaccard index\n5. Discuss the concept of distance metrics in slightly more detail\n\nThe scope for adding new similarity metrics is large, as there exist an even\nlarger suite of metrics and methods to add to the matutils.py file.\nFor more details, see `Similarity Measures for Text Document Clustering\n`_\nby A. Huang.\n\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_distance_metrics.py b/docs/src/auto_examples/tutorials/run_distance_metrics.py deleted file mode 100644 index 30567500ba..0000000000 --- a/docs/src/auto_examples/tutorials/run_distance_metrics.py +++ /dev/null @@ -1,337 +0,0 @@ -r""" -Distance Metrics -================ - -Introduces the concept of distance between document representations, and demonstrates its calculation using Gensim. - -""" - -import logging -logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - -############################################################################### -# If you simply want to calculate the similarity between documents, then you -# may want to check out the `Similarity Queries Tutorial -# `_ and the `API reference for similarities -# `_. The current -# tutorial shows the building block of these larger methods, which are a small -# suite of distance metrics. -# -# Here's a brief summary of this tutorial: -# -# 1. Set up a small corpus consisting of documents belonging to one of two topics -# 2. Train an LDA model to distinguish between the two topics -# 3. Use the model to obtain distributions for some sample words -# 4. Compare the distributions to each other using a variety of distance metrics: -# -# * Hellinger distance -# * Jaccard coefficient -# -# 5. 
Discuss the concept of distance metrics in slightly more detail -# -from gensim.corpora import Dictionary - -# you can use any corpus, this is just illustratory -texts = [ - ['bank', 'river', 'shore', 'water'], - ['river', 'water', 'flow', 'fast', 'tree'], - ['bank', 'water', 'fall', 'flow'], - ['bank', 'bank', 'water', 'rain', 'river'], - ['river', 'water', 'mud', 'tree'], - ['money', 'transaction', 'bank', 'finance'], - ['bank', 'borrow', 'money'], - ['bank', 'finance'], - ['finance', 'money', 'sell', 'bank'], - ['borrow', 'sell'], - ['bank', 'loan', 'sell'], -] - -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] - -import numpy -numpy.random.seed(1) # setting random seed to get the same results each time. - -from gensim.models import ldamodel -model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2, minimum_probability=1e-8) -model.show_topics() - -############################################################################### -# Let's call the 1st topic the **water** topic and the second topic the **finance** topic. -# -# Let's take a few sample documents and get them ready to test our distance functions. -# -doc_water = ['river', 'water', 'shore'] -doc_finance = ['finance', 'money', 'sell'] -doc_bank = ['finance', 'bank', 'tree', 'water'] - -# Now let's transform these into a bag of words format. -bow_water = model.id2word.doc2bow(doc_water) -bow_finance = model.id2word.doc2bow(doc_finance) -bow_bank = model.id2word.doc2bow(doc_bank) - -# We can now get the LDA topic distributions for these. -lda_bow_water = model[bow_water] -lda_bow_finance = model[bow_finance] -lda_bow_bank = model[bow_bank] - -############################################################################### -# Hellinger -# --------- -# -# We're now ready to apply our distance metrics. -# These metrics return a value between 0 and 1, where values closer to 0 indicate a -# smaller distance and therefore a larger similarity. -# -# Let's start with the popular Hellinger distance. -# -# The Hellinger distance metric is symmetric and gives an output in the range [0,1] -# for two probability distributions. Values closer to 0 mean "more similar". -# -from gensim.matutils import hellinger -print(hellinger(lda_bow_water, lda_bow_finance)) -print(hellinger(lda_bow_finance, lda_bow_bank)) - -############################################################################### -# Makes sense, right? In the first example, Document 1 and Document 2 are hardly similar, so we get a value of roughly 0.5. -# -# In the second case, the documents are a lot more semantically similar, so their distance is lower. -# - -############################################################################### -# -# In our previous examples we saw that there were lower distance values between -# ``bank`` and ``finance`` than for ``bank`` and ``water``, even if it wasn't by a huge margin. -# What does this mean? -# -# The ``bank`` document is a combination of both water and finance related -# terms - but as bank in this context is likely to belong to the finance topic, -# the distance values are less between the finance and bank bows. -# - -# just to confirm our suspicion that the bank bow is more to do with finance: -model.get_document_topics(bow_bank) - -############################################################################### -# -# It's evident that while it isn't too skewed, it it more towards the finance topic. 
-# - -############################################################################### -# Jaccard coefficient -# ------------------- -# -# Let's now look at the `Jaccard Distance -# `_ (also Jaccard index, Jaccard coefficient) -# for calculating the similarity between two documents represented as two bags-of-words vectors. -# -from gensim.matutils import jaccard - -print(jaccard(bow_water, bow_bank)) -print(jaccard(doc_water, doc_bank)) -print(jaccard(['word'], ['word'])) - -############################################################################### -# The three examples above feature 2 different input methods. -# -# In the first case, we present document vectors already in bag of -# words format. The distance can be defined as 1 minus the size of the -# intersection upon the size of the union of the vectors. -# -# We can see (on manual inspection as well), that the distance is likely to be -# high - and it is. -# -# The last two examples illustrate the ability for Jaccard distance to accept even lists -# of words (i.e, documents) as inputs. -# -# In the last case, because they are the same vectors, so the value returned is 0 -# - this means the distance is 0 and the two documents are identical. -# - -############################################################################### -# -# Distance Metrics for Topic Distributions -# ---------------------------------------- -# -# While there are already standard methods to identify similarity of documents, -# our distance metrics has one more interesting use-case: topic distributions. -# -# Let's say we want to find out how similar our two topics are, ``water`` and ``finance``. -# -topic_water, topic_finance = model.show_topics() - -# Preprocess to get the topics in a format accepted by our distance metric functions. - -def parse_topic_string(topic): - """Split a string returned by model.show_topics() into topics and their probabilities.""" - topic = topic.split('+') - topic_bow = [] - for word in topic: - # split the probability from word - prob, word = word.split('*') - # get rid of spaces and quote marks - word = word.replace(" ", "").replace('"', '') - # convert the word (string) to its dictionary index (int) - word = model.id2word.token2id[word] - topic_bow.append((word, float(prob))) - return topic_bow - -finance_distribution = parse_topic_string(topic_finance[1]) -water_distribution = parse_topic_string(topic_water[1]) - -# the finance topic in the bag-of-words format looks like this: -print(finance_distribution) - -############################################################################### -# Now that we've got our topics in a format acceptable by our functions, -# let's use a Distance metric to see how similar the word distributions in the -# topics are. -# -print(hellinger(water_distribution, finance_distribution)) - -############################################################################### -# Our value of roughly 0.36 means that the topics are not TOO distant with -# respect to their word distributions. -# -# This makes sense again, because of overlapping words like ``bank`` and a -# small size dictionary. -# - - -############################################################################### -# What are Distance Metrics? -# -------------------------- -# -# Having seen the practical usages of these measures (i.e, to find similarity), -# let's learn a little about what exactly Distance Measures and Metrics are. -# -# There -# are 4 conditons for for a distance measure to be a metric: -# -# 1. d(x,y) >= 0 -# 2. 
d(x,y) = 0 <=> x = y -# 3. d(x,y) = d(y,x) -# 4. d(x,z) <= d(x,y) + d(y,z) -# -# That is: it must be non-negative; if x and y are the same, distance must be -# zero; it must be symmetric; and it must obey the triangle inequality law. -# -# Simple enough, right? -# -# Let's test these out for our measures. -# - -# ormal Hellinger distance. -a = hellinger(water_distribution, finance_distribution) -b = hellinger(finance_distribution, water_distribution) -print(a) -print(b) -print(a == b) - -# If we pass the same values, it is zero. -print(hellinger(water_distribution, water_distribution)) - -# For triangle inequality let's use LDA document distributions. -print(hellinger(lda_bow_finance, lda_bow_bank)) - -# Triangle inequality works too! -print(hellinger(lda_bow_finance, lda_bow_water) + hellinger(lda_bow_water, lda_bow_bank)) - - -# For a nice review of the mathematical differences between the Hellinger distance and -# Kullback-Leibler divergence, see for example `here -# `__. -# - - -############################################################################### -# Visualizing Distance Metrics -# ---------------------------- -# -# Let's plot a graph of our toy dataset using the popular `networkx -# `_ library. -# -# Each node will be a document, where the color of the node will be its topic -# according to the LDA model. Edges will connect documents to each other, where -# the *weight* of the edge will be inversely proportional to the Jaccard -# similarity between two documents. We will also annotate the edges to further -# aid visualization: **strong** edges will connect similar documents, and -# **weak (dashed)** edges will connect dissimilar documents. -# -# In summary, similar documents will be closer together, different documents -# will be further apart. -# -import itertools -import networkx as nx - -def get_most_likely_topic(doc): - bow = model.id2word.doc2bow(doc) - topics, probabilities = zip(*model.get_document_topics(bow)) - max_p = max(probabilities) - topic = topics[probabilities.index(max_p)] - return topic - -def get_node_color(i): - return 'skyblue' if get_most_likely_topic(texts[i]) == 0 else 'pink' - -G = nx.Graph() -for i, _ in enumerate(texts): - G.add_node(i) - -for (i1, i2) in itertools.combinations(range(len(texts)), 2): - bow1, bow2 = texts[i1], texts[i2] - distance = jaccard(bow1, bow2) - G.add_edge(i1, i2, weight=1/distance) - -# -# https://networkx.github.io/documentation/networkx-1.9/examples/drawing/weighted_graph.html -# -pos = nx.spring_layout(G) - -threshold = 1.25 -elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > threshold] -esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= threshold] - -node_colors = [get_node_color(i) for (i, _) in enumerate(texts)] -nx.draw_networkx_nodes(G, pos, node_size=700, node_color=node_colors) -nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2) -nx.draw_networkx_edges(G, pos, edgelist=esmall, width=2, alpha=0.2, edge_color='b', style='dashed') -nx.draw_networkx_labels(G, pos, font_size=20, font_family='sans-serif') - -############################################################################### -# We can make several observations from this graph. -# -# First, the graph consists of two connected components (if you ignore the weak edges). -# Nodes 0, 1, 2, 3, 4 (which all belong to the water topic) form the first connected component. -# The other nodes, which all belong to the finance topic, form the second connected component. 
-# -# Second, the LDA model didn't do a very good job of classifying our documents into topics. -# There were many misclassifications, as you can confirm in the summary below: -# -print('id\ttopic\tdoc') -for i, t in enumerate(texts): - print(f'{i}\t{get_most_likely_topic(t)}\t{" ".join(t)}') - -############################################################################### -# This is mostly because the corpus used to train the LDA model is so small. -# Using a larger corpus should hopefully give better results, but that is beyond -# the scope of this tutorial. -# -# Conclusion -# ---------- -# -# That brings us to the end of this small tutorial. -# To recap, here's what we covered: -# -# 1. Set up a small corpus consisting of documents belonging to one of two topics -# 2. Train an LDA model to distinguish between the two topics -# 3. Use the model to obtain distributions for some sample words -# 4. Compare the distributions to each other using the distance metrics of Hellinger distance and Jaccard index -# 5. Discuss the concept of distance metrics in slightly more detail -# -# The scope for adding new similarity metrics is large, as there exist an even -# larger suite of metrics and methods to add to the matutils.py file. -# For more details, see `Similarity Measures for Text Document Clustering -# `_ -# by A. Huang. - diff --git a/docs/src/auto_examples/tutorials/run_distance_metrics.py.md5 b/docs/src/auto_examples/tutorials/run_distance_metrics.py.md5 deleted file mode 100644 index b1809ab224..0000000000 --- a/docs/src/auto_examples/tutorials/run_distance_metrics.py.md5 +++ /dev/null @@ -1 +0,0 @@ -b081e19da5e0d4159a134ca890a81331 \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_distance_metrics.rst b/docs/src/auto_examples/tutorials/run_distance_metrics.rst deleted file mode 100644 index 1d1e336d55..0000000000 --- a/docs/src/auto_examples/tutorials/run_distance_metrics.rst +++ /dev/null @@ -1,577 +0,0 @@ -.. only:: html - - .. note:: - :class: sphx-glr-download-link-note - - Click :ref:`here ` to download the full example code - .. rst-class:: sphx-glr-example-title - - .. _sphx_glr_auto_examples_tutorials_run_distance_metrics.py: - - -Distance Metrics -================ - -Introduces the concept of distance between document representations, and demonstrates its calculation using Gensim. - - - -.. code-block:: default - - - import logging - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - - - - - - - - -If you simply want to calculate the similarity between documents, then you -may want to check out the `Similarity Queries Tutorial -`_ and the `API reference for similarities -`_. The current -tutorial shows the building block of these larger methods, which are a small -suite of distance metrics. - -Here's a brief summary of this tutorial: - -1. Set up a small corpus consisting of documents belonging to one of two topics -2. Train an LDA model to distinguish between the two topics -3. Use the model to obtain distributions for some sample words -4. Compare the distributions to each other using a variety of distance metrics: - - * Hellinger distance - * Jaccard coefficient - -5. Discuss the concept of distance metrics in slightly more detail - - - -.. 
code-block:: default - - from gensim.corpora import Dictionary - - # you can use any corpus, this is just illustratory - texts = [ - ['bank', 'river', 'shore', 'water'], - ['river', 'water', 'flow', 'fast', 'tree'], - ['bank', 'water', 'fall', 'flow'], - ['bank', 'bank', 'water', 'rain', 'river'], - ['river', 'water', 'mud', 'tree'], - ['money', 'transaction', 'bank', 'finance'], - ['bank', 'borrow', 'money'], - ['bank', 'finance'], - ['finance', 'money', 'sell', 'bank'], - ['borrow', 'sell'], - ['bank', 'loan', 'sell'], - ] - - dictionary = Dictionary(texts) - corpus = [dictionary.doc2bow(text) for text in texts] - - import numpy - numpy.random.seed(1) # setting random seed to get the same results each time. - - from gensim.models import ldamodel - model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2, minimum_probability=1e-8) - model.show_topics() - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - - [(0, '0.207*"bank" + 0.100*"water" + 0.089*"river" + 0.088*"sell" + 0.067*"borrow" + 0.064*"finance" + 0.062*"money" + 0.053*"tree" + 0.045*"flow" + 0.044*"rain"'), (1, '0.142*"bank" + 0.116*"water" + 0.090*"river" + 0.084*"money" + 0.081*"finance" + 0.064*"flow" + 0.055*"transaction" + 0.055*"tree" + 0.053*"fall" + 0.050*"mud"')] - - - -Let's call the 1st topic the **water** topic and the second topic the **finance** topic. - -Let's take a few sample documents and get them ready to test our distance functions. - - - -.. code-block:: default - - doc_water = ['river', 'water', 'shore'] - doc_finance = ['finance', 'money', 'sell'] - doc_bank = ['finance', 'bank', 'tree', 'water'] - - # Now let's transform these into a bag of words format. - bow_water = model.id2word.doc2bow(doc_water) - bow_finance = model.id2word.doc2bow(doc_finance) - bow_bank = model.id2word.doc2bow(doc_bank) - - # We can now get the LDA topic distributions for these. - lda_bow_water = model[bow_water] - lda_bow_finance = model[bow_finance] - lda_bow_bank = model[bow_bank] - - - - - - - - -Hellinger ---------- - -We're now ready to apply our distance metrics. -These metrics return a value between 0 and 1, where values closer to 0 indicate a -smaller distance and therefore a larger similarity. - -Let's start with the popular Hellinger distance. - -The Hellinger distance metric is symmetric and gives an output in the range [0,1] -for two probability distributions. Values closer to 0 mean "more similar". - - - -.. code-block:: default - - from gensim.matutils import hellinger - print(hellinger(lda_bow_water, lda_bow_finance)) - print(hellinger(lda_bow_finance, lda_bow_bank)) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - 0.24622682814248142 - 0.007332672705927328 - - - - -Makes sense, right? In the first example, Document 1 and Document 2 are hardly similar, so we get a value of roughly 0.5. - -In the second case, the documents are a lot more semantically similar, so their distance is lower. - - -In our previous examples we saw that there were lower distance values between -``bank`` and ``finance`` than for ``bank`` and ``water``, even if it wasn't by a huge margin. -What does this mean? - -The ``bank`` document is a combination of both water and finance related -terms - but as bank in this context is likely to belong to the finance topic, -the distance values are less between the finance and bank bows. - - - -.. 
code-block:: default - - - # just to confirm our suspicion that the bank bow is more to do with finance: - model.get_document_topics(bow_bank) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - - [(0, 0.64126813), (1, 0.35873187)] - - - -It's evident that while it isn't too skewed, it it more towards the finance topic. - - -Jaccard coefficient -------------------- - -Let's now look at the `Jaccard Distance -`_ (also Jaccard index, Jaccard coefficient) -for calculating the similarity between two documents represented as two bags-of-words vectors. - - - -.. code-block:: default - - from gensim.matutils import jaccard - - print(jaccard(bow_water, bow_bank)) - print(jaccard(doc_water, doc_bank)) - print(jaccard(['word'], ['word'])) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - 0.8571428571428572 - 0.8333333333333334 - 0.0 - - - - -The three examples above feature 2 different input methods. - -In the first case, we present document vectors already in bag of -words format. The distance can be defined as 1 minus the size of the -intersection upon the size of the union of the vectors. - -We can see (on manual inspection as well), that the distance is likely to be -high - and it is. - -The last two examples illustrate the ability for Jaccard distance to accept even lists -of words (i.e, documents) as inputs. - -In the last case, because they are the same vectors, so the value returned is 0 -- this means the distance is 0 and the two documents are identical. - - -Distance Metrics for Topic Distributions ----------------------------------------- - -While there are already standard methods to identify similarity of documents, -our distance metrics has one more interesting use-case: topic distributions. - -Let's say we want to find out how similar our two topics are, ``water`` and ``finance``. - - - -.. code-block:: default - - topic_water, topic_finance = model.show_topics() - - # Preprocess to get the topics in a format accepted by our distance metric functions. - - def parse_topic_string(topic): - """Split a string returned by model.show_topics() into topics and their probabilities.""" - topic = topic.split('+') - topic_bow = [] - for word in topic: - # split the probability from word - prob, word = word.split('*') - # get rid of spaces and quote marks - word = word.replace(" ", "").replace('"', '') - # convert the word (string) to its dictionary index (int) - word = model.id2word.token2id[word] - topic_bow.append((word, float(prob))) - return topic_bow - - finance_distribution = parse_topic_string(topic_finance[1]) - water_distribution = parse_topic_string(topic_water[1]) - - # the finance topic in the bag-of-words format looks like this: - print(finance_distribution) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - [(0, 0.142), (3, 0.116), (1, 0.09), (11, 0.084), (10, 0.081), (5, 0.064), (12, 0.055), (6, 0.055), (7, 0.053), (9, 0.05)] - - - - -Now that we've got our topics in a format acceptable by our functions, -let's use a Distance metric to see how similar the word distributions in the -topics are. - - - -.. code-block:: default - - print(hellinger(water_distribution, finance_distribution)) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - 0.42898539619904935 - - - - -Our value of roughly 0.36 means that the topics are not TOO distant with -respect to their word distributions. 
- -This makes sense again, because of overlapping words like ``bank`` and a -small size dictionary. - - -What are Distance Metrics? --------------------------- - -Having seen the practical usages of these measures (i.e, to find similarity), -let's learn a little about what exactly Distance Measures and Metrics are. - -There -are 4 conditons for for a distance measure to be a metric: - -1. d(x,y) >= 0 -2. d(x,y) = 0 <=> x = y -3. d(x,y) = d(y,x) -4. d(x,z) <= d(x,y) + d(y,z) - -That is: it must be non-negative; if x and y are the same, distance must be -zero; it must be symmetric; and it must obey the triangle inequality law. - -Simple enough, right? - -Let's test these out for our measures. - - - -.. code-block:: default - - - # ormal Hellinger distance. - a = hellinger(water_distribution, finance_distribution) - b = hellinger(finance_distribution, water_distribution) - print(a) - print(b) - print(a == b) - - # If we pass the same values, it is zero. - print(hellinger(water_distribution, water_distribution)) - - # For triangle inequality let's use LDA document distributions. - print(hellinger(lda_bow_finance, lda_bow_bank)) - - # Triangle inequality works too! - print(hellinger(lda_bow_finance, lda_bow_water) + hellinger(lda_bow_water, lda_bow_bank)) - - - # For a nice review of the mathematical differences between the Hellinger distance and - # Kullback-Leibler divergence, see for example `here - # `__. - # - - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - 0.42898539619904935 - 0.42898539619904935 - True - 0.0 - 0.007332672705927328 - 0.4852296733896022 - - - - -Visualizing Distance Metrics ----------------------------- - -Let's plot a graph of our toy dataset using the popular `networkx -`_ library. - -Each node will be a document, where the color of the node will be its topic -according to the LDA model. Edges will connect documents to each other, where -the *weight* of the edge will be inversely proportional to the Jaccard -similarity between two documents. We will also annotate the edges to further -aid visualization: **strong** edges will connect similar documents, and -**weak (dashed)** edges will connect dissimilar documents. - -In summary, similar documents will be closer together, different documents -will be further apart. - - - -.. 
code-block:: default - - import itertools - import networkx as nx - - def get_most_likely_topic(doc): - bow = model.id2word.doc2bow(doc) - topics, probabilities = zip(*model.get_document_topics(bow)) - max_p = max(probabilities) - topic = topics[probabilities.index(max_p)] - return topic - - def get_node_color(i): - return 'skyblue' if get_most_likely_topic(texts[i]) == 0 else 'pink' - - G = nx.Graph() - for i, _ in enumerate(texts): - G.add_node(i) - - for (i1, i2) in itertools.combinations(range(len(texts)), 2): - bow1, bow2 = texts[i1], texts[i2] - distance = jaccard(bow1, bow2) - G.add_edge(i1, i2, weight=1/distance) - - # - # https://networkx.github.io/documentation/networkx-1.9/examples/drawing/weighted_graph.html - # - pos = nx.spring_layout(G) - - threshold = 1.25 - elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > threshold] - esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= threshold] - - node_colors = [get_node_color(i) for (i, _) in enumerate(texts)] - nx.draw_networkx_nodes(G, pos, node_size=700, node_color=node_colors) - nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2) - nx.draw_networkx_edges(G, pos, edgelist=esmall, width=2, alpha=0.2, edge_color='b', style='dashed') - nx.draw_networkx_labels(G, pos, font_size=20, font_family='sans-serif') - - - - -.. image:: /auto_examples/tutorials/images/sphx_glr_run_distance_metrics_001.png - :alt: run distance metrics - :class: sphx-glr-single-img - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - - {0: Text(0.8139056554148585, -0.1385144383214792, '0'), 1: Text(-0.2609162263425896, 0.8290751405979758, '1'), 2: Text(0.12108432357837688, 0.2521192345954642, '2'), 3: Text(0.7410538444428034, 0.3897745756787509, '3'), 4: Text(0.3571060747096814, 0.8878304053766498, '4'), 5: Text(-0.19185848377519837, -0.4814192613354953, '5'), 6: Text(-0.7484781508847783, -0.5667501948434654, '6'), 7: Text(-0.7291843541015196, 0.3730910679100171, '7'), 8: Text(0.38214490760132847, -0.5064739523722547, '8'), 9: Text(0.07139744748226842, -1.0, '9'), 10: Text(-0.5562550381252319, -0.03873257728616475, '10')} - - - -We can make several observations from this graph. - -First, the graph consists of two connected components (if you ignore the weak edges). -Nodes 0, 1, 2, 3, 4 (which all belong to the water topic) form the first connected component. -The other nodes, which all belong to the finance topic, form the second connected component. - -Second, the LDA model didn't do a very good job of classifying our documents into topics. -There were many misclassifications, as you can confirm in the summary below: - - - -.. code-block:: default - - print('id\ttopic\tdoc') - for i, t in enumerate(texts): - print(f'{i}\t{get_most_likely_topic(t)}\t{" ".join(t)}') - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - id topic doc - 0 0 bank river shore water - 1 0 river water flow fast tree - 2 1 bank water fall flow - 3 0 bank bank water rain river - 4 1 river water mud tree - 5 1 money transaction bank finance - 6 0 bank borrow money - 7 0 bank finance - 8 0 finance money sell bank - 9 0 borrow sell - 10 0 bank loan sell - - - - -This is mostly because the corpus used to train the LDA model is so small. -Using a larger corpus should hopefully give better results, but that is beyond -the scope of this tutorial. - -Conclusion ----------- - -That brings us to the end of this small tutorial. -To recap, here's what we covered: - -1. 
Set up a small corpus consisting of documents belonging to one of two topics -2. Train an LDA model to distinguish between the two topics -3. Use the model to obtain distributions for some sample words -4. Compare the distributions to each other using the distance metrics of Hellinger distance and Jaccard index -5. Discuss the concept of distance metrics in slightly more detail - -The scope for adding new similarity metrics is large, as there exist an even -larger suite of metrics and methods to add to the matutils.py file. -For more details, see `Similarity Measures for Text Document Clustering -`_ -by A. Huang. - - -.. rst-class:: sphx-glr-timing - - **Total running time of the script:** ( 0 minutes 1.703 seconds) - -**Estimated memory usage:** 9 MB - - -.. _sphx_glr_download_auto_examples_tutorials_run_distance_metrics.py: - - -.. only :: html - - .. container:: sphx-glr-footer - :class: sphx-glr-footer-example - - - - .. container:: sphx-glr-download sphx-glr-download-python - - :download:`Download Python source code: run_distance_metrics.py ` - - - - .. container:: sphx-glr-download sphx-glr-download-jupyter - - :download:`Download Jupyter notebook: run_distance_metrics.ipynb ` - - -.. only:: html - - .. rst-class:: sphx-glr-signature - - `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/src/auto_examples/tutorials/run_pivoted_doc_norm.ipynb b/docs/src/auto_examples/tutorials/run_pivoted_doc_norm.ipynb deleted file mode 100644 index db5af8990e..0000000000 --- a/docs/src/auto_examples/tutorials/run_pivoted_doc_norm.ipynb +++ /dev/null @@ -1,194 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\nPivoted Document Length Normalization\n=====================================\n\nThis tutorial demonstrates using Pivoted Document Length Normalization to\ncounter the effect of short document bias when working with TfIdf, thereby\nincreasing classification accuracy.\n\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In many cases, normalizing the tfidf weights for each term favors weights of terms\nof the documents with shorter length. 
The *pivoted document length normalization* scheme\ncounters the effect of this bias for short documents, by making tfidf independent of the document length.\n\nThis is achieved by *tilting* the normalization curve along a pivot point and slope, which\nmust be defined by the user.\n\nRoughly following the equation:\n\n``pivoted_norm = (1 - slope) * pivot + slope * old_norm``\n\nThis scheme is proposed in the paper `Pivoted Document Length Normalization `_\nby Singhal, Buckley and Mitra.\n\nOverall this approach can increase the accuracy of the model where document lengths are hugely varying across the corpus.\n\nIntroduction\n------------\n\nThis guide demonstrates how to perform pivoted document length normalization.\n\nWe will train a logistic regression model to distinguish between text from two different newsgroups.\n\nOur results will show that using pivoted document length normalization yields a better model (higher classification accuracy).\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "#\n# Download our dataset\n#\nimport gensim.downloader as api\nnws = api.load(\"20-newsgroups\")\n\n#\n# Pick texts from relevant newsgroups, split into training and test set.\n#\ncat1, cat2 = ('sci.electronics', 'sci.space')\n\n#\n# X_* contain the actual texts as strings.\n# Y_* contain labels, 0 for cat1 (sci.electronics) and 1 for cat2 (sci.space)\n#\nX_train = []\nX_test = []\ny_train = []\ny_test = []\n\nfor i in nws:\n if i[\"set\"] == \"train\" and i[\"topic\"] == cat1:\n X_train.append(i[\"data\"])\n y_train.append(0)\n elif i[\"set\"] == \"train\" and i[\"topic\"] == cat2:\n X_train.append(i[\"data\"])\n y_train.append(1)\n elif i[\"set\"] == \"test\" and i[\"topic\"] == cat1:\n X_test.append(i[\"data\"])\n y_test.append(0)\n elif i[\"set\"] == \"test\" and i[\"topic\"] == cat2:\n X_test.append(i[\"data\"])\n y_test.append(1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Preprocess the data\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from gensim.parsing.preprocessing import preprocess_string\nfrom gensim.corpora import Dictionary\n\nid2word = Dictionary([preprocess_string(doc) for doc in X_train])\ntrain_corpus = [id2word.doc2bow(preprocess_string(doc)) for doc in X_train]\ntest_corpus = [id2word.doc2bow(preprocess_string(doc)) for doc in X_test]\n\nprint(len(X_train), len(X_test))\n\n# We perform our analysis on top k documents which is almost top 10% most scored documents\nk = len(X_test) // 10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Prepare our evaluation function\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from gensim.sklearn_api.tfidf import TfIdfTransformer\nfrom sklearn.linear_model import LogisticRegression\nfrom gensim.matutils import corpus2csc\n\ndef get_tfidf_scores(kwargs):\n \"\"\"\n Return a model's accuracy along with individual document probability values, using\n Gensim's TfIdfTransformer and sklearn's LogisticRegression.\n\n \"\"\"\n tfidf_transformer = TfIdfTransformer(**kwargs).fit(train_corpus)\n\n X_train_tfidf = corpus2csc(tfidf_transformer.transform(train_corpus), num_terms=len(id2word)).T\n X_test_tfidf = corpus2csc(tfidf_transformer.transform(test_corpus), num_terms=len(id2word)).T\n\n clf = 
LogisticRegression().fit(X_train_tfidf, y_train)\n\n model_accuracy = clf.score(X_test_tfidf, y_test)\n doc_scores = clf.decision_function(X_test_tfidf)\n\n return model_accuracy, doc_scores" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get TFIDF scores for corpus without pivoted document length normalisation\n-------------------------------------------------------------------------\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "params = {}\nmodel_accuracy, doc_scores = get_tfidf_scores(params)\nprint(model_accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Examine the bias towards shorter documents\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import numpy as np\n\n# Sort the document scores by their scores and return a sorted list\n# of document score and corresponding document lengths.\ndef sort_length_by_score(doc_scores, X_test):\n doc_scores = sorted(enumerate(doc_scores), key=lambda x: x[1])\n doc_leng = np.empty(len(doc_scores))\n\n ds = np.empty(len(doc_scores))\n\n for i, _ in enumerate(doc_scores):\n doc_leng[i] = len(X_test[_[0]])\n ds[i] = _[1]\n\n return ds, doc_leng\n\n\nprint(\n f\"Normal cosine normalisation favors short documents as our top {k} docs have a smaller \"\n f\"mean doc length of {sort_length_by_score(doc_scores, X_test)[1][:k].mean():.3f} \"\n f\"compared to the corpus mean doc length of {sort_length_by_score(doc_scores, X_test)[1].mean():.3f}\"\n)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get TFIDF scores for corpus with pivoted document length normalisation\n----------------------------------------------------------------------\n\nTest various values of alpha (slope) and pick the best one.\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "best_model_accuracy = 0\noptimum_slope = 0\nfor slope in np.arange(0, 1.1, 0.1):\n params = {\"pivot\": 10, \"slope\": slope}\n\n model_accuracy, doc_scores = get_tfidf_scores(params)\n\n if model_accuracy > best_model_accuracy:\n best_model_accuracy = model_accuracy\n optimum_slope = slope\n\n print(f\"Score for slope {slope} is {model_accuracy}\")\n\nprint(f\"We get best score of {best_model_accuracy} at slope {optimum_slope}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Evaluate the model with optimum slope\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "params = {\"pivot\": 10, \"slope\": optimum_slope}\nmodel_accuracy, doc_scores = get_tfidf_scores(params)\nprint(model_accuracy)\n\nprint(\n f\"With pivoted normalisation top {k} docs have a mean length of \"\n f\"{sort_length_by_score(doc_scores, X_test)[1][:k].mean():.3f} which is much \"\n f\"closer to the corpus mean doc length of {sort_length_by_score(doc_scores, X_test)[1].mean():.3f}\"\n)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Visualizing the pivoted normalization\n-------------------------------------\n\nFrom the plot we can see that when the slope was 1 (i.e. when pivoted normalisation\nwas not applied at all), short documents with length of around 500 had very good scores.\nThis is a bias for short documents. 
As we varied the value of slope from 1 to 0\nwe introdcued a new bias for long documents to counter the bias caused by\ncosine normalisation. At a certain point we got an optimum value of\nslope (0.5 here) where the overall accuracy of the model was maximized.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as py\n\nbest_model_accuracy = 0\noptimum_slope = 0\n\nw = 2\nh = 2\nf, axarr = py.subplots(h, w, figsize=(15, 7))\n\nit = 0\nfor slope in [1, 0.2]:\n params = {\"pivot\": 10, \"slope\": slope}\n\n model_accuracy, doc_scores = get_tfidf_scores(params)\n\n if model_accuracy > best_model_accuracy:\n best_model_accuracy = model_accuracy\n optimum_slope = slope\n\n doc_scores, doc_leng = sort_length_by_score(doc_scores, X_test)\n\n y = abs(doc_scores[:k, np.newaxis])\n x = doc_leng[:k, np.newaxis]\n\n py.subplot(1, 2, it+1).bar(x, y, width=20, linewidth=0)\n py.title(f\"Slope = {slope} Model accuracy = {model_accuracy}\")\n py.ylim([0, 4.5])\n py.xlim([0, 3200])\n py.xlabel(\"document length\")\n py.ylabel(\"confidence score\")\n\n it += 1\n\npy.tight_layout()\npy.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above histogram plot helps us visualize the effect of ``slope``. For top\nk documents we have document length on the x axis and their respective scores\nof belonging to a specific class on y axis.\n\nAs we decrease the slope the density of bins is shifted from low document\nlength (around ~250-500) to over ~500 document length. This suggests that the\npositive biasness which was seen at ``slope=1`` (or when regular tfidf was\nused) for short documents is now reduced. We get the optimum slope or the max\nmodel accuracy when slope is 0.2.\n\nConclusion\n==========\n\nUsing pivoted document normalization improved the classification accuracy a little bit:\n\n* Before (slope=1, identical to default cosine normalization): 0.9682\n* After (slope=0.2): 0.9771\n\n\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_pivoted_doc_norm.py b/docs/src/auto_examples/tutorials/run_pivoted_doc_norm.py deleted file mode 100644 index 9570595a54..0000000000 --- a/docs/src/auto_examples/tutorials/run_pivoted_doc_norm.py +++ /dev/null @@ -1,243 +0,0 @@ -r""" -Pivoted Document Length Normalization -===================================== - -This tutorial demonstrates using Pivoted Document Length Normalization to -counter the effect of short document bias when working with TfIdf, thereby -increasing classification accuracy. -""" - -############################################################################### -# In many cases, normalizing the tfidf weights for each term favors weights of terms -# of the documents with shorter length. The *pivoted document length normalization* scheme -# counters the effect of this bias for short documents, by making tfidf independent of the document length. 
-# -# This is achieved by *tilting* the normalization curve along a pivot point and slope, which -# must be defined by the user. -# -# Roughly following the equation: -# -# ``pivoted_norm = (1 - slope) * pivot + slope * old_norm`` -# -# This scheme is proposed in the paper `Pivoted Document Length Normalization `_ -# by Singhal, Buckley and Mitra. -# -# Overall this approach can increase the accuracy of the model where document lengths are hugely varying across the corpus. -# -# Introduction -# ------------ -# -# This guide demonstrates how to perform pivoted document length normalization. -# -# We will train a logistic regression model to distinguish between text from two different newsgroups. -# -# Our results will show that using pivoted document length normalization yields a better model (higher classification accuracy). -# - -# -# Download our dataset -# -import gensim.downloader as api -nws = api.load("20-newsgroups") - -# -# Pick texts from relevant newsgroups, split into training and test set. -# -cat1, cat2 = ('sci.electronics', 'sci.space') - -# -# X_* contain the actual texts as strings. -# Y_* contain labels, 0 for cat1 (sci.electronics) and 1 for cat2 (sci.space) -# -X_train = [] -X_test = [] -y_train = [] -y_test = [] - -for i in nws: - if i["set"] == "train" and i["topic"] == cat1: - X_train.append(i["data"]) - y_train.append(0) - elif i["set"] == "train" and i["topic"] == cat2: - X_train.append(i["data"]) - y_train.append(1) - elif i["set"] == "test" and i["topic"] == cat1: - X_test.append(i["data"]) - y_test.append(0) - elif i["set"] == "test" and i["topic"] == cat2: - X_test.append(i["data"]) - y_test.append(1) - -############################################################################### -# Preprocess the data -# -from gensim.parsing.preprocessing import preprocess_string -from gensim.corpora import Dictionary - -id2word = Dictionary([preprocess_string(doc) for doc in X_train]) -train_corpus = [id2word.doc2bow(preprocess_string(doc)) for doc in X_train] -test_corpus = [id2word.doc2bow(preprocess_string(doc)) for doc in X_test] - -print(len(X_train), len(X_test)) - -# We perform our analysis on top k documents which is almost top 10% most scored documents -k = len(X_test) // 10 - -############################################################################### -# Prepare our evaluation function -# -from gensim.sklearn_api.tfidf import TfIdfTransformer -from sklearn.linear_model import LogisticRegression -from gensim.matutils import corpus2csc - -def get_tfidf_scores(kwargs): - """ - Return a model's accuracy along with individual document probability values, using - Gensim's TfIdfTransformer and sklearn's LogisticRegression. 
- - """ - tfidf_transformer = TfIdfTransformer(**kwargs).fit(train_corpus) - - X_train_tfidf = corpus2csc(tfidf_transformer.transform(train_corpus), num_terms=len(id2word)).T - X_test_tfidf = corpus2csc(tfidf_transformer.transform(test_corpus), num_terms=len(id2word)).T - - clf = LogisticRegression().fit(X_train_tfidf, y_train) - - model_accuracy = clf.score(X_test_tfidf, y_test) - doc_scores = clf.decision_function(X_test_tfidf) - - return model_accuracy, doc_scores - -############################################################################### -# Get TFIDF scores for corpus without pivoted document length normalisation -# ------------------------------------------------------------------------- -# -params = {} -model_accuracy, doc_scores = get_tfidf_scores(params) -print(model_accuracy) - -############################################################################### -# Examine the bias towards shorter documents -import numpy as np - -# Sort the document scores by their scores and return a sorted list -# of document score and corresponding document lengths. -def sort_length_by_score(doc_scores, X_test): - doc_scores = sorted(enumerate(doc_scores), key=lambda x: x[1]) - doc_leng = np.empty(len(doc_scores)) - - ds = np.empty(len(doc_scores)) - - for i, _ in enumerate(doc_scores): - doc_leng[i] = len(X_test[_[0]]) - ds[i] = _[1] - - return ds, doc_leng - - -print( - f"Normal cosine normalisation favors short documents as our top {k} docs have a smaller " - f"mean doc length of {sort_length_by_score(doc_scores, X_test)[1][:k].mean():.3f} " - f"compared to the corpus mean doc length of {sort_length_by_score(doc_scores, X_test)[1].mean():.3f}" -) - -############################################################################### -# Get TFIDF scores for corpus with pivoted document length normalisation -# ---------------------------------------------------------------------- -# -# Test various values of alpha (slope) and pick the best one. -best_model_accuracy = 0 -optimum_slope = 0 -for slope in np.arange(0, 1.1, 0.1): - params = {"pivot": 10, "slope": slope} - - model_accuracy, doc_scores = get_tfidf_scores(params) - - if model_accuracy > best_model_accuracy: - best_model_accuracy = model_accuracy - optimum_slope = slope - - print(f"Score for slope {slope} is {model_accuracy}") - -print(f"We get best score of {best_model_accuracy} at slope {optimum_slope}") - -############################################################################### -# Evaluate the model with optimum slope -# -params = {"pivot": 10, "slope": optimum_slope} -model_accuracy, doc_scores = get_tfidf_scores(params) -print(model_accuracy) - -print( - f"With pivoted normalisation top {k} docs have a mean length of " - f"{sort_length_by_score(doc_scores, X_test)[1][:k].mean():.3f} which is much " - f"closer to the corpus mean doc length of {sort_length_by_score(doc_scores, X_test)[1].mean():.3f}" -) - -############################################################################### -# -# Visualizing the pivoted normalization -# ------------------------------------- -# -# From the plot we can see that when the slope was 1 (i.e. when pivoted normalisation -# was not applied at all), short documents with length of around 500 had very good scores. -# This is a bias for short documents. As we varied the value of slope from 1 to 0 -# we introdcued a new bias for long documents to counter the bias caused by -# cosine normalisation. 
At a certain point we got an optimum value of -# slope (0.5 here) where the overall accuracy of the model was maximized. -# -import matplotlib.pyplot as py - -best_model_accuracy = 0 -optimum_slope = 0 - -w = 2 -h = 2 -f, axarr = py.subplots(h, w, figsize=(15, 7)) - -it = 0 -for slope in [1, 0.2]: - params = {"pivot": 10, "slope": slope} - - model_accuracy, doc_scores = get_tfidf_scores(params) - - if model_accuracy > best_model_accuracy: - best_model_accuracy = model_accuracy - optimum_slope = slope - - doc_scores, doc_leng = sort_length_by_score(doc_scores, X_test) - - y = abs(doc_scores[:k, np.newaxis]) - x = doc_leng[:k, np.newaxis] - - py.subplot(1, 2, it+1).bar(x, y, width=20, linewidth=0) - py.title(f"Slope = {slope} Model accuracy = {model_accuracy}") - py.ylim([0, 4.5]) - py.xlim([0, 3200]) - py.xlabel("document length") - py.ylabel("confidence score") - - it += 1 - -py.tight_layout() -py.show() - -############################################################################### -# The above histogram plot helps us visualize the effect of ``slope``. For top -# k documents we have document length on the x axis and their respective scores -# of belonging to a specific class on y axis. -# -# As we decrease the slope the density of bins is shifted from low document -# length (around ~250-500) to over ~500 document length. This suggests that the -# positive biasness which was seen at ``slope=1`` (or when regular tfidf was -# used) for short documents is now reduced. We get the optimum slope or the max -# model accuracy when slope is 0.2. -# -# Conclusion -# ========== -# -# Using pivoted document normalization improved the classification accuracy a little bit: -# -# * Before (slope=1, identical to default cosine normalization): 0.9682 -# * After (slope=0.2): 0.9771 -# diff --git a/docs/src/auto_examples/tutorials/run_pivoted_doc_norm.py.md5 b/docs/src/auto_examples/tutorials/run_pivoted_doc_norm.py.md5 deleted file mode 100644 index 25f0127c1d..0000000000 --- a/docs/src/auto_examples/tutorials/run_pivoted_doc_norm.py.md5 +++ /dev/null @@ -1 +0,0 @@ -d0a3444f7eea966f7061bdb33a451051 \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_pivoted_doc_norm.rst b/docs/src/auto_examples/tutorials/run_pivoted_doc_norm.rst deleted file mode 100644 index 1939e9d1c2..0000000000 --- a/docs/src/auto_examples/tutorials/run_pivoted_doc_norm.rst +++ /dev/null @@ -1,439 +0,0 @@ -.. only:: html - - .. note:: - :class: sphx-glr-download-link-note - - Click :ref:`here ` to download the full example code - .. rst-class:: sphx-glr-example-title - - .. _sphx_glr_auto_examples_tutorials_run_pivoted_doc_norm.py: - - -Pivoted Document Length Normalization -===================================== - -This tutorial demonstrates using Pivoted Document Length Normalization to -counter the effect of short document bias when working with TfIdf, thereby -increasing classification accuracy. - -In many cases, normalizing the tfidf weights for each term favors weights of terms -of the documents with shorter length. The *pivoted document length normalization* scheme -counters the effect of this bias for short documents, by making tfidf independent of the document length. - -This is achieved by *tilting* the normalization curve along a pivot point and slope, which -must be defined by the user. - -Roughly following the equation: - -``pivoted_norm = (1 - slope) * pivot + slope * old_norm`` - -This scheme is proposed in the paper `Pivoted Document Length Normalization `_ -by Singhal, Buckley and Mitra. 
- -Overall this approach can increase the accuracy of the model where document lengths are hugely varying across the corpus. - -Introduction ------------- - -This guide demonstrates how to perform pivoted document length normalization. - -We will train a logistic regression model to distinguish between text from two different newsgroups. - -Our results will show that using pivoted document length normalization yields a better model (higher classification accuracy). - - - -.. code-block:: default - - - # - # Download our dataset - # - import gensim.downloader as api - nws = api.load("20-newsgroups") - - # - # Pick texts from relevant newsgroups, split into training and test set. - # - cat1, cat2 = ('sci.electronics', 'sci.space') - - # - # X_* contain the actual texts as strings. - # Y_* contain labels, 0 for cat1 (sci.electronics) and 1 for cat2 (sci.space) - # - X_train = [] - X_test = [] - y_train = [] - y_test = [] - - for i in nws: - if i["set"] == "train" and i["topic"] == cat1: - X_train.append(i["data"]) - y_train.append(0) - elif i["set"] == "train" and i["topic"] == cat2: - X_train.append(i["data"]) - y_train.append(1) - elif i["set"] == "test" and i["topic"] == cat1: - X_test.append(i["data"]) - y_test.append(0) - elif i["set"] == "test" and i["topic"] == cat2: - X_test.append(i["data"]) - y_test.append(1) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - /Volumes/work/workspace/vew/gensim3.6/lib/python3.6/site-packages/smart_open/smart_open_lib.py:254: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function - 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL - - - - -Preprocess the data - - - -.. code-block:: default - - from gensim.parsing.preprocessing import preprocess_string - from gensim.corpora import Dictionary - - id2word = Dictionary([preprocess_string(doc) for doc in X_train]) - train_corpus = [id2word.doc2bow(preprocess_string(doc)) for doc in X_train] - test_corpus = [id2word.doc2bow(preprocess_string(doc)) for doc in X_test] - - print(len(X_train), len(X_test)) - - # We perform our analysis on top k documents which is almost top 10% most scored documents - k = len(X_test) // 10 - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - 1184 787 - - - - -Prepare our evaluation function - - - -.. code-block:: default - - from gensim.sklearn_api.tfidf import TfIdfTransformer - from sklearn.linear_model import LogisticRegression - from gensim.matutils import corpus2csc - - def get_tfidf_scores(kwargs): - """ - Return a model's accuracy along with individual document probability values, using - Gensim's TfIdfTransformer and sklearn's LogisticRegression. - - """ - tfidf_transformer = TfIdfTransformer(**kwargs).fit(train_corpus) - - X_train_tfidf = corpus2csc(tfidf_transformer.transform(train_corpus), num_terms=len(id2word)).T - X_test_tfidf = corpus2csc(tfidf_transformer.transform(test_corpus), num_terms=len(id2word)).T - - clf = LogisticRegression().fit(X_train_tfidf, y_train) - - model_accuracy = clf.score(X_test_tfidf, y_test) - doc_scores = clf.decision_function(X_test_tfidf) - - return model_accuracy, doc_scores - - - - - - - - -Get TFIDF scores for corpus without pivoted document length normalisation -------------------------------------------------------------------------- - - - -.. 
code-block:: default - - params = {} - model_accuracy, doc_scores = get_tfidf_scores(params) - print(model_accuracy) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - 0.9682337992376112 - - - - -Examine the bias towards shorter documents - - -.. code-block:: default - - import numpy as np - - # Sort the document scores by their scores and return a sorted list - # of document score and corresponding document lengths. - def sort_length_by_score(doc_scores, X_test): - doc_scores = sorted(enumerate(doc_scores), key=lambda x: x[1]) - doc_leng = np.empty(len(doc_scores)) - - ds = np.empty(len(doc_scores)) - - for i, _ in enumerate(doc_scores): - doc_leng[i] = len(X_test[_[0]]) - ds[i] = _[1] - - return ds, doc_leng - - - print( - f"Normal cosine normalisation favors short documents as our top {k} docs have a smaller " - f"mean doc length of {sort_length_by_score(doc_scores, X_test)[1][:k].mean():.3f} " - f"compared to the corpus mean doc length of {sort_length_by_score(doc_scores, X_test)[1].mean():.3f}" - ) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - Normal cosine normalisation favors short documents as our top 78 docs have a smaller mean doc length of 1668.179 compared to the corpus mean doc length of 1577.799 - - - - -Get TFIDF scores for corpus with pivoted document length normalisation ----------------------------------------------------------------------- - -Test various values of alpha (slope) and pick the best one. - - -.. code-block:: default - - best_model_accuracy = 0 - optimum_slope = 0 - for slope in np.arange(0, 1.1, 0.1): - params = {"pivot": 10, "slope": slope} - - model_accuracy, doc_scores = get_tfidf_scores(params) - - if model_accuracy > best_model_accuracy: - best_model_accuracy = model_accuracy - optimum_slope = slope - - print(f"Score for slope {slope} is {model_accuracy}") - - print(f"We get best score of {best_model_accuracy} at slope {optimum_slope}") - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - Score for slope 0.0 is 0.9720457433290979 - Score for slope 0.1 is 0.9758576874205845 - Score for slope 0.2 is 0.97712833545108 - Score for slope 0.30000000000000004 is 0.9783989834815756 - Score for slope 0.4 is 0.97712833545108 - Score for slope 0.5 is 0.9758576874205845 - Score for slope 0.6000000000000001 is 0.9733163913595934 - Score for slope 0.7000000000000001 is 0.9733163913595934 - Score for slope 0.8 is 0.9733163913595934 - Score for slope 0.9 is 0.9733163913595934 - Score for slope 1.0 is 0.9682337992376112 - We get best score of 0.9783989834815756 at slope 0.30000000000000004 - - - - -Evaluate the model with optimum slope - - - -.. code-block:: default - - params = {"pivot": 10, "slope": optimum_slope} - model_accuracy, doc_scores = get_tfidf_scores(params) - print(model_accuracy) - - print( - f"With pivoted normalisation top {k} docs have a mean length of " - f"{sort_length_by_score(doc_scores, X_test)[1][:k].mean():.3f} which is much " - f"closer to the corpus mean doc length of {sort_length_by_score(doc_scores, X_test)[1].mean():.3f}" - ) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - 0.9783989834815756 - With pivoted normalisation top 78 docs have a mean length of 2077.346 which is much closer to the corpus mean doc length of 1577.799 - - - - -Visualizing the pivoted normalization -------------------------------------- - -From the plot we can see that when the slope was 1 (i.e. 
when pivoted normalisation -was not applied at all), short documents with length of around 500 had very good scores. -This is a bias for short documents. As we varied the value of slope from 1 to 0 -we introdcued a new bias for long documents to counter the bias caused by -cosine normalisation. At a certain point we got an optimum value of -slope (0.5 here) where the overall accuracy of the model was maximized. - - - -.. code-block:: default - - import matplotlib.pyplot as py - - best_model_accuracy = 0 - optimum_slope = 0 - - w = 2 - h = 2 - f, axarr = py.subplots(h, w, figsize=(15, 7)) - - it = 0 - for slope in [1, 0.2]: - params = {"pivot": 10, "slope": slope} - - model_accuracy, doc_scores = get_tfidf_scores(params) - - if model_accuracy > best_model_accuracy: - best_model_accuracy = model_accuracy - optimum_slope = slope - - doc_scores, doc_leng = sort_length_by_score(doc_scores, X_test) - - y = abs(doc_scores[:k, np.newaxis]) - x = doc_leng[:k, np.newaxis] - - py.subplot(1, 2, it+1).bar(x, y, width=20, linewidth=0) - py.title(f"Slope = {slope} Model accuracy = {model_accuracy}") - py.ylim([0, 4.5]) - py.xlim([0, 3200]) - py.xlabel("document length") - py.ylabel("confidence score") - - it += 1 - - py.tight_layout() - py.show() - - - - -.. image:: /auto_examples/tutorials/images/sphx_glr_run_pivoted_doc_norm_001.png - :alt: Slope = 1 Model accuracy = 0.9682337992376112, Slope = 0.2 Model accuracy = 0.97712833545108 - :class: sphx-glr-single-img - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - /Volumes/work/workspace/vew/gensim3.6/lib/python3.6/site-packages/matplotlib/figure.py:445: UserWarning: Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure. - % get_backend()) - - - - -The above histogram plot helps us visualize the effect of ``slope``. For top -k documents we have document length on the x axis and their respective scores -of belonging to a specific class on y axis. - -As we decrease the slope the density of bins is shifted from low document -length (around ~250-500) to over ~500 document length. This suggests that the -positive biasness which was seen at ``slope=1`` (or when regular tfidf was -used) for short documents is now reduced. We get the optimum slope or the max -model accuracy when slope is 0.2. - -Conclusion -========== - -Using pivoted document normalization improved the classification accuracy a little bit: - -* Before (slope=1, identical to default cosine normalization): 0.9682 -* After (slope=0.2): 0.9771 - - - -.. rst-class:: sphx-glr-timing - - **Total running time of the script:** ( 0 minutes 27.237 seconds) - -**Estimated memory usage:** 44 MB - - -.. _sphx_glr_download_auto_examples_tutorials_run_pivoted_doc_norm.py: - - -.. only :: html - - .. container:: sphx-glr-footer - :class: sphx-glr-footer-example - - - - .. container:: sphx-glr-download sphx-glr-download-python - - :download:`Download Python source code: run_pivoted_doc_norm.py ` - - - - .. container:: sphx-glr-download sphx-glr-download-jupyter - - :download:`Download Jupyter notebook: run_pivoted_doc_norm.ipynb ` - - -.. only:: html - - .. 
rst-class:: sphx-glr-signature - - `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/src/auto_examples/tutorials/run_summarization.ipynb b/docs/src/auto_examples/tutorials/run_summarization.ipynb deleted file mode 100644 index ff900ee6eb..0000000000 --- a/docs/src/auto_examples/tutorials/run_summarization.ipynb +++ /dev/null @@ -1,331 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\nText Summarization\n==================\n\nDemonstrates summarizing text by extracting the most important sentences from it.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import logging\nlogging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This module automatically summarizes the given text, by extracting one or\nmore important sentences from the text. In a similar way, it can also extract\nkeywords. This tutorial will teach you to use this summarization module via\nsome examples. First, we will try a small example, then we will try two\nlarger ones, and then we will review the performance of the summarizer in\nterms of speed.\n\nThis summarizer is based on the , from an `\"TextRank\" algorithm by Mihalcea\net al `_.\nThis algorithm was later improved upon by `Barrios et al.\n`_,\nby introducing something called a \"BM25 ranking function\". \n\n.. important::\n Gensim's summarization only works for English for now, because the text\n is pre-processed so that stopwords are removed and the words are stemmed,\n and these processes are language-dependent.\n\nSmall example\n-------------\n\nFirst of all, we import the :py:func:`gensim.summarization.summarize` function.\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from pprint import pprint as print\nfrom gensim.summarization import summarize" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will try summarizing a small toy example; later we will use a larger piece of text. In reality, the text is too small, but it suffices as an illustrative example.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "text = (\n \"Thomas A. Anderson is a man living two lives. By day he is an \"\n \"average computer programmer and by night a hacker known as \"\n \"Neo. Neo has always questioned his reality, but the truth is \"\n \"far beyond his imagination. Neo finds himself targeted by the \"\n \"police when he is contacted by Morpheus, a legendary computer \"\n \"hacker branded a terrorist by the government. Morpheus awakens \"\n \"Neo to the real world, a ravaged wasteland where most of \"\n \"humanity have been captured by a race of machines that live \"\n \"off of the humans' body heat and electrochemical energy and \"\n \"who imprison their minds within an artificial reality known as \"\n \"the Matrix. As a rebel against the machines, Neo must return to \"\n \"the Matrix and confront the agents: super-powerful computer \"\n \"programs devoted to snuffing out Neo and the entire human \"\n \"rebellion. 
\"\n)\nprint(text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To summarize this text, we pass the **raw string data** as input to the\nfunction \"summarize\", and it will return a summary.\n\nNote: make sure that the string does not contain any newlines where the line\nbreaks in a sentence. A sentence with a newline in it (i.e. a carriage\nreturn, \"\\n\") will be treated as two sentences.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print(summarize(text))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use the \"split\" option if you want a list of strings instead of a single string.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print(summarize(text, split=True))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can adjust how much text the summarizer outputs via the \"ratio\" parameter\nor the \"word_count\" parameter. Using the \"ratio\" parameter, you specify what\nfraction of sentences in the original text should be returned as output.\nBelow we specify that we want 50% of the original text (the default is 20%).\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print(summarize(text, ratio=0.5))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the \"word_count\" parameter, we specify the maximum amount of words we\nwant in the summary. Below we have specified that we want no more than 50\nwords.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print(summarize(text, word_count=50))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As mentioned earlier, this module also supports **keyword** extraction.\nKeyword extraction works in the same way as summary generation (i.e. sentence\nextraction), in that the algorithm tries to find words that are important or\nseem representative of the entire text. They keywords are not always single\nwords; in the case of multi-word keywords, they are typically all nouns.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from gensim.summarization import keywords\nprint(keywords(text))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Larger example\n--------------\n\nLet us try an example with a larger piece of text. We will be using a\nsynopsis of the movie \"The Matrix\", which we have taken from `this\n`_ IMDb page.\n\nIn the code below, we read the text file directly from a web-page using\n\"requests\". 
Then we produce a summary and some keywords.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import requests\n\ntext = requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text\nprint(text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, the summary\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print(summarize(text, ratio=0.01))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And now, the keywords:\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print(keywords(text, ratio=0.01))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you know this movie, you see that this summary is actually quite good. We\nalso see that some of the most important characters (Neo, Morpheus, Trinity)\nwere extracted as keywords.\n\nAnother example\n---------------\n\nLet's try an example similar to the one above. This time, we will use the IMDb synopsis\n`The Big Lebowski `_.\n\nAgain, we download the text and produce a summary and some keywords.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "text = requests.get('http://rare-technologies.com/the_big_lebowski_synopsis.txt').text\nprint(text)\nprint(summarize(text, ratio=0.01))\nprint(keywords(text, ratio=0.01))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This time around, the summary is not of high quality, as it does not tell us\nmuch about the movie. In a way, this might not be the algorithms fault,\nrather this text simply doesn't contain one or two sentences that capture the\nessence of the text as in \"The Matrix\" synopsis.\n\nThe keywords, however, managed to find some of the main characters.\n\nPerformance\n-----------\n\nWe will test how the speed of the summarizer scales with the size of the\ndataset. These tests were run on an Intel Core i5 4210U CPU @ 1.70 GHz x 4\nprocessor. Note that the summarizer does **not** support multithreading\n(parallel processing).\n\nThe tests were run on the book \"Honest Abe\" by Alonzo Rothschild. Download\nthe book in plain-text `here `__.\n\nIn the **plot below** , we see the running times together with the sizes of\nthe datasets. To create datasets of different sizes, we have simply taken\nprefixes of text; in other words we take the first **n** characters of the\nbook. The algorithm seems to be **quadratic in time** , so one needs to be\ncareful before plugging a large dataset into the summarizer.\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\nimport matplotlib.image as mpimg\nimg = mpimg.imread('summarization_tutorial_plot.png')\nimgplot = plt.imshow(img)\nplt.axis('off')\nplt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Text-content dependent running times\n------------------------------------\n\nThe running time is not only dependent on the size of the dataset. 
For\nexample, summarizing \"The Matrix\" synopsis (about 36,000 characters) takes\nabout 3.1 seconds, while summarizing 35,000 characters of this book takes\nabout 8.5 seconds. So the former is **more than twice as fast**.\n\nOne reason for this difference in running times is the data structure that is\nused. The algorithm represents the data using a graph, where vertices (nodes)\nare sentences, and then constructs weighted edges between the vertices that\nrepresent how the sentences relate to each other. This means that every piece\nof text will have a different graph, thus making the running times different.\nThe size of this data structure is **quadratic in the worst case** (the worst\ncase is when each vertex has an edge to every other vertex).\n\nAnother possible reason for the difference in running times is that the\nproblems converge at different rates, meaning that the error drops slower for\nsome datasets than for others.\n\nMontemurro and Zanette's entropy based keyword extraction algorithm\n-------------------------------------------------------------------\n\n`This paper `__ describes a technique to\nidentify words that play a significant role in the large-scale structure of a\ntext. These typically correspond to the major themes of the text. The text is\ndivided into blocks of ~1000 words, and the entropy of each word's\ndistribution amongst the blocks is caclulated and compared with the expected\nentropy if the word were distributed randomly.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import requests\nfrom gensim.summarization import mz_keywords\n\ntext=requests.get(\"http://www.gutenberg.org/files/49679/49679-0.txt\").text\nprint(mz_keywords(text,scores=True,threshold=0.001))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By default, the algorithm weights the entropy by the overall frequency of the\nword in the document. 
We can remove this weighting by setting weighted=False\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print(mz_keywords(text,scores=True,weighted=False,threshold=1.0))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When this option is used, it is possible to calculate a threshold\nautomatically from the number of blocks\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print(mz_keywords(text,scores=True,weighted=False,threshold=\"auto\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The complexity of the algorithm is **O**\\ (\\ *Nw*\\ ), where *N* is the number\nof words in the document and *w* is the number of unique words.\n\n\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.1" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_summarization.py b/docs/src/auto_examples/tutorials/run_summarization.py deleted file mode 100644 index e5281e1a9b..0000000000 --- a/docs/src/auto_examples/tutorials/run_summarization.py +++ /dev/null @@ -1,243 +0,0 @@ -r""" -Text Summarization -================== - -Demonstrates summarizing text by extracting the most important sentences from it. - -""" -import logging -logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - -############################################################################### -# This module automatically summarizes the given text, by extracting one or -# more important sentences from the text. In a similar way, it can also extract -# keywords. This tutorial will teach you to use this summarization module via -# some examples. First, we will try a small example, then we will try two -# larger ones, and then we will review the performance of the summarizer in -# terms of speed. -# -# This summarizer is based on the , from an `"TextRank" algorithm by Mihalcea -# et al `_. -# This algorithm was later improved upon by `Barrios et al. -# `_, -# by introducing something called a "BM25 ranking function". -# -# .. important:: -# Gensim's summarization only works for English for now, because the text -# is pre-processed so that stopwords are removed and the words are stemmed, -# and these processes are language-dependent. -# -# Small example -# ------------- -# -# First of all, we import the :py:func:`gensim.summarization.summarize` function. - - -from pprint import pprint as print -from gensim.summarization import summarize - -############################################################################### -# We will try summarizing a small toy example; later we will use a larger piece of text. In reality, the text is too small, but it suffices as an illustrative example. -# - - -text = ( - "Thomas A. Anderson is a man living two lives. By day he is an " - "average computer programmer and by night a hacker known as " - "Neo. Neo has always questioned his reality, but the truth is " - "far beyond his imagination. 
Neo finds himself targeted by the "
-    "police when he is contacted by Morpheus, a legendary computer "
-    "hacker branded a terrorist by the government. Morpheus awakens "
-    "Neo to the real world, a ravaged wasteland where most of "
-    "humanity have been captured by a race of machines that live "
-    "off of the humans' body heat and electrochemical energy and "
-    "who imprison their minds within an artificial reality known as "
-    "the Matrix. As a rebel against the machines, Neo must return to "
-    "the Matrix and confront the agents: super-powerful computer "
-    "programs devoted to snuffing out Neo and the entire human "
-    "rebellion. "
-)
-print(text)
-
-###############################################################################
-# To summarize this text, we pass the **raw string data** as input to the
-# function "summarize", and it will return a summary.
-#
-# Note: make sure that the string does not contain any newlines where the line
-# breaks in a sentence. A sentence with a newline in it (i.e. a newline
-# character, "\n") will be treated as two sentences.
-#
-
-print(summarize(text))
-
-###############################################################################
-#
-# Use the "split" option if you want a list of strings instead of a single string.
-#
-print(summarize(text, split=True))
-
-###############################################################################
-#
-# You can adjust how much text the summarizer outputs via the "ratio" parameter
-# or the "word_count" parameter. Using the "ratio" parameter, you specify what
-# fraction of sentences in the original text should be returned as output.
-# Below we specify that we want 50% of the original text (the default is 20%).
-#
-
-print(summarize(text, ratio=0.5))
-
-###############################################################################
-#
-# Using the "word_count" parameter, we specify the maximum number of words we
-# want in the summary. Below we have specified that we want no more than 50
-# words.
-#
-print(summarize(text, word_count=50))
-
-###############################################################################
-# As mentioned earlier, this module also supports **keyword** extraction.
-# Keyword extraction works in the same way as summary generation (i.e. sentence
-# extraction), in that the algorithm tries to find words that are important or
-# seem representative of the entire text. The keywords are not always single
-# words; in the case of multi-word keywords, they are typically all nouns.
-#
-
-from gensim.summarization import keywords
-print(keywords(text))
-
-###############################################################################
-# Larger example
-# --------------
-#
-# Let us try an example with a larger piece of text. We will be using a
-# synopsis of the movie "The Matrix", which we have taken from `this
-# `_ IMDb page.
-#
-# In the code below, we read the text file directly from a web page using
-# "requests". Then we produce a summary and some keywords. 
-# - - -import requests - -text = requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text -print(text) - -############################################################################### -# First, the summary -# -print(summarize(text, ratio=0.01)) - - -############################################################################### -# And now, the keywords: -# -print(keywords(text, ratio=0.01)) - -############################################################################### -# If you know this movie, you see that this summary is actually quite good. We -# also see that some of the most important characters (Neo, Morpheus, Trinity) -# were extracted as keywords. -# -# Another example -# --------------- -# -# Let's try an example similar to the one above. This time, we will use the IMDb synopsis -# `The Big Lebowski `_. -# -# Again, we download the text and produce a summary and some keywords. -# - - -text = requests.get('http://rare-technologies.com/the_big_lebowski_synopsis.txt').text -print(text) -print(summarize(text, ratio=0.01)) -print(keywords(text, ratio=0.01)) - -############################################################################### -# This time around, the summary is not of high quality, as it does not tell us -# much about the movie. In a way, this might not be the algorithms fault, -# rather this text simply doesn't contain one or two sentences that capture the -# essence of the text as in "The Matrix" synopsis. -# -# The keywords, however, managed to find some of the main characters. -# -# Performance -# ----------- -# -# We will test how the speed of the summarizer scales with the size of the -# dataset. These tests were run on an Intel Core i5 4210U CPU @ 1.70 GHz x 4 -# processor. Note that the summarizer does **not** support multithreading -# (parallel processing). -# -# The tests were run on the book "Honest Abe" by Alonzo Rothschild. Download -# the book in plain-text `here `__. -# -# In the **plot below** , we see the running times together with the sizes of -# the datasets. To create datasets of different sizes, we have simply taken -# prefixes of text; in other words we take the first **n** characters of the -# book. The algorithm seems to be **quadratic in time** , so one needs to be -# careful before plugging a large dataset into the summarizer. - -import matplotlib.pyplot as plt -import matplotlib.image as mpimg -img = mpimg.imread('summarization_tutorial_plot.png') -imgplot = plt.imshow(img) -plt.axis('off') -plt.show() - -############################################################################### -# Text-content dependent running times -# ------------------------------------ -# -# The running time is not only dependent on the size of the dataset. For -# example, summarizing "The Matrix" synopsis (about 36,000 characters) takes -# about 3.1 seconds, while summarizing 35,000 characters of this book takes -# about 8.5 seconds. So the former is **more than twice as fast**. -# -# One reason for this difference in running times is the data structure that is -# used. The algorithm represents the data using a graph, where vertices (nodes) -# are sentences, and then constructs weighted edges between the vertices that -# represent how the sentences relate to each other. This means that every piece -# of text will have a different graph, thus making the running times different. -# The size of this data structure is **quadratic in the worst case** (the worst -# case is when each vertex has an edge to every other vertex). 
-# -# Another possible reason for the difference in running times is that the -# problems converge at different rates, meaning that the error drops slower for -# some datasets than for others. -# -# Montemurro and Zanette's entropy based keyword extraction algorithm -# ------------------------------------------------------------------- -# -# `This paper `__ describes a technique to -# identify words that play a significant role in the large-scale structure of a -# text. These typically correspond to the major themes of the text. The text is -# divided into blocks of ~1000 words, and the entropy of each word's -# distribution amongst the blocks is caclulated and compared with the expected -# entropy if the word were distributed randomly. -# - - -import requests -from gensim.summarization import mz_keywords - -text=requests.get("http://www.gutenberg.org/files/49679/49679-0.txt").text -print(mz_keywords(text,scores=True,threshold=0.001)) - -############################################################################### -# By default, the algorithm weights the entropy by the overall frequency of the -# word in the document. We can remove this weighting by setting weighted=False -# -print(mz_keywords(text,scores=True,weighted=False,threshold=1.0)) - -############################################################################### -# When this option is used, it is possible to calculate a threshold -# automatically from the number of blocks -# -print(mz_keywords(text,scores=True,weighted=False,threshold="auto")) - -############################################################################### -# The complexity of the algorithm is **O**\ (\ *Nw*\ ), where *N* is the number -# of words in the document and *w* is the number of unique words. -# diff --git a/docs/src/auto_examples/tutorials/run_summarization.py.md5 b/docs/src/auto_examples/tutorials/run_summarization.py.md5 deleted file mode 100644 index 7d7a40b1f2..0000000000 --- a/docs/src/auto_examples/tutorials/run_summarization.py.md5 +++ /dev/null @@ -1 +0,0 @@ -fe6bd7ae71fe713de661c4fbf9b3b3b6 \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_summarization.rst b/docs/src/auto_examples/tutorials/run_summarization.rst deleted file mode 100644 index e9c45a6953..0000000000 --- a/docs/src/auto_examples/tutorials/run_summarization.rst +++ /dev/null @@ -1,2359 +0,0 @@ -.. note:: - :class: sphx-glr-download-link-note - - Click :ref:`here ` to download the full example code -.. rst-class:: sphx-glr-example-title - -.. _sphx_glr_auto_examples_tutorials_run_summarization.py: - - -Text Summarization -================== - -Demonstrates summarizing text by extracting the most important sentences from it. - -.. code-block:: default - - import logging - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - - - - - - - -This module automatically summarizes the given text, by extracting one or -more important sentences from the text. In a similar way, it can also extract -keywords. This tutorial will teach you to use this summarization module via -some examples. First, we will try a small example, then we will try two -larger ones, and then we will review the performance of the summarizer in -terms of speed. - -This summarizer is based on the , from an `"TextRank" algorithm by Mihalcea -et al `_. -This algorithm was later improved upon by `Barrios et al. -`_, -by introducing something called a "BM25 ranking function". - -.. 
important:: - Gensim's summarization only works for English for now, because the text - is pre-processed so that stopwords are removed and the words are stemmed, - and these processes are language-dependent. - -Small example -------------- - -First of all, we import the :py:func:`gensim.summarization.summarize` function. - - -.. code-block:: default - - - - from pprint import pprint as print - from gensim.summarization import summarize - - - - - - - -We will try summarizing a small toy example; later we will use a larger piece of text. In reality, the text is too small, but it suffices as an illustrative example. - - - -.. code-block:: default - - - - text = ( - "Thomas A. Anderson is a man living two lives. By day he is an " - "average computer programmer and by night a hacker known as " - "Neo. Neo has always questioned his reality, but the truth is " - "far beyond his imagination. Neo finds himself targeted by the " - "police when he is contacted by Morpheus, a legendary computer " - "hacker branded a terrorist by the government. Morpheus awakens " - "Neo to the real world, a ravaged wasteland where most of " - "humanity have been captured by a race of machines that live " - "off of the humans' body heat and electrochemical energy and " - "who imprison their minds within an artificial reality known as " - "the Matrix. As a rebel against the machines, Neo must return to " - "the Matrix and confront the agents: super-powerful computer " - "programs devoted to snuffing out Neo and the entire human " - "rebellion. " - ) - print(text) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - ('Thomas A. Anderson is a man living two lives. By day he is an average ' - 'computer programmer and by night a hacker known as Neo. Neo has always ' - 'questioned his reality, but the truth is far beyond his imagination. Neo ' - 'finds himself targeted by the police when he is contacted by Morpheus, a ' - 'legendary computer hacker branded a terrorist by the government. Morpheus ' - 'awakens Neo to the real world, a ravaged wasteland where most of humanity ' - "have been captured by a race of machines that live off of the humans' body " - 'heat and electrochemical energy and who imprison their minds within an ' - 'artificial reality known as the Matrix. As a rebel against the machines, Neo ' - 'must return to the Matrix and confront the agents: super-powerful computer ' - 'programs devoted to snuffing out Neo and the entire human rebellion. ') - - -To summarize this text, we pass the **raw string data** as input to the -function "summarize", and it will return a summary. - -Note: make sure that the string does not contain any newlines where the line -breaks in a sentence. A sentence with a newline in it (i.e. a carriage -return, "\n") will be treated as two sentences. - - - -.. code-block:: default - - - print(summarize(text)) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - ('Morpheus awakens Neo to the real world, a ravaged wasteland where most of ' - 'humanity have been captured by a race of machines that live off of the ' - "humans' body heat and electrochemical energy and who imprison their minds " - 'within an artificial reality known as the Matrix.') - - -Use the "split" option if you want a list of strings instead of a single string. - - - -.. code-block:: default - - print(summarize(text, split=True)) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. 
code-block:: none - - ['Morpheus awakens Neo to the real world, a ravaged wasteland where most of ' - 'humanity have been captured by a race of machines that live off of the ' - "humans' body heat and electrochemical energy and who imprison their minds " - 'within an artificial reality known as the Matrix.'] - - -You can adjust how much text the summarizer outputs via the "ratio" parameter -or the "word_count" parameter. Using the "ratio" parameter, you specify what -fraction of sentences in the original text should be returned as output. -Below we specify that we want 50% of the original text (the default is 20%). - - - -.. code-block:: default - - - print(summarize(text, ratio=0.5)) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - ('By day he is an average computer programmer and by night a hacker known as ' - 'Neo. Neo has always questioned his reality, but the truth is far beyond his ' - 'imagination.\n' - 'Morpheus awakens Neo to the real world, a ravaged wasteland where most of ' - 'humanity have been captured by a race of machines that live off of the ' - "humans' body heat and electrochemical energy and who imprison their minds " - 'within an artificial reality known as the Matrix.\n' - 'As a rebel against the machines, Neo must return to the Matrix and confront ' - 'the agents: super-powerful computer programs devoted to snuffing out Neo and ' - 'the entire human rebellion.') - - -Using the "word_count" parameter, we specify the maximum amount of words we -want in the summary. Below we have specified that we want no more than 50 -words. - - - -.. code-block:: default - - print(summarize(text, word_count=50)) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - ('Morpheus awakens Neo to the real world, a ravaged wasteland where most of ' - 'humanity have been captured by a race of machines that live off of the ' - "humans' body heat and electrochemical energy and who imprison their minds " - 'within an artificial reality known as the Matrix.') - - -As mentioned earlier, this module also supports **keyword** extraction. -Keyword extraction works in the same way as summary generation (i.e. sentence -extraction), in that the algorithm tries to find words that are important or -seem representative of the entire text. They keywords are not always single -words; in the case of multi-word keywords, they are typically all nouns. - - - -.. code-block:: default - - - from gensim.summarization import keywords - print(keywords(text)) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - 'neo\nhumanity\nhuman\nhumans body\nsuper\nreality\nhacker' - - -Larger example --------------- - -Let us try an example with a larger piece of text. We will be using a -synopsis of the movie "The Matrix", which we have taken from `this -`_ IMDb page. - -In the code below, we read the text file directly from a web-page using -"requests". Then we produce a summary and some keywords. - - - -.. code-block:: default - - - - import requests - - text = requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text - print(text) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - ('The screen is filled with green, cascading code which gives way to the ' - 'title, The Matrix.\r\n' - '\r\n' - 'A phone rings and text appears on the screen: "Call trans opt: received. 
' - '2-19-98 13:24:18 REC: Log>" As a conversation takes place between Trinity ' - '(Carrie-Anne Moss) and Cypher (Joe Pantoliano), two free humans, a table of ' - 'random green numbers are being scanned and individual numbers selected, ' - 'creating a series of digits not unlike an ordinary phone number, as if a ' - 'code is being deciphered or a call is being traced.\r\n' - '\r\n' - 'Trinity discusses some unknown person. Cypher taunts Trinity, suggesting she ' - 'enjoys watching him. Trinity counters that "Morpheus (Laurence Fishburne) ' - 'says he may be \'the One\'," just as the sound of a number being selected ' - 'alerts Trinity that someone may be tracing their call. She ends the call.\r\n' - '\r\n' - "Armed policemen move down a darkened, decrepit hallway in the Heart O' the " - 'City Hotel, their flashlight beam bouncing just ahead of them. They come to ' - 'room 303, kick down the door and find a woman dressed in black, facing away ' - "from them. It's Trinity. She brings her hands up from the laptop she's " - 'working on at their command.\r\n' - '\r\n' - 'Outside the hotel a car drives up and three agents appear in neatly pressed ' - 'black suits. They are Agent Smith (Hugo Weaving), Agent Brown (Paul ' - 'Goddard), and Agent Jones (Robert Taylor). Agent Smith and the presiding ' - 'police lieutenant argue. Agent Smith admonishes the policeman that they were ' - 'given specific orders to contact the agents first, for their protection. The ' - 'lieutenant dismisses this and says that they can handle "one little girl" ' - 'and that he has two units that are bringing her down at that very moment. ' - 'Agent Smith replies: "No, Lieutenant. Your men are already dead."\r\n' - '\r\n' - 'Inside, Trinity easily defeats the six policemen sent to apprehend her, ' - 'using fighting and evasion techniques that seem to defy gravity. She calls ' - "Morpheus, letting him know that the line has been traced, though she doesn't " - 'know how. Morpheus informs her that she will have to "make it to another ' - 'exit," and that Agents are heading up after her.\r\n' - '\r\n' - 'A fierce rooftop chase ensues with Trinity and an Agent leaping from one ' - 'building to the next, astonishing the policemen left behind. Trinity makes a ' - 'daring leap across an alley and through a small window. She has momentarily ' - 'lost her pursuers and makes it to a public phone booth on the street level. ' - 'The phone begins to ring. As she approaches it a garbage truck, driven by ' - 'Agent Smith, careens towards the phone booth. Trinity makes a desperate dash ' - 'to the phone, picking it up just moments before the truck smashes the booth ' - 'into a brick wall. The three Agents reunite at the front of the truck. There ' - 'is no body in the wreckage. "She got out," one says. The other says, "The ' - 'informant is real." "We have the name of their next target," says the other, ' - '"His name is Neo."\r\n' - '\r\n' - 'Neo (Keanu Reeves), a hacker with thick black hair and a sallow appearance, ' - 'is asleep at his monitor. Notices about a manhunt for a man named Morpheus ' - "scroll across his screen as he sleeps. Suddenly Neo's screen goes blank and " - 'a series of text messages appear: "Wake up, Neo." "The Matrix has you." ' - '"Follow the White Rabbit." Then, the text says "Knock, knock, Neo..." just ' - "as he reads it, a knock comes at the door of his apartment, 101. It's a " - 'group of ravers and Neo gives them a contraband disc he has secreted in a ' - 'copy of Simulacra and Simulation. 
The lead raver asks him to join them and ' - 'Neo demurs until he sees the tattoo of a small white rabbit on the shoulder ' - 'of a seductive girl in the group.\r\n' - '\r\n' - "At a rave bar Neo stands alone and aloof as the group he's with continue " - 'partying. Trinity approaches him and introduces herself. Neo recognizes her ' - 'name; she was a famous hacker and had cracked the IRS database. She tells ' - 'him that he is in great danger, that they are watching him and that she ' - 'knows that he is searching for answers, particularly to the most important ' - 'question of all: what is the Matrix? The pulsing music of the bar gives way ' - "to the repetitious blare of Neo's alarm clock; it's 9:18 and he's late for " - 'work.\r\n' - '\r\n' - 'At his job at Metacortex, a leading software company housed in an ominous ' - 'high rise, Neo is berated by his boss for having a problem with authority, ' - "for thinking he's special. Neo listens to his boss, but his attention is on " - 'the persons cleaning the window of the office. Back at his bleak cubicle Neo ' - 'receives a delivery as "Thomas Anderson." Upon opening the package he finds ' - 'a cellphone which immediately rings. On the other end is Morpheus, who ' - 'informs Neo that they\'ve both run out of time and that "they" are coming ' - 'for him. Morpheus tells him to slowly look up, toward the elevator. Agents ' - 'Smith, Jones, and Brown are there, obviously looking for him, as a woman ' - "points towards Neo's cube. Morpheus tries to guide Neo out of the building " - 'but when he is instructed to get on a scaffolding and take it to the roof ' - "Neo rejects Morpheus's advice, allowing himself to be taken by the " - 'Agents.\r\n' - '\r\n' - "In an interrogation room the Agents confront Neo. They've had their eye on " - 'him for some time. He lives a dual existence: one life as Thomas A. ' - 'Anderson, a software engineer for a Metacortex, the other life as Neo, a ' - 'computer hacker "guilty of virtually every computer crime we have a law ' - 'for." Agent Smith asks him to help them capture Morpheus, a dangerous ' - 'terrorist, in exchange for amnesty. Neo gives them the finger and asks for ' - "his phone call. Mr. Smith asks what good is a phone call if he's unable to " - 'speak. Neo finds that his lips have fused together. Panicked, he is thrown ' - 'on the interrogation table by the Agents and they implant a shrimp-like ' - 'probe, a bug, in his stomach, entering through his belly-button.\r\n' - '\r\n' - 'Neo awakens with a start in his own bed, assuming it has all been a bad ' - 'dream. His phone rings and Morpheus is on the other line. He tells Neo that ' - "the line is tapped but they've underestimated his importance. Morpheus tells " - 'Neo he is the One and to meet him at the Adams St. bridge. There he is ' - 'picked up by Trinity and two others in a car; they all wear black latex and ' - 'leather. A woman in the front seat, Switch (Belinda McClory), pulls a gun on ' - "him and tells him to take off his shirt. Trinity tells him it's for their " - 'mutual protection and that he has to trust her. He takes off his shirt and ' - 'she uses a device to remove the probe that Neo believed had been part of a ' - 'nightmare. Trinity drops the bug out into the road where it slowly goes dark ' - 'in the rain.\r\n' - '\r\n' - "Trinity takes Neo to Morpheus. Morpheus explains that he's been searching " - 'for Neo his entire life and asks if Neo feels like "Alice in Wonderland, ' - 'falling down the rabbit hole." 
He explains to Neo that they exist in the ' - 'Matrix, a false reality that has been constructed for humans to hide the ' - 'truth. The truth is that everyone in the world is a slave, born into ' - 'bondage. Morpheus holds out two pills. In his left palm is a blue pill. If ' - 'Neo takes it he will wake up in his bed and "believe whatever you want to ' - 'believe." But if he takes the red pill in Morpheus\'s right hand, then "you ' - 'stay in Wonderland and I show you how deep the rabbit hole goes." Neo takes ' - 'the red pill.\r\n' - '\r\n' - "As the rest of Morpheus's crew straps him into a chair, Neo is told that " - 'pill he took is part of a trace program, to "disrupt his input/output ' - 'carrier signal" so that they can pinpoint him. Neo looks at a shattered ' - 'mirror placed next to him which miraculously reforms itself. Neo touches the ' - 'surface and the silver begins to creep over his skin, engulfing him as ' - "Morpheus's crew attempt to locate something on the monitors around them. The " - 'silver takes Neo over and he blacks out.\r\n' - '\r\n' - 'He awakens inside a pinkish/purple embryonic pod, extending from the side of ' - 'a circular building, a massive power plant. He is hairless and naked, with ' - 'thick black tubes snaking down his throat, plugged into the back of his ' - 'skull, his spine, and invading most of the rest of his body. He finds his ' - 'pod is open and that he is surrounded by tower after tower of pods just like ' - 'his, all filled with bodies. Suddenly a menacing, hovering nurse robot grabs ' - 'him by the throat. The tubes detach and Neo is flushed down a tube into an ' - "underground pool of filthy water. Just as he's about to drown in the muck a " - 'hovercraft appears above him, snags him and hauls him into its cargo bay. ' - "Neo finds himself surrounded by Morpheus's crew again, but they are dressed " - 'differently, in simple knit garments. Just before Neo passes out Morpheus ' - 'says to him, "Welcome to the real world."\r\n' - '\r\n' - 'Neo drifts in and out of consciousness. At one point he asks, "Am I dead?" ' - '"Far from it," replies Morpheus. Again he wakes, his body a pincushion of ' - 'acupuncture. "Why do my eyes hurt?" he asks. "You\'ve never used them," ' - 'Morpheus replies.\r\n' - '\r\n' - 'Neo finally wakes, fully clothed, with a short shock of hair on his head. He ' - 'removes a connector that is sunk deep into his arm and reaches to find the ' - 'large socket at the back of his neck when Morpheus enters the room. "What is ' - 'this place?" Neo asks. "The more important question is when," says Morpheus, ' - '"You believe it is the year 1999, when in fact it is closer to the year ' - '2199." Morpheus goes on to say that they really don\'t know when it is. He ' - 'gives Neo a tour of his ship, the Nebuchadnezzar (they pass a plaque stating ' - "it was built in 2069). Neo is introduced to Morpheus's crew including " - 'Trinity; Apoc (Julian Arahanga), a man with long, flowing black hair; ' - 'Switch; Cypher (bald with a goatee); two brawny brothers, Tank (Marcus ' - 'Chong) and Dozer (Anthony Ray Parker); and a young, thin man named Mouse ' - '(Matt Doran).\r\n' - '\r\n' - 'Morpheus gets to the point. "You wanted to know about the Matrix," he says, ' - 'ushering him to a chair. Neo sits down in it and Trinity straps him in. A ' - "long probe is inserted into the socket at the back of Neo's skull.\r\n" - '\r\n' - 'Neo wakes in a world of all white. 
He is in the Construct, a "loading ' - 'platform" that Morpheus and his team use to prepare newly freed humans to ' - "deal with the Matrix world. Gone are the sockets in Neo's arms and neck. He " - 'has hair again. Morpheus tells him that what he is experiencing of himself ' - 'is the "residual self image, the mental projection of your digital self" and ' - 'bids him to sit while he explains the truth. "This," he says, showing an ' - 'image of a modern city, "is the world that you know." A thing that really ' - 'exists "only as part of a neural, interactive simulation that we call the ' - 'Matrix."\r\n' - '\r\n' - 'Morpheus then shows Neo the world as it truly exists today, a scarred, ' - 'desolate emptiness with charred, abandoned buildings, black earth, and a ' - 'shrouded sky.\r\n' - '\r\n' - 'Morpheus goes on to say that "at some point in the early 21st century all of ' - 'mankind was united in celebration as we gave birth" to artificial ' - 'intelligence, a "singular consciousness that birthed an entire race of ' - 'machines."\r\n' - '\r\n' - 'Someone started a war, and no one knows who, but it was known that it was ' - 'mankind who blotted out the sky, attempting to deprive the machines of the ' - 'solar power they required to function. Instead the machines turned to humans ' - 'as a power source; Mopheus explains that a human\'s body provides "more ' - 'electricity than a 120 volt battery and over 25k BTUs in body heat." ' - 'Morpheus shows Neo fields where machines grow human beings, connecting them ' - 'to their outlets, ensconcing them in their pods, and feeding them with the ' - 'liquefied remains of other human beings. "The Matrix," says Morpheus, "is a ' - 'computer-generated dreamworld created to keep us under control, to turn ' - 'us..." into a mere power source, into coppertop batteries.\r\n' - '\r\n' - 'Neo rejects this information so feverishly that he pulls himself out of the ' - 'Construct. He is back in the chair on the hovercraft. He fights to free ' - 'himself from this harsh reality, only to end up vomiting on the floor and ' - 'passing out.\r\n' - '\r\n' - 'When Neo wakes up in his bunk, Morpheus is beside him. "I can\'t go back, ' - 'can I?" Neo asks. "No," says Morpheus. He apologizes to Neo for breaking a ' - "cardinal rule: after a certain age people aren't brought out of their " - 'simulacrum, but Morpheus explains he had to bring Neo out. When the Matrix ' - 'was created there was a man born inside it who could create his own reality ' - 'inside it. It was this man who set Morpheus and the others free. When he ' - 'died, the Oracle (Gloria Foster) prophesied that he would return in another ' - 'form. And that the return of the One would mean the destruction of the ' - 'Matrix. As long as the Matrix exists, humanity will continue to live in ' - 'complacency inside it and the world can never be free. "I did what I did ' - 'because I believe that search is over," says Morpheus.\r\n' - '\r\n' - 'The next day Neo starts his training. Tank is his operator. Tank and his ' - 'brother Dozer are "100% pure old-fashioned, homegrown human. Born in the ' - 'real world; a genuine child of Zion." Zion, Tank explains, is the last human ' - 'city, buried deep in the earth, near the core, for warmth. Tank straps Neo ' - 'back into the jack-in chair, by-passes some preliminary programs and loads ' - 'him up with combat training, starting with Jiu Jitsu. When Tank hits "load" ' - 'Neo is shocked by the force of the knowledge pouring into him. 
"I think he ' - 'likes it," says Tank, "want some more?" "Hell yes," replies Neo. Neo is fed ' - 'a series of martial arts techniques including Kempo, Tae Kwon Do, Drunken ' - "Boxing and Kung Fu. Morpheus and Tank are amazed at Neo's ability to ingest " - 'information, but Morpheus wants to test Neo.\r\n' - '\r\n' - 'Morpheus and Neo stand in a sparring program. The program has rules, like ' - 'gravity. But as in many computer programs, some rules can be bent while ' - 'others can be broken. Morpheus bids Neo to hit him, if he can. They fight ' - 'with Neo impressively attacking but Morpheus easily parrying and subduing ' - 'him. The rest of the crew gathers around the monitors to watch the fight. ' - 'Morpheus ends up kicking Neo into a beam, explaining to him that the reason ' - 'he has beaten him has nothing to do with muscles or reality. They spar ' - 'again. "What are you waiting for?" Morpheus asks him. "You\'re faster than ' - 'this!" Neo finally brings a punch near his teacher\'s face. They can move ' - 'on.\r\n' - '\r\n' - 'A jump program is loaded. Both men now stand on one of several tall ' - 'buildings in a normal city skyline. Morpheus tells Neo he must free his mind ' - 'and leaps from one building to the next. Neo nervously tries to follow him ' - "and doesn't make the jump, falling to the pavement below. Neo wakes back in " - 'the Nebudchanezzar with blood in his mouth. "I thought it wasn\'t real," he ' - 'says. "Your mind makes it real," replies Morpheus. "So, if you die in the ' - 'Matrix, you die here?" "The body cannot live without the mind," says ' - 'Morpheus, underlining the very real danger faced in the simulation.\r\n' - '\r\n' - 'Later, Trinity brings Neo dinner. Outside his room, Cypher remarks that ' - 'Trinity never brought him dinner. He asks Trinity why, if Morpheus thinks ' - "Neo is the One, he hasn't taken him to see the Oracle yet. Trinity says " - "he'll take him when he's ready.\r\n" - '\r\n' - 'Morpheus and Neo are walking down a standard city street in what appears to ' - 'be the Matrix. Morpheus explains that the Matrix is a system and that the ' - 'system is their enemy. All the people that inhabit it, the people they are ' - 'trying to free, are part of that system. Some are so inert, so dependent ' - 'upon the Matrix that they can never be free. Neo notices a stunning girl in ' - 'a red dress. "Are you listening to me?" asks Morpheus. He asks Neo to look ' - 'at the girl again. Neo turns to face Agent Smith, pointing a gun straight at ' - 'his head. Morpheus stops the simulation, which has just been created to look ' - 'like the Matrix.\r\n' - '\r\n' - 'Neo asks what the Agents are. "Sentient programs," says Morpheus, that "can ' - 'move in and out of any software hard-wired into their system, meaning that ' - 'they can take over anyone in the Matrix program. "Inside the Matrix," ' - 'Morpheus says, "They are everyone and they are no one." Thus Morpheus and ' - 'his crew survive the Agents by running from them and hiding from the Agents ' - 'even though they "are guarding all the doors. They are holding all the keys ' - 'and sooner or later, someone is going to have to fight them." But no one who ' - 'has ever stood up to an Agent has survived; all have died. Still, Morpheus ' - 'is certain that because the Agents live in a world of rules that they can ' - 'never be as strong, never be as fast as he can be. "What are you trying to ' - 'tell me," asks Neo, "That I can dodge bullets?" "When you\'re ready," ' - 'Morpheus says, "You won\'t have to." 
Just then Morpheus gets a phone call. ' - '"We\'ve got trouble," Cypher says on the other line.\r\n' - '\r\n' - 'The Nebuchadnezzar is on alert. They see the holographic image of a squiddy, ' - 'a search and destroy sentinel, which is on their trail. They set the ship ' - 'down in a huge sewer system and turn off the power. Tank stands at the ready ' - 'switch of an EMP, electro-magnetic pulse, the only weapon man has against ' - 'the machines in the real world. Two squiddies search for the ship -- the ' - 'crew can see them -- but they move on.\r\n' - '\r\n' - 'Neo startles Cypher, who is working at a computer console streaming with ' - 'green code. Cypher offers Neo a drink and says that he knows what Neo is ' - 'thinking, "Why, oh why didn\'t I take the blue pill?" Neo laughs but is ' - "unsettled. Cypher asks Neo if Morpheus has told him why he's here. Neo nods. " - '"What a mind job," says Cypher, "so you\'re here to save the world."\r\n' - '\r\n' - 'Cypher is now in a fancy restaurant with Agent Smith in the Matrix. Agent ' - 'Smith asks if they have a deal. Cypher cuts up a juicy steak and ruminates ' - 'that he knows the steak is merely the simulation telling his brain that it ' - 'is delicious and juicy, but after nine years he has discovered that ' - '"ignorance is bliss." He strikes a deal for the machines to reinsert his ' - "body into a power plant, reinsert him into the Matrix, and he'll help the " - 'Agents. He wants to be rich and powerful, "an actor" maybe. Smith says he ' - "wants access codes to the mainframe in Zion. Cypher says he can't do that, " - 'but that he can get him the man who does, meaning Morpheus.\r\n' - '\r\n' - "Meanwhile, inside the Nebuchadnezzar's small dining room in the real world, " - 'the rest of the crew is trying to choke down the oatmeal-gruel that they ' - 'have as sustenance. Mouse muses on the mistakes the machines may have made ' - "trying to get sensations right, like the taste of chicken. Since they didn't " - 'know what it tasted like they let everything taste like it. Morpheus ' - "interrupts the meal, announcing that he's taking Neo to see the Oracle.\r\n" - '\r\n' - 'Morpheus, Trinity, Neo, Apoc, Switch, Mouse and Cypher are jacked into the ' - 'Matrix. As they walk out of a warehouse Cypher secretly throws his cell ' - 'phone into the garbage. On the car ride to the Oracle, Neo asks Trinity if ' - "she has seen the Oracle. Trinity says that she has but when she's asked just " - 'what she was told by the Oracle, she refuses to answer.\r\n' - '\r\n' - 'The Oracle, Morpheus explains, has been with them since the beginning of the ' - 'Resistance. She is the one who made the Prophecy of the One and that ' - 'Morpheus would be the one to find him. She can help Neo find the path, he ' - 'says. He enters the apartment of the Oracle. Inside are the other ' - 'potentials: a mother figure and numerous children. One child levitates ' - 'blocks, one reads Asian literature, another is playing chess. One bald child ' - 'is bending spoons. He gives one spoon to Neo and says, "Do not try and bend ' - "the spoon, that's impossible. Instead, only try to realize the truth...that " - 'there is no spoon." Neo bends the spoon as he\'s called in to see the ' - 'Oracle.\r\n' - '\r\n' - 'The Oracle is baking cookies. She sizes Neo up and asks him whether he ' - 'thinks he is the One. Neo admits that he does not know and the Oracle does ' - 'not enlighten him. Neo smiles and the Oracle asks him what is funny. 
Neo ' - 'admits that Morpheus had almost convinced him that he was the One. She ' - 'accepts this and prophesies that Morpheus believes in Neo so much that he ' - 'plans to sacrifice himself. She tells Neo that either he or Morpheus will ' - 'die, and that Neo will have the power to choose which one it will be. She ' - 'then offers him a cookie and promises him that he will feel fine as soon as ' - "he's done eating it.\r\n" - '\r\n' - 'As the crew returns to their jack point, many floors up in an old hotel, ' - 'Tank, in the control room, notices something odd. Meanwhile Neo, walking up ' - 'the stairs, sees what appears to be the same cat cross a room twice. "Deja ' - 'vu," he says, which gets the attention of Trinity and Morpheus. Deja vu, ' - 'they explain to him, is a glitch in the Matrix; it happens when they reset ' - 'the computer parameters. Outside, the phone line is cut. Mouse runs to a ' - 'window which has now been bricked in. They are trapped. Mouse picks up two ' - "machine guns but he's no match for the police coming into the room. He's " - 'riddled with bullets.\r\n' - '\r\n' - 'Back on the Nebuchadnezzar, the real Mouse spurts blood from his mouth and ' - 'dies in the chair.\r\n' - '\r\n' - 'More police and Agents stream into the bottom of the hotel. Morpheus has ' - "Tank find a layout of the building they're in, locating the main wet wall. " - "The Agents arrive on the floor they're on, finding a coat that Cypher has " - 'left behind. They only find a hole in the bathroom wall. Meanwhile the crew ' - 'is climbing down the plumbing of the wet wall. As the police approach Cypher ' - 'sneezes, once more giving them away. The police open fire. The crew, ' - 'including Neo, begin to fire back.\r\n' - '\r\n' - 'An Agent takes over the body of one of the policemen, reaches into the wall, ' - 'and grabs Neo by the neck. Morpheus, who is above Neo in the walls, breaks ' - 'through the wall and lands on the agent, yelling to Trinity to get Neo out ' - 'of the building.\r\n' - '\r\n' - 'A fierce battle between Agent Smith and Morpheus ends with Morpheus face ' - 'down on the tile. Agent Smith sends the police unit in to beat him with ' - 'their batons.\r\n' - '\r\n' - 'Cypher returns to the Nebuchadnezzar before Trinity, Neo, Switch and Apoc. ' - 'As Tank attempts to bring the others back, Cypher attacks him from behind ' - 'with an electronic weapon. Dozer attempts to tackle Cypher, but Cypher ' - 'electrocutes him as well.\r\n' - '\r\n' - 'Trinity attempts to call Tank but Cypher pulls the headset off of the ' - 'smoking remains of Tank and answers. As Cypher talks to Trinity inside the ' - 'Matrix he leans over the still form of Trinity in the hovercraft. Cypher ' - 'recounts the things he hates about the real world, the war, the cold, the ' - 'goop they have to eat, but most especially Morpheus and his beliefs. "He ' - 'lied to us, Trinity."\r\n' - '\r\n' - "Cypher pulls the plug out of the back of Apoc's head, and Apoc falls down " - 'dead in the Matrix. Cypher then moves to Switch and as she protests "Not ' - 'like this..." in the Matrix, Cypher kills her on the ship. She falls down ' - "dead before Trinity and Neo. Cypher moves on to Neo's supine form, saying " - 'that if Neo is the One, a miracle will prevent Cypher from killing him:\r\n' - '\r\n' - '"How can he be the One, if he\'s dead?" he asks. He continues badgering ' - 'Trinity, asking her if she believes that Neo is the One. She says, "Yes." ' - 'Cypher screams back "No!" 
but his reaction is incredulity at seeing Tank ' - 'still alive, brandishing the weapon that Cypher had used on him. Tank fries ' - 'Cypher with the electrical device.\r\n' - '\r\n' - 'Tank brings Trinity back and she finds out that Dozer is dead.\r\n' - '\r\n' - 'Meanwhile Agent Smith, a tray of torture instruments near him, marvels at ' - 'the beauty of the Matrix as he gazes out at the city all around them. He ' - 'informs Morpheus, who is tied to a chair, that the first Matrix was designed ' - 'as a utopia, engineered to make everyone happy. "It was a disaster," says ' - 'Agent Smith, people wouldn\'t accept the program and "entire crops were ' - 'lost." "Some believed," continues Smith, "that we lacked the programming ' - 'language to describe your perfect world. But I believe that, as a species, ' - 'human beings define their reality through misery and suffering. The perfect ' - 'world was a dream that your primitive cerebrum kept trying to wake up from. ' - 'Which is why the Matrix was redesigned." Agent Smith compares humans to ' - 'dinosaurs and that evolution is taking hold. Another Agent enters and relays ' - 'that there may be a problem (as they now know that Cypher has failed).\r\n' - '\r\n' - 'Back on the hovercraft the shuddering form of Morpheus betrays the torture ' - "he's being put through by the Agents in the Matrix. Tank realizes that " - "they're trying to get the codes to the mainframes of Zion's computers; each " - "ship's captain knows them. Because a breach of Zion's defenses would mean " - 'that the last remaining vestiges of mankind would be wiped out, Tank says ' - 'their only choice is to unplug Morpheus, effectively killing him.\r\n' - '\r\n' - 'Back in the Matrix, the Agents process their next move. If Cypher is dead, ' - 'they deduce that the remaining humans on the ship will terminate Morpheus. ' - 'They decide to stick to their original plan and to deploy the Sentinels.\r\n' - '\r\n' - 'Tank is performing what amounts to last rites for Morpheus, laying one hand ' - 'on his head as his other moves to the back of his skull to remove the jack. ' - "Just as he's about to pull it out Neo stops him. He realizes that the Oracle " - 'was right. He now has to make the choice to save himself or to save ' - 'Morpheus; his choice is to head back into the Matrix. Trinity rejects the ' - 'idea. Morpheus gave himself up so that Neo could be saved since he is the ' - 'One.\r\n' - '\r\n' - '"I\'m not the One, Trinity," Neo says, relaying his understanding of the ' - 'discussion with the Oracle: she did not enlighten him as to whether he was ' - 'the promised messiah. And, since Morpheus was willing to sacrifice himself, ' - "Neo knows that he must do that same. Tank calls it suicide; it's a military " - 'building with Agents inside. Neo says he only knows that he can bring ' - 'Morpheus out. Trinity decides to come with him, reasoning with Neo that he ' - 'will need her help and she\'s the ranking officer on the ship. "Tank," she ' - 'says, "load us up!"\r\n' - '\r\n' - 'Meanwhile Agent Smith continues to share his musings with a brutalized ' - 'Morpheus. Because humans spread to an area, consume the natural resources ' - 'and, to survive, must spread to another area, Smith says we are not mammals ' - 'but viruses, the only other creature that acts that way.\r\n' - '\r\n' - 'In the Construct, Neo and Trinity get armaments. "Neo," protests Trinity, ' - '"No one has ever done anything like this." 
"That\'s why it\'s going to ' - 'work," he replies.\r\n' - '\r\n' - 'Morpheus has yet to break and Smith asks the other Agents why the serum ' - 'isn\'t working. "Maybe we\'re asking the wrong questions," responds one. To ' - 'that Smith commands the other Agents to leave him alone with Morpheus. Smith ' - 'removes his earphone and his glasses and confides that he hates the Matrix, ' - '"this zoo, this prison." Smith admits that he must get out of this ' - '"reality." He hates the stench. He\'s sure that some element of the humans ' - 'will rub off on him and that Morpheus holds the key to his release. If there ' - 'is no Zion there\'s no need for Smith to be in the Matrix. "You are going to ' - 'tell me, or you are going to die."\r\n' - '\r\n' - 'Downstairs, in the lobby, Trinity and Neo enter, heavily armed. They shoot ' - 'their way past the guards and a group of soldiers and make their way into ' - 'the elevator.\r\n' - '\r\n' - 'Agents Brown and Jones enter the interrogation room to find Smith with his ' - "hands still fixed on Morpheus's head. Smith looks embarrassed and befuddled " - 'and the others tell him about the attack occurring downstairs. They realize ' - 'that the humans are trying to save Morpheus.\r\n' - '\r\n' - 'In the elevator, Trinity arms a bomb. They both climb through a hatch to the ' - 'elevator roof, attaching a clamp to the elevator cable. Neo says "There is ' - 'no spoon" before he severs the cable with a few shots. The counterweight ' - 'drops, propelling Neo and Trinity upward. The elevator falls to the lobby ' - 'exploding upon impact and filling the floor with flames.\r\n' - '\r\n' - 'The Agents feel the rumble of the explosion and the sprinkers come on in the ' - 'building. "Find them and destroy them!" Smith commands.\r\n' - '\r\n' - 'On the roof, a helicopter pilot is calling "Mayday" as Trinity and Neo take ' - 'out the soldiers there. Agent Brown takes over the pilot and appears behind ' - 'Neo. Neo shoots several rounds at the Agent, who dodges them and pulls his ' - 'own weapon.\r\n' - '\r\n' - '"Trinity," yells Neo, "Help!" But it\'s too late. The Agent begins to shoot. ' - 'Instead of being shot, Neo dodges most of the bullets, though two of them ' - 'nick him. As the Agent approaches Neo, who is lying on the ground, he levels ' - 'a kill shot but Trinity shoots him before he can fire. Trinity marvels at ' - "how fast Neo has just moved; she's never seen anyone move that quickly.\r\n" - '\r\n' - 'Tank downloads the ability to fly the helicopter to Trinity, who can now ' - 'pilot the aircraft. Trinity brings the helicopter down to the floor that ' - 'Morpheus is on and Neo opens fire on the three Agents. The Agents quickly ' - 'fall and Morpheus is alone in the room. Just as quickly the Agents take over ' - 'other soldiers stationed nearby. Morpheus breaks his bonds and begins to run ' - 'to the helicopter. The Agents fire on him, hitting his leg. Morpheus leaps ' - 'but Neo realizes that he is not going to make the leap and throws himself ' - 'out of the helicopter, a safety harness attached.\r\n' - '\r\n' - "He catches Morpheus, but Agent Smith shoots the helicopter's hydraulic " - 'line.\r\n' - '\r\n' - 'Unable to control the helicopter, Trinity miraculously gets it close enough ' - 'to drop Morpheus and Neo on a rooftop. Neo grabs the safety line as the ' - 'helicopter falls towards a building. 
Trinity severs the safety line ' - 'connecting Neo to the helicopter and jumps on it herself as the vehicle ' - 'smashes into the side of a building, causing a bizarre ripple in the fabric ' - "of the building's reality as it does.\r\n" - '\r\n' - 'On the ship Tank says, "I knew it; he\'s the One."\r\n' - '\r\n' - 'Neo hauls Trinity up to them. "Do you believe it now, Trinity?" asks ' - 'Morpheus as he approaches the two. Neo tries to tell him that the Oracle ' - 'told him the opposite but Morpheus says, "She told you exactly what you ' - 'needed to hear." They call Tank, who tells them of an exit in a subway near ' - 'them.\r\n' - '\r\n' - 'The Agents arrive on the rooftop but find only the safety harness and line. ' - 'Though Agent Smith is angered, the other two are satisfied. A trace has been ' - 'completed in the real world and the Sentinels have been dispatched to attack ' - 'the Nebuchadnezzar.\r\n' - '\r\n' - 'In the subway, they quickly find the phone booth and Morpheus exits out of ' - 'the Matrix. A wino watches this occur. On the rooftop Agent Smith locks in ' - 'to their whereabouts through the wino and appropriates his body.\r\n' - '\r\n' - "Meanwhile, as the phone rings, providing Trinity's exit, she confides to Neo " - 'that everything that the Oracle has told her has come true, except for one ' - "thing. She doesn't say what that thing is and picks up the phone just as she " - 'sees the approaching Agent Smith. Smith shatters the ear piece of the phone; ' - "it's impossible for Neo to exit there now.\r\n" - '\r\n' - 'Instead of running, which Trinity implores him to do as she looks on from ' - 'the ship, Neo turns to face Smith. They empty their guns on each other, ' - 'neither hitting the other. They then move into close combat, trading blows. ' - 'Neo sweeps Agent Smith\'s head, breaking his glasses. "I\'m going to enjoy ' - 'watching you die, Mr. Anderson," says Smith. They trade some thunderous ' - 'blows with Smith hitting Neo so hard he spits up blood in the Matrix and in ' - 'the chair aboard the ship.\r\n' - '\r\n' - '"He\'s killing him," says Trinity.\r\n' - '\r\n' - 'Neo gets back up, sets himself and beckons Smith to start again. This time ' - "it's Neo who delivers devastating blow after blow. But Smith counters, " - 'throwing Neo into a wall then pummeling him with body blows. A wind from the ' - 'tunnel signals that a subway train is approaching and Smith has a wicked ' - 'notion. He throws Neo into the subway tracks then drops down there himself. ' - 'He puts Neo in a headlock and, in the glow of the oncoming subway says, "You ' - 'hear that, Mr. Anderson? That is the sound of inevitability. It is the sound ' - 'of your death. Good-bye, Mr. Anderson."\r\n' - '\r\n' - '"My name," he replies, "is Neo." Then, with a mighty leap, Neo propels them ' - 'to the ceiling of the tunnel. They fall back down and Neo backflips off the ' - 'tracks, leaving Agent Smith to the oncoming train.\r\n' - '\r\n' - 'Neo heads for the stairs, but Smith has already appropriated another body ' - 'and emerges from the doors of the train.\r\n' - '\r\n' - 'Meanwhile the Sentinels have arrived to attack the Nebuchadnezzar; there are ' - 'five of them and they are closing fast.\r\n' - '\r\n' - 'Morpheus tells Tank to charge the EMP. 
Trinity reminds Morpheus that they ' - "can't use the EMP while Neo is in the Matrix.\r\n" - '\r\n' - '"I know, Trinity, don\'t worry," says Morpheus, "He\'s going to make it."\r\n' - '\r\n' - 'Back in the streets of the Matrix, Neo swipes a cell phone from a nearby ' - 'suit. He calls Tank: "Mr. Wizard, get me the hell out of here." He races ' - 'through a crowded market while Agents appropriate bodies right and left. ' - 'They force Neo down a dark alley. He kicks in a door and rushes through an ' - 'apartment complex where the Agents appropriate more bodies, including that ' - 'of a sweet little old lady who throws a knife at Neo as Agent Smith. Neo ' - 'leaps down into a pile of garbage with the Agents in hot pursuit.\r\n' - '\r\n' - 'On the Nebuchadnezzar the Sentinels have arrived. They begin to tear the ' - 'ship apart.\r\n' - '\r\n' - "In the Matrix, Neo arrives back at the Heart O' the City Hotel. Tank tells " - 'him to go to room 303. The Agents are literally at his heels.\r\n' - '\r\n' - 'The Sentinels breach the hull of the ship. They are inside. Trinity, ' - "standing next to Neo's body in the chair, begs him to hurry.\r\n" - '\r\n' - "Neo reaches room 303 and enters. He's immediately shot, point blank in the " - "gut, by Agent Smith. Smith empties his magazine into Neo's body. Neo slumps " - 'to the floor, dead.\r\n' - '\r\n' - 'On the ship Neo\'s vital signs drop to nothing. "It can\'t be," says ' - 'Morpheus.\r\n' - '\r\n' - 'Agent Smith instructs the others to check Neo. "He\'s gone," one replies. ' - '"Good-bye, Mr. Anderson," says Smith.\r\n' - '\r\n' - "The Sentinels' lasers are beginning to cut through the major parts of the " - 'hovercraft. Trinity leans over his dead body.\r\n' - '\r\n' - '"Neo," she says, "I\'m not afraid anymore. The Oracle told me that I would ' - 'fall in love and that that man... the man that I loved would be the One. So ' - "you see, you can't be dead. You can't be... because I love you. You hear me? " - 'I love you." She kisses him. In the chair Neo suddenly breathes. In the ' - 'Matrix, Neo opens his eyes. "Now get up," orders Trinity.\r\n' - '\r\n' - 'The Agents hear Neo rise behind them and they open fire. "No," Neo says ' - 'calmly, raising his hands. He stops their bullets in mid-air. They drop ' - 'harmlessly to the floor.\r\n' - '\r\n' - '"What\'s happening?" asks Tank. "He is the One," says Morpheus.\r\n' - '\r\n' - 'Back in the Matrix, Neo can see things for what they really are, green ' - 'cascading code.\r\n' - '\r\n' - "Agent Smith is furious. He runs to Neo and attacks him. Neo blocks Smith's " - 'blows effortlessly before he sends Smith flying with one well-placed kick. ' - "Neo then leaps into Smith's body and appropriates him. Smith's shell " - 'explodes in a sea of code and Neo is all that is left, the walls buckling in ' - 'waves as they did when the helicopter crashed. Agents Brown and Jones look ' - 'at one another and run away.\r\n' - '\r\n' - 'The Sentinels are now fully in the ship. They are right above Trinity and ' - 'Morpheus.\r\n' - '\r\n' - 'Back in the Matrix Neo sprints to the ringing phone in the room.\r\n' - '\r\n' - 'Morpheus has no choice but to engage the EMP. He does and the Sentinels fall ' - 'inert to the floor.\r\n' - '\r\n' - 'Neo has made it back. He kisses Trinity.\r\n' - '\r\n' - 'The screen is black. A command prompt appears: "Call trans opt: received. 
' - '9-18-99 14:32:21 REC: Log>" then "Carrier anomaly" "Trace program: running" ' - 'As the grid of numbers appears again a warning appears "System Failure." ' - "Over it all is Neo's voice:\r\n" - '\r\n' - '"I know you\'re out there. I can feel you now. I know that you\'re afraid... ' - "you're afraid of us. You're afraid of change. I don't know the future. I " - "didn't come here to tell you how this is going to end. I came here to tell " - "you how it's going to begin. I'm going to hang up this phone, and then I'm " - "going to show these people what you don't want them to see. I'm going to " - 'show them a world without you. A world without rules and controls, without ' - 'borders or boundaries. A world where anything is possible. Where we go from ' - 'there is a choice I leave to you."\r\n' - '\r\n' - 'In the Matrix world, Neo hangs up the phone. He looks at the mindless masses ' - 'around him, puts on his glasses and then looks up. From high above the city ' - 'we see him take flight. The story is picked up in The Matrix Reloaded, the ' - 'second of three Matrix movies.\r\n' - '\r\n') - - -First, the summary - - - -.. code-block:: default - - print(summarize(text, ratio=0.01)) - - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - ('Anderson, a software engineer for a Metacortex, the other life as Neo, a ' - 'computer hacker "guilty of virtually every computer crime we have a law ' - 'for." Agent Smith asks him to help them capture Morpheus, a dangerous ' - 'terrorist, in exchange for amnesty.\n' - "Morpheus explains that he's been searching for Neo his entire life and asks " - 'if Neo feels like "Alice in Wonderland, falling down the rabbit hole." He ' - 'explains to Neo that they exist in the Matrix, a false reality that has been ' - 'constructed for humans to hide the truth.\n' - "Neo is introduced to Morpheus's crew including Trinity; Apoc (Julian " - 'Arahanga), a man with long, flowing black hair; Switch; Cypher (bald with a ' - 'goatee); two brawny brothers, Tank (Marcus Chong) and Dozer (Anthony Ray ' - 'Parker); and a young, thin man named Mouse (Matt Doran).\n' - 'Trinity brings the helicopter down to the floor that Morpheus is on and Neo ' - 'opens fire on the three Agents.') - - -And now, the keywords: - - - -.. code-block:: default - - print(keywords(text, ratio=0.01)) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - 'neo\nmorpheus\ntrinity\ncypher\nsmith\nagents\nagent\ntank\nsays\nsaying' - - -If you know this movie, you see that this summary is actually quite good. We -also see that some of the most important characters (Neo, Morpheus, Trinity) -were extracted as keywords. - -Another example ---------------- - -Let's try an example similar to the one above. This time, we will use the IMDb synopsis -`The Big Lebowski `_. - -Again, we download the text and produce a summary and some keywords. - - - -.. code-block:: default - - - - text = requests.get('http://rare-technologies.com/the_big_lebowski_synopsis.txt').text - print(text) - print(summarize(text, ratio=0.01)) - print(keywords(text, ratio=0.01)) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - ('A tumbleweed rolls up a hillside just outside of Los Angeles as a mysterious ' - 'man known as The Stranger (Sam Elliott) narrates about a fella he wants to ' - 'tell us about named Jeffrey Lebowski. With not much use for his given name, ' - 'however, Jeffrey goes by the name The Dude (Jeff Bridges). 
The Stranger ' - 'describes Dude as one of the laziest men in LA, which would place him "high ' - 'in the running for laziest worldwide", but nevertheless "the man for his ' - 'place and time."\r\n' - '\r\n' - 'The Dude, wearing a bathrobe and flips flops, buys a carton of cream at ' - "Ralph's with a post-dated check for 69 cents. On the TV, President George " - 'Bush Sr. is addressing the nation, saying "aggression will not stand" ' - 'against Kuwait. Dude returns to his apartment where, upon entering and ' - 'closing the door, he is promptly grabbed by two men who force him into the ' - 'bathroom and shove his head in the toilet. They demand money owed to Jackie ' - "Treehorn, saying that The Dude's wife Bunny claimed he was good for it, " - "before one of the thugs, Woo (Philip Moon), urinates on The Dude's rug " - 'saying, "Ever thus to deadbeats, Lebowski!" Bewildered, Dude convinces them ' - "that they have the wrong person as he's not married and can't possibly " - "possess the amount of money they're asking. Looking around, the first thug, " - "(Mark Pellegrino), realizes they've made a mistake and must have the wrong " - 'Lebowski. Regardless, they break one of his bathroom tiles before leaving. ' - '"At least I\'m housebroken", Dude calls after them.\r\n' - '\r\n' - 'Dude meets up with his bowling team at the local alley and talks to them ' - 'about his violent encounter. Walter Sobchak (John Goodman) reacts with anger ' - 'and vengeance on his mind, often speaking of his time served in Vietnam to ' - "relate to the issue. Slow-witted Theodore Donald 'Donny' Kerabatsos (Steve " - 'Buscemi), often entering conversations halfway through, pipes in but is ' - 'promptly told by Walter, "You\'re out of your element". Walter then tells ' - "Dude about a millionaire who shares Dude's name and must be the one the " - 'thugs were after. Dude agrees to meet with the Big Lebowski, hoping to get ' - 'compensation for his rug since it "really tied the room together" and ' - "figures that his wife, Bunny, shouldn't be owing money around town.\r\n" - '\r\n' - "Arriving at Lebowski's mansion, Dude is assisted by Brandt (Philip Seymour " - "Hoffman) who shows him numerous awards and pictures illustrating Lebowski's " - 'endeavors in philanthropy before Dude meets the man himself. The elder and ' - 'wheelchair-bound Lebowski (David Huddleston) brings Dude into his study ' - "where he quickly gets to the point and professes that he can't take " - 'responsibility for every spoiled rug in the city and accuses Dude of seeking ' - 'a handout, clearly resentful of his hippie-like demeanor. Dude leaves the ' - 'room and tells Brandt that Lebowski offered any rug in the house to him. He ' - "quickly picks one out and, as it's being loaded into Dude's car, he speaks " - 'to a young blonde (Tara Reid) poolside who is painting her toenails green. ' - 'She asks Dude to blow on her toes, assuring him that Uli (Peter Stormare), ' - "the man in the pool, won't mind because he's a nihilist. Brandt appears and " - 'introduces her as Bunny Lebowski before she offers Dude fellatio for $1000. ' - 'Brandt nervously laughs and escorts Dude out.\r\n' - '\r\n' - 'During a league game at the alley, Dude scolds Walter for bringing his ' - "ex-wife's small dog in a kennel with him while she is in Hawai'i with her " - 'new boyfriend. As they debate, a member of the opposite team, Smokey (Jimmie ' - 'Dale Gilmore), bowls an 8 and tells the Dude to mark it, but Walter objects, ' - "stating Smokey's foot was over the line. 
When Smokey argues, Walter pulls " - "out a gun and aims it in Smokey's face, forcing him to comply and void the " - 'score as a zero. As Walter sits down again, he explains, "It\'s a league ' - 'game, Smokey, there are rules". Dude scolds Walter as they leave, trying to ' - 'act casual as police units arrive and run past them into the alley.\r\n' - '\r\n' - 'Afterwards, relaxing in his apartment and enjoying a White Russian (his ' - 'favorite cocktail), Dude listens to his phone messages: Smokey calling to ' - 'talk about the gun incident, Brandt asking Dude to call him, and the bowling ' - "league administrator wishing to speak about Walter's belligerence and " - "gun-brandishing on the lanes. Dude's doorbell rings and his landlord, Marty " - "(Jack Kehler), reminds Dude to pay his rent and informs him that he's " - 'performing a dance at a local theater and would like Dude to attend to give ' - 'him notes. The Dude obliges as Brandt rings again, telling Dude that ' - "Lebowski needs to see him and that it's not about the rug.\r\n" - '\r\n' - 'At the Lebowski mansion, Brandt solemnly leads Dude into the study where he ' - 'finds Lebowski crying beside the lit fireplace. He shows Dude a crude note ' - 'describing Bunny\'s kidnapping and the demand for $1 million. "This is a ' - 'bummer, man," the Dude offers as he smokes a joint. Brandt explains that ' - 'they want Dude to act as courier to deliver the payment when they receive ' - 'word of a location for the drop off and tells Dude that he might even ' - 'recognize the kidnappers as the same people who soiled his rug.\r\n' - '\r\n' - 'Back at the bowling alley, a man wearing a hairnet and a purple jumpsuit ' - "with 'Jesus' embroidered on the front bowls a perfect strike. A few lanes " - 'down, Dude, Donny, and Walter watch him with slight resentment. Dude ' - "compliments on Jesus' (John Turturro) skill but Walter criticizes him for " - "being a 'pederast', having served six months for exposing himself to an " - 'eight year-old before asking Dude about the Lebowski arrangement. Dude ' - 'explains that he will receive $20,000 as courier and shows Walter the beeper ' - "Brandt gave him. He doesn't worry about the hand off and figures that Bunny " - "kidnapped herself for some extra money. Walter seems to take Bunny's offense " - 'personally as Jesus walks over, telling them to watch out for his team and ' - 'if they flash a piece at the finals "I\'ll take it away from you, stick it ' - 'up your ass and pull the fucking trigger till it goes click."\r\n' - '\r\n' - 'At his apartment, Dude lies happily on his new rug, listening to a taped ' - 'bowling game through headphones. He opens his eyes and sees a woman and two ' - 'men standing over him before he is punched in the face and knocked out. He ' - 'dreams that he is flying over LA, chasing a woman who is riding his rug ' - 'ahead of him. A bowling ball suddenly appears in his hand and pulls him to ' - 'the ground where he stands, miniaturized, facing a gigantic bowling ball as ' - 'it rolls towards him. He tenses and winds up in one of the finger holes of ' - 'the ball. From his perspective, we see the ball roll down the lane away from ' - 'its female bowler towards the pins. As the pins scatter, the Dude wakes up ' - 'to the sound of his beeper going off and finds that his rug has been taken ' - 'from underneath him.\r\n' - '\r\n' - "Answering the page, Dude returns to Lebowski's mansion where Brandt explains " - 'that the kidnappers want the exchange to happen that very night. 
He gives ' - 'Dude a portable phone and a briefcase with the money, instructing him to ' - 'take it up the highway and wait for the kidnappers to call. Once the ' - 'exchange is complete, Dude is to call Brandt immediately. Before he leaves, ' - 'Brandt repeats to Dude that "her life is in your hands".\r\n' - '\r\n' - "Despite Brandt's instructions to go alone, Dude picks up Walter from his " - 'store. Walter gets in the drivers seat and immediately proposes a plan for a ' - 'switch, holding his own briefcase full of dirty underwear, so that he and ' - 'Dude can keep the million themselves. Walter also plans to capture one of ' - "the kidnappers and beat Bunny's location out of him. Dude is adamantly " - 'against the crazy plan but when the kidnappers call, Dude accidentally lets ' - "slip that he's not alone. The kidnappers hang up and Dude panics that Bunny " - 'is as good as dead, though Walter reminds him of his own suspicions that ' - 'Bunny kidnapped herself. The kidnappers call again and give a location ' - "granted there is no funny 'schtuff'. At the designated location, the " - 'kidnappers call and instruct The Dude to throw the suitcase out the car ' - 'window onto a bridge. As they approach the bridge, Dude tries to throw the ' - 'real suitcase but, at the last second, Walter tosses the ringer and forces ' - 'Dude to take the wheel as he arms himself with an Uzi and bails out of the ' - 'moving car. Despite his seemingly flawless and heroic plan, Walter loses ' - "grip of the Uzi and it fires wildly, hitting Dude's tail lights and tires, " - 'causing him to panic and crash into a telephone pole. Three men on ' - 'motorcycles appear just beyond the bridge and, as Dude scrambles out of the ' - 'car with the briefcase, pick up the ringer and ride off. Walter calmly gets ' - 'up and says, "Fuck it, Dude. Lets go bowling".\r\n' - '\r\n' - 'At the alley, the portable phone rings incessantly, no doubt Brandt calling ' - 'to check on the mission. Dude is miserable, angry at Walter, and certain ' - 'that Bunny will be killed, though Walter is calm and convinced that Bunny ' - 'kidnapped herself. He tells Dude not to worry and that Bunny will eventually ' - 'get bored and return home on her own but becomes dismayed to see that the ' - 'bowling schedule has him playing on Saturday; something he is forbidden to ' - 'do since he is Shomer Shabbos and must honor the Jewish day of rest. The ' - "Dude wonders why Walter didn't go back to being Catholic since he only " - 'converted for his ex-wife. Donny interjects mid-conversation and is, again, ' - "told to 'shut the fuck up' by Walter.\r\n" - '\r\n' - 'As they leave, Dude discovers his car missing - along with the briefcase. ' - 'Walter suggests it was towed because they parked in a handicapped spot but ' - 'Dude is certain that it was stolen. He starts walking home with his phone ' - 'ringing.\r\n' - '\r\n' - 'Dude resolves to call the police and issue a statement for his stolen car. ' - 'Two police officers (Richard Gant, Christian Clemenson) arrive at his ' - 'apartment to take notes and Dude addresses the separate issue of his missing ' - 'rug just before his home phone rings. The answering machine records a woman ' - 'introducing herself as Maude Lebowski and saying that she is the one who ' - 'took his rug and has sent a car to pick Dude up at his apartment. 
The ' - 'younger of the two cops is pleased that the missing rug issue is ' - 'resolved.\r\n' - '\r\n' - 'The Dude is brought to a huge loft studio filled with canvases and minimal ' - 'illumination. As he walks in, he is startled by the sudden appearance of ' - 'Maude, swinging in naked on a zip line, screaming and flailing paintbrushes ' - 'over a large canvas to create an abstract image. She descends to the ground ' - 'and is robed before addressing The Dude. She explains that she is a ' - 'professional artist whose work is commended as strongly vaginal, often to ' - 'the point of making some men uncomfortable. She tells Dude that the rug he ' - 'took was a gift from her to her late mother and her father, Big Lebowski, ' - "had no right giving it away. Maude's flamboyant assistant, Knox Harrington " - '(David Thewlis), watches as Dude fixes himself a White Russian and Maude ' - 'puts a tape in her VCR. She asks Dude if he enjoys sex as the video rolls, a ' - 'smut film starring Bunny Lebowski and Uli, the German nihilist, credited as ' - 'Karl Hungus. Maude surmises that Bunny kidnapped herself, elaborating on the ' - 'already obvious notion that she gets around and even bangs the producer of ' - 'the film, Jackie Treehorn. As one of two trustees of Little Lebowski Urban ' - "Achievers, one of Lebowski's charity programs, Maude noticed a withdrawal of " - '$1 million from its funds and was told it was for the ransom. Though she is ' - "more or less estranged from her father, she doesn't want to involve the " - 'police in his embezzlement and offers the Dude ten percent of the million if ' - "he retrieves the money from the kidnappers. With a finder's fee she tells " - 'him he can buy a new rug. She then apologizes for the crack on the jaw and ' - 'gives The Dude a number for a doctor who will examine him free of charge.\r\n' - '\r\n' - 'The Dude is given a limo ride back to his apartment where the driver (Dom ' - 'Irrera) points out a blue Volkswagen Beetle that had been following them. ' - "Before The Dude has a chance to do anything about it, he's shoved into " - 'another limo waiting for him on the street. Inside, Brandt and Lebowski ' - 'confront him about the fact that he never called them and yell that the ' - 'kidnappers never got the money. Lebowski accuses Dude of stealing the ' - "million himself as Dude tries to reason that the 'royal we' dropped off the " - 'money and that Bunny, since she apparently owes money all over town, most ' - 'likely kidnapped herself and probably instructed her kidnappers to lie about ' - 'the hand off. Brandt and Lebowski look skeptical before producing an ' - 'envelope. Lebowski tells Dude that the kidnappers will be dealing directly ' - 'with him now and any mishaps will be avenged tenfold on him. Inside the ' - 'envelope, Dude finds a severed pinky toe wrapped in gauze with green polish ' - 'on the nail.\r\n' - '\r\n' - "In a small cafe, The Dude tells Walter about the severed toe who doesn't " - "believe it's Bunny's. Walter calls the kidnappers a bunch of fucking " - "amateurs for using such an obviously fake ruse but The Dude isn't convinced. " - 'Walter tries to convince him by saying that he can get a toe for him in no ' - "time at all and with his choice of nail polish color. Despite Walter's " - 'unwavering stance, Dude fears for his life; if the kidnappers dont get him, ' - 'Lebowski will.\r\n' - '\r\n' - 'At home, he tries to relax in the tub, smoking a joint and listening to ' - 'music. 
His phone rings and the answering machine records the LAPD telling ' - "him that they've recovered his car. Dude is overjoyed for a moment until he " - 'hears a loud banging in his living room. He looks up to see three men ' - 'breaking into his apartment wearing dark clothes. The leader, whom Dude ' - 'recognizes as Uli/Karl Hungus the nihilist, along with his two cohorts, ' - 'Franz and Kieffer (Torsten Voges, Flea), enters the bathroom with a ferret ' - 'on a leash. He dunks the terrified animal in the tub where it thrashes and ' - 'shrieks as Dude tries to avoid it. Uli takes the ferret out, letting it ' - "shake off, and tells Dude that they want their money tomorrow or they'll cut " - 'off his johnson.\r\n' - '\r\n' - 'The following morning, the Dude goes to the impound lot to collect his car ' - 'which turns up badly damaged and reeking with a terrible stench, an apparent ' - 'victim of a joyride and temporary home to some vagrants. The briefcase is ' - 'gone. Dude asks the officer at the lot if anyone is following up on who ' - 'might have taken the car, but the officer (Mike Gomez) chuckles and ' - 'sarcastically says that their department has them working in shifts on the ' - 'case.\r\n' - '\r\n' - 'At the bar in the bowling alley, Dude expresses his fears to an ' - 'unsympathetic Walter and an unhelpful Donny. Unable to cheer him up, they ' - 'leave Dude at the bar to find an open lane. The Stranger sits down next to ' - 'Dude and orders a sarsaparilla before chatting briefly with Dude, ' - 'complimenting him on his style and wondering why he uses so many cuss words. ' - 'He offers Dude one piece of advice before leaving: "Sometimes you eat the ' - 'bar, and sometimes the bar, well, he eats you." Gary, the bartender (Peter ' - "Siragusa), hands Dude the phone; it's Maude. She's miffed that Dude hasn't " - 'seen the doctor yet and instructs him to meet her at her loft. There, Dude ' - 'informs Maude that he thinks Bunny was really kidnapped, possibly by Uli. ' - 'Maude disagrees, saying that Bunny knows Uli and kidnappers cannot be ' - 'acquaintances. She then dismisses Dude to take a call, reminding him to see ' - 'the doctor.\r\n' - '\r\n' - 'At the clinic the doctor tells Dude to remove his shorts, insisting despite ' - "Dude's assurance that he was only hit in the face. Driving home, Dude enjoys " - 'a joint while listening to Creedence but soon notices a blue Volkswagen ' - 'following him. Distracted, he tries to flick his joint out the window but it ' - 'bounces back and lands in his lap, burning him. He screams and dumps beer on ' - 'his lap before he swerves and crashes into a dumpster. When he looks out the ' - 'window, the blue car is gone. Looking down, he notices a piece of paper ' - "stuck in the car seat. It's a graded homework sheet with the name Larry " - 'Sellers written on it.\r\n' - '\r\n' - "That night, at Marty's dance quartet, Walter reveals that he's done some " - 'research on Larry and discovered where he lives, near the In-N-Out Burger ' - "joint. He is also thrilled to report that Larry's father is Arthur Digby " - 'Sellers, a famous screenwriter who wrote 156 episodes of the show Branded. ' - 'Walter is certain that Larry has the briefcase of money and that their ' - 'troubles are over. They pull up to the house where The Dude is dismayed to ' - 'see a brand new red Corvette parked on the street outside. 
A Hispanic ' - "housekeeper (Irene Olga López) lets them into the Sellers' home where they " - 'see the elderly Arthur Sellers (Harry Bugin) in an iron lung in the living ' - "room. Over the hissing of the compressor, Walter calls out that he's a big " - "fan of Arthur's and that his work was a source of inspiration to him before " - 'the housekeeper brings in young Larry (Jesse Flanagan), a fifteen year-old ' - 'with a deadpanned expression. Walter and Dude interrogate Larry about the ' - "money and the fact that he stole Dude's car, but get no response. Not even a " - 'wavering glance. Walter resolves to go to Plan B; he tells Larry to watch ' - 'out the window as he and Dude go back out to the car where Donny is waiting. ' - 'Walter removes a tire iron from Dudes trunk and proceeds to smash the ' - 'corvette, shouting, "This is what happens when you fuck a stranger in the ' - 'ass!"\r\n' - '\r\n' - "However, the car's real owner (Luis Colina) comes out of his house and rips " - 'the tire iron from Walter, shouting that he just bought the car last week, ' - "before going over to The Dude's car and breaking all the windows. Dude " - 'drives silently home, wind blowing in through the broken windows, as Walter ' - 'and Donny eat In-N-Out burgers.\r\n' - '\r\n' - 'Back home, Dude talks to Walter over the phone as he nails a two-by-four to ' - 'the floor near the front door. He yells at Walter, telling him to leave him ' - 'alone and that he wants to handle the situation himself before agreeing to ' - 'go to their next bowling practice. He hangs up and props a chair against the ' - 'door, braced by the piece of wood, and turns away as the door opens ' - "outwardly and Treehorn's thugs from the beginning of the film walk in. They " - 'tell The Dude that Jackie Treehorn wishes to meet with him.\r\n' - '\r\n' - 'The Dude is taken to a large mansion overlooking a beach front where a ' - 'tribal, orgy-like party is going on. Inside, Dude meets Jackie Treehorn (Ben ' - 'Gazzara) who appears friendly and agreeable as he mixes the Dude a White ' - 'Russian and sympathizes for his lost rug. Treehorn asks him where Bunny is ' - 'to which Dude responds that he thinks Treehorn knows. Treehorn denies ' - 'knowing and theorizes that Bunny ran off knowing how much money she owed ' - 'him. Treehorn is then excused for a phone call. He writes something down on ' - 'a notepad before leaving the room momentarily. Employing the Roger O. ' - 'Thornhill trick of rubbing a pencil lightly over the pad of paper to see ' - 'what was written, Dude reveals a doodle of a man with a rather large penis. ' - 'He rips the paper out of the pad and sticks it in his pocket before ' - 'returning to the couch as Treehorn comes back. He offers Dude a ten percent ' - "finder's fee if he tells them where the money is. Dude tells him that Larry " - 'Sellers should have the money, though Treehorn is not convinced. Dude ' - "insists he's telling the truth as his words begin to slur and his vision " - 'glazes over. He mumbles, "All the Dude ever wanted was his rug back...it ' - 'really tied the room together," before he passes out.\r\n' - '\r\n' - 'The Dude falls into a deep dream where he sees himself happily starring in a ' - "Jackie Treehorn-produced bowling picture entitled 'Gutterballs' with Maude, " - 'dressed in a seducing Viking outfit, as his costar. They dance together and ' - 'throw a bowling ball down the lane. The ball turns into the Dude, floating ' - "above the lane floor and passing under ladies' skirts. 
When he hits the pins " - 'at the end, he suddenly sees the three nihilists dressed in tight clothes ' - 'and snapping super large scissors, chasing him. He runs from them, ' - 'terrified, as he wakes from his dream, staggering down a street in Malibu ' - 'while a police car pulls up behind him. The unit picks him up as he slurs ' - "the theme song to 'Branded'.\r\n" - '\r\n' - 'At the Malibu police station, the chief of police (Leon Russom) goes through ' - "The Dude's wallet before he tells Dude that Jackie Treehorn said he was " - "drunk and disorderly at his 'garden party'. He tells Dude that Treehorn is " - 'an important source of income in Malibu and demands that he stay out of the ' - "town for good. Dude replies that he wasn't listening which incites the chief " - 'to throw his coffee mug at him, hitting him in the head. Dude takes a cab ' - 'ride home and requests that the driver (Ajgie Kirkland) change the radio ' - "station since he had a rough night and hates the Eagles. The driver doesn't " - 'take kindly to this and throws The Dude out. As he stands on the street, a ' - "red convertible passes by at high speeds; it's Bunny listening to 'Viva Las " - "Vegas' and, as we see, with a complete set of toes on each foot.\r\n" - '\r\n' - 'Dude returns to his apartment to find it completely wrecked. He enters and ' - 'trips over the two-by-four he nailed into the floor. When he looks up, he ' - 'finds Maude standing before him dressed in nothing but his robe. She drops ' - 'it to the floor and tells him to make love to her. Afterwards, they lie in ' - 'bed together as The Dude smokes a joint and tells her about his past as a ' - 'student activist and his current hobbies which include bowling and the ' - 'occasional acid flashback. As he climbs out of bed to make a White Russian, ' - "Maude asks about the apartment and Dude explains that Treehorn's thugs most " - "likely vandalized it looking for Lebowski's money. Maude retorts that her " - "father actually has no money; it was all her mother's or else belongs to the " - "Foundation and that Lebowski's only concern is to run the charities. Maude " - 'gives him an allowance but his weakness is vanity; "Hence the slut". She ' - 'tells Dude this as she folds into a yoga position which she claims increases ' - 'the chances of conception. Dude chokes on his drink but Maude assures him ' - 'that she has no intention of having Dude be a part of the child-bearing ' - "process nor does she want to see him socially. The Dude then figures that's " - 'why she wanted him to visit the doctor so badly until an idea suddenly comes ' - 'to mind about Lebowski. Dude calls Walter to pick him up and take him to ' - "Lebowski's mansion right away, despite Walter's protests that he doesn't " - "drive on Shabbos unless it's an emergency. Dude assures him that it's just " - 'that.\r\n' - '\r\n' - 'Dude dresses and goes outside where he sees the blue Volkswagen parked just ' - 'down the street. He walks over and demands that the man within get out. The ' - 'man introduces himself as Da Fino (Ajgie Kirkland) and explains that he ' - 'thinks Dude is a fellow private eye who is brilliantly playing two sides ' - 'against each other; the thugs and Lebowski, and means no harm to him or his ' - "girlfriend. Confused, Dude tells Da Fino to stay away from his 'lady friend' " - "and asks if he's working for Lebowski or Treehorn. Da Fino admits that he's " - "employed by the Kneutson's; Bunny's family. 
Apparently, Bunny's real name is " - "Fawn and she ran away from her Minnesota home a year ago and Da Fino's been " - 'investigating since. As Walter pulls up, Dude tells Da Fino to, again, stay ' - 'away from his lady friend and leaves.\r\n' - '\r\n' - 'At a local restaurant, the three German nihilists and a sallow, blonde woman ' - '(Aimee Mann) sit together ordering pancakes. The camera pans down to the ' - 'womans foot covered in a bandage which, where her pinky toe should be, is ' - 'soaked in dried blood.\r\n' - '\r\n' - 'Driving out to Lebowski mansion, Dude explains his new theory; why did ' - 'Lebowski do nothing to him if he knew the payoff never happened? If Lebowski ' - "thought that The Dude took the money, why didn't he ask for it back? Because " - 'the briefcase given to Dude was never full of money: "You threw a ringer out ' - 'for a ringer!" He also figures that Lebowski chose him, an otherwise ' - "'fuck-up', to get Bunny back because he never wanted her back; he wanted her " - 'dead while he embezzled money from the foundation as a ransom. Walter agrees ' - "with the theory but still believes he shouldn't have been bothered on the " - 'Shabbos.\r\n' - '\r\n' - "As they pull up to the mansion, they see Bunny's red convertible crashed " - 'into some shrubbery near the front fountain. Bunny is running around the ' - 'grounds naked while, inside, Brandt attempts to pick up her discarded ' - 'clothes. He tells them that Bunny went to visit friends in Palm Springs ' - 'without telling anyone. Despite his protests, Walter and Dude walk past him ' - 'into the study where a stern-looking Lebowski sits. Dude demands an answer; ' - 'he accuses Lebowski of keeping the million for himself while he used The ' - 'Dude as a scapegoat to cover up for the missing money. Lebowski says that ' - "it's his word against Dude's and no one would believe a 'deadbeat' over him. " - 'This angers Walter who figures Lebowski to be a fake handicap besides a ' - 'phony millionaire and lifts Lebowski out of his chair, dropping him to the ' - 'floor. However, Lebowski lies still on the floor, whimpering, and Dude tells ' - 'Walter to help him back in his chair.\r\n' - '\r\n' - 'At the bowling alley, Donny misses a strike for the first time and puzzles ' - "over this as Walter drones about Vietnam to Dude who doesn't seem to be " - 'paying attention as he paints over his fingernails with clear polish. Jesus ' - 'walks over, criticizing the change in schedule from Saturday to Wednesday ' - 'before issuing sexual threats. The Dude, Walter, and Donny sit unfazed. As ' - 'they leave the alley and head into the parking lot, they are faced by the ' - 'three nihilists who stand in front of The Dude\'s flaming car. "Well, they ' - 'finally did it," he despairs. "They killed my fucking car."\r\n' - '\r\n' - 'The nihilists demand the money or they will kill the girl but Dude tells ' - 'them that he knows they never had the girl in the first place. The nihilists ' - "reply that they don't care and still want the money but Dude tries to " - "explain that Lebowski's money was never valid; he never intended to pay them " - 'off and Walter shouts that without a hostage, there is no ransom. Franz ' - 'complains that his girlfriend had to give up her pinky toe because she ' - "thought she was getting $1 million but they'll settle for whatever Walter, " - 'Donny, and Dude have in their pockets. 
Donny, in the back, asks if the men ' - "are going to hurt them and Walter assures him that they're nihilists and " - 'cowards as Dude pulls out his wallet. When Walter refuses to take his own ' - 'out, Uli pulls out a sword and Walter engages in a fight with them, throwing ' - "his bowling ball into Franz's stomach. Dude hits Kieffer over the head with " - 'his own radio while Walter attacks Uli and bites off his ear, spitting it ' - 'into the air. He turns around and sees Donny on the ground, clutching his ' - 'chest from having a heart attack. Walter comforts him as Dude runs into the ' - 'alley to call for an ambulance.\r\n' - '\r\n' - 'The Dude and Walter are then seen at a funeral parlor speaking with the ' - 'curator. Donny, having passed away, was cremated and they negotiate how his ' - 'remains will be handled. Walter is outraged at the high price of the urn. ' - 'The curator tells them that the urn is their most "modestly-priced ' - 'receptacle" and that the ashes must be given over in a container of some ' - "sort. Walter asks if there's a Ralph's store nearby and he & The Dude " - "resolve to receive Donny's ashes in a Folger's coffee can. They travel " - 'together to a windy cliffside overlooking the ocean where Walter gives a ' - 'heartfelt speech about Donny along with a seemingly unrelated reference to ' - 'Vietnam before opening the can and shaking out the ashes. The wind blows ' - "them back into Dude's face, coating his clothes, beard, and sunglasses. " - 'Walter apologizes and attempts to brush the ashes off but the Dude yells at ' - "him for always making everything a 'fucking travesty' and scolds him for yet " - 'another needless Vietnam rant. Walter hugs him and tells him to "Fuck it, ' - 'man; let\'s go bowling." The Dude eases down.\r\n' - '\r\n' - 'At the bowling alley, the Stranger sits at the bar as the Dude orders two ' - "beers. They greet each other and the Stranger asks how he's been doing. " - '"Oh, you know, strikes and gutters, ups and downs," answers The Dude as he ' - 'collects his beers and goes to leave. The Stranger tells him to take it easy ' - 'and The Dude turns to reply, "Yeah, well, The Dude abides."\r\n' - '\r\n' - 'The Stranger finds comfort in those words and rambles about how things seem ' - 'to have turned out fine for Dude and Walter. He was sad to see Donny go but ' - "happens to know that there's a little Lebowski on the way. He assures us " - "that The Dude is always out there taking it easy for 'all us sinners' and " - 'orders another sarsaparilla. \r\n' - '\r\n') - ('Dude agrees to meet with the Big Lebowski, hoping to get compensation for ' - 'his rug since it "really tied the room together" and figures that his wife, ' - "Bunny, shouldn't be owing money around town.\n" - 'Walter resolves to go to Plan B; he tells Larry to watch out the window as ' - 'he and Dude go back out to the car where Donny is waiting.') - 'dude\ndudes\nwalter\nlebowski\nbrandt\nmaude\ndonny\nbunny' - - -This time around, the summary is not of high quality, as it does not tell us -much about the movie. In a way, this might not be the algorithms fault, -rather this text simply doesn't contain one or two sentences that capture the -essence of the text as in "The Matrix" synopsis. - -The keywords, however, managed to find some of the main characters. - -Performance ------------ - -We will test how the speed of the summarizer scales with the size of the -dataset. These tests were run on an Intel Core i5 4210U CPU @ 1.70 GHz x 4 -processor. 
Note that the summarizer does **not** support multithreading -(parallel processing). - -The tests were run on the book "Honest Abe" by Alonzo Rothschild. Download -the book in plain-text `here `__. - -In the **plot below** , we see the running times together with the sizes of -the datasets. To create datasets of different sizes, we have simply taken -prefixes of text; in other words we take the first **n** characters of the -book. The algorithm seems to be **quadratic in time** , so one needs to be -careful before plugging a large dataset into the summarizer. - - -.. code-block:: default - - - import matplotlib.pyplot as plt - import matplotlib.image as mpimg - img = mpimg.imread('summarization_tutorial_plot.png') - imgplot = plt.imshow(img) - plt.axis('off') - plt.show() - - - - -.. image:: /auto_examples/tutorials/images/sphx_glr_run_summarization_001.png - :class: sphx-glr-single-img - - - - -Text-content dependent running times ------------------------------------- - -The running time is not only dependent on the size of the dataset. For -example, summarizing "The Matrix" synopsis (about 36,000 characters) takes -about 3.1 seconds, while summarizing 35,000 characters of this book takes -about 8.5 seconds. So the former is **more than twice as fast**. - -One reason for this difference in running times is the data structure that is -used. The algorithm represents the data using a graph, where vertices (nodes) -are sentences, and then constructs weighted edges between the vertices that -represent how the sentences relate to each other. This means that every piece -of text will have a different graph, thus making the running times different. -The size of this data structure is **quadratic in the worst case** (the worst -case is when each vertex has an edge to every other vertex). - -Another possible reason for the difference in running times is that the -problems converge at different rates, meaning that the error drops slower for -some datasets than for others. - -Montemurro and Zanette's entropy based keyword extraction algorithm -------------------------------------------------------------------- - -`This paper `__ describes a technique to -identify words that play a significant role in the large-scale structure of a -text. These typically correspond to the major themes of the text. The text is -divided into blocks of ~1000 words, and the entropy of each word's -distribution amongst the blocks is caclulated and compared with the expected -entropy if the word were distributed randomly. - - - -.. code-block:: default - - - - import requests - from gensim.summarization import mz_keywords - - text=requests.get("http://www.gutenberg.org/files/49679/49679-0.txt").text - print(mz_keywords(text,scores=True,threshold=0.001)) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. 
code-block:: none - - [('i', 0.005071990145676084), - ('the', 0.004078714811925573), - ('lincoln', 0.003834207719481631), - ('you', 0.00333099434510635), - ('gutenberg', 0.0032861719465446127), - ('v', 0.0031486824001772298), - ('a', 0.0030225302081737385), - ('project', 0.003013787365092158), - ('s', 0.002804807648086567), - ('iv', 0.0027211423370182043), - ('he', 0.0026652557966447303), - ('ii', 0.002522584294510855), - ('his', 0.0021025932276434807), - ('by', 0.002092414407555808), - ('abraham', 0.0019871796860869762), - ('or', 0.0019180648459331258), - ('lincolna', 0.0019090487448340699), - ('tm', 0.001887549850538215), - ('iii', 0.001883132631521375), - ('was', 0.0018691721439371533), - ('work', 0.0017383218152950376), - ('new', 0.0016870325205805429), - ('co', 0.001654497521737427), - ('case', 0.0015991334540419223), - ('court', 0.0014413967155396973), - ('york', 0.001429133695025362), - ('on', 0.0013292841806795005), - ('it', 0.001308454011675044), - ('had', 0.001298103630126742), - ('to', 0.0012629182579600709), - ('my', 0.0012128129312019202), - ('of', 0.0011777988172289335), - ('life', 0.0011535688244729756), - ('their', 0.001149309335387912), - ('_works_', 0.0011438603236858932), - ('him', 0.0011391497955931084), - ('that', 0.0011069446497089712), - ('and', 0.0011027930360212363), - ('herndon', 0.0010518263812615242)] - - -By default, the algorithm weights the entropy by the overall frequency of the -word in the document. We can remove this weighting by setting weighted=False - - - -.. code-block:: default - - print(mz_keywords(text,scores=True,weighted=False,threshold=1.0)) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - [('gutenberg', 3.813054848640599), - ('project', 3.573855036862196), - ('tm', 3.5734630161654266), - ('co', 3.188187179789419), - ('foundation', 2.9349504275296248), - ('dogskin', 2.767166394411781), - ('electronic', 2.712759445340285), - ('donations', 2.5598097474452906), - ('foxboro', 2.552819829558231), - ('access', 2.534996621584064), - ('gloves', 2.534996621584064), - ('_works_', 2.519083905903437), - ('iv', 2.4068950059833725), - ('v', 2.376066199199476), - ('license', 2.32674033665853), - ('works', 2.320294093790008), - ('replacement', 2.297629530050557), - ('e', 2.1840002559354215), - ('coon', 2.1754936158294536), - ('volunteers', 2.1754936158294536), - ('york', 2.172102058646223), - ('ii', 2.143421998464259), - ('edited', 2.110161739139703), - ('refund', 2.100145067024387), - ('iii', 2.052633589900031), - ('bounded', 1.9832369322912882), - ('format', 1.9832369322912882), - ('jewelry', 1.9832369322912882), - ('metzker', 1.9832369322912882), - ('millions', 1.9832369322912882), - ('ragsdale', 1.9832369322912882), - ('specie', 1.9832369322912882), - ('archive', 1.9430792440279312), - ('reminiscences', 1.9409656357162346), - ('agreement', 1.933113430461269), - ('bonds', 1.90404582584515), - ('ebooks', 1.90404582584515), - ('jewelersa', 1.90404582584515), - ('brokaw', 1.9027974079098768), - ('ebook', 1.8911101680056084), - ('trademark', 1.8911101680056084), - ('parker', 1.8903494446079012), - ('almanac', 1.8267945764711788), - ('ross', 1.771449419244092), - ('work', 1.7368893093546554), - ('college', 1.72245395873311), - ('scott', 1.6666549709515948), - ('rothschild', 1.6615406993510273), - ('pglaf', 1.6528326283716357), - ('ana', 1.6345239955037414), - ('green', 1.634270040746932), - ('forquer', 1.6183315401308644), - ('improvementa', 1.6183315401308644), - ('hardin', 1.5967140500447887), - ('copyright', 
1.5827844444400303), - ('houghton', 1.5827785818223203), - ('clair', 1.5757014351631946), - ('claya', 1.5757014351631946), - ('displaying', 1.5757014351631946), - ('fisher', 1.5757014351631946), - ('forgery', 1.5757014351631946), - ('holder', 1.5757014351631946), - ('ninea', 1.5757014351631946), - ('posted', 1.5757014351631946), - ('radford', 1.5757014351631946), - ('university', 1.5757014351631946), - ('wore', 1.5757014351631946), - ('_via_', 1.5752258220302042), - ('admissibility', 1.5752258220302042), - ('attire', 1.5752258220302042), - ('berries', 1.5752258220302042), - ('borrows', 1.5752258220302042), - ('breeches', 1.5752258220302042), - ('cline', 1.5752258220302042), - ('continuance', 1.5752258220302042), - ('currents', 1.5752258220302042), - ('daguerreotype', 1.5752258220302042), - ('disclaimer', 1.5752258220302042), - ('enrolled', 1.5752258220302042), - ('fool', 1.5752258220302042), - ('guineas', 1.5752258220302042), - ('hatchet', 1.5752258220302042), - ('instruct', 1.5752258220302042), - ('liability', 1.5752258220302042), - ('paullin', 1.5752258220302042), - ('performing', 1.5752258220302042), - ('polite', 1.5752258220302042), - ('religion', 1.5752258220302042), - ('rulings', 1.5752258220302042), - ('scammon', 1.5752258220302042), - ('tilda', 1.5752258220302042), - ('toma', 1.5752258220302042), - ('user', 1.5752258220302042), - ('wake', 1.5752258220302042), - ('warranties', 1.5752258220302042), - ('boston', 1.5614599080219351), - ('barrett', 1.5467512742732095), - ('lamon', 1.5401992915219354), - ('attitude', 1.5396869613721145), - ('life_', 1.5325431231066866), - ('chiniquy', 1.517252207711791), - ('bridge', 1.4987002321451297), - ('london', 1.4959606690277452), - ('pair', 1.4859741220167577), - ('banks', 1.4859741220167575), - ('abraham', 1.4788865317609083), - ('org', 1.4762084064880483), - ('literary', 1.4661381734947168), - ('bank', 1.460987504878338), - ('copy', 1.447991916287799), - ('railroad', 1.447589893332354), - ('armstrong', 1.4466729287651239), - ('rr', 1.414281759111378), - ('island', 1.410485371800411), - ('paragraph', 1.4097636251568062), - ('axe', 1.4028326283716357), - ('fence', 1.4028326283716357), - ('genuine', 1.4028326283716357), - ('journalism', 1.4028326283716357), - ('copies', 1.3883829009256057), - ('copper', 1.3883829009256057), - ('delegates', 1.3883829009256057), - ('distributing', 1.3883829009256057), - ('mifflin', 1.3883829009256057), - ('weekly_', 1.3883829009256057), - ('mother', 1.3721178797155553), - ('terms', 1.3614959149155839), - ('http', 1.3614628722331044), - ('historical', 1.3605563596000985), - ('publication', 1.3605563596000985), - ('provide', 1.360556359600098), - ('nicolay', 1.342899579830354), - ('p', 1.3384146299403934), - ('buckskin', 1.3266789355958883), - ('circular', 1.3266789355958883), - ('spink', 1.3266789355958883), - ('trunks', 1.3266789355958883), - ('generosity', 1.3223622526418946), - ('sells', 1.3183507586865963), - ('sons', 1.3183507586865963), - ('compliance', 1.3011906621704081), - ('crawford', 1.3011906621704081), - ('currency', 1.3011906621704081), - ('distribution', 1.3011906621704081), - ('frederick', 1.3011906621704081), - ('harvey', 1.3011906621704081), - ('individual', 1.3011906621704081), - ('massachusetts', 1.3011906621704081), - ('preacher', 1.3011906621704081), - ('priest', 1.3011906621704081), - ('scripps', 1.3011906621704081), - ('wona', 1.3011906621704081), - ('fee', 1.2951177274528036), - ('volumes', 1.2881294518121198), - ('baker', 1.2868805045464513), - ('river', 1.2845212649561222), - ('voyage', 
1.2735521297403745), - ('tarbell', 1.2734860800899708), - ('browne', 1.2673814449958232), - ('herndon', 1.2611515180923591), - ('captain', 1.2566120240054834), - ('including', 1.2566120240054834), - ('she', 1.2523227962342451), - ('chicago', 1.2369612208874359), - ('company', 1.2280833162965425), - ('trade', 1.227264049589322), - ('publishing', 1.2222105265071501), - ('j', 1.20951426463863), - ('hanks', 1.2063558506421344), - ('cartwright', 1.2016275690670342), - ('judd', 1.2016275690670342), - ('mcclure', 1.2016275690670342), - ('permission', 1.2016275690670342), - ('sarah', 1.2016275690670342), - ('_the', 1.1993246703295348), - ('thomas', 1.192162263570947), - ('father', 1.182378500488939), - ('_weekly_', 1.1719588078321554), - ('_womana', 1.1719588078321554), - ('argue', 1.1719588078321554), - ('baddeley', 1.1719588078321554), - ('companion_', 1.1719588078321554), - ('copying', 1.1719588078321554), - ('crafton', 1.1719588078321554), - ('defect', 1.1719588078321554), - ('donate', 1.1719588078321554), - ('draft', 1.1719588078321554), - ('easier', 1.1719588078321554), - ('editions', 1.1719588078321554), - ('hammond', 1.1719588078321554), - ('hawley', 1.1719588078321554), - ('jake', 1.1719588078321554), - ('lightning', 1.1719588078321554), - ('paragraphs', 1.1719588078321554), - ('pg', 1.1719588078321554), - ('pork', 1.1719588078321554), - ('retains', 1.1719588078321554), - ('rod', 1.1719588078321554), - ('royalty', 1.1719588078321554), - ('securities', 1.1719588078321554), - ('shorter', 1.1719588078321554), - ('trousers', 1.1719588078321554), - ('unpublished', 1.1719588078321554), - ('agree', 1.1685160987957408), - ('moore', 1.1638374407328813), - ('brooks', 1.1590654105620253), - ('_early', 1.1547587616319834), - ('tarbella', 1.1547587616319834), - ('harrison', 1.1477375460464634), - ('kentucky', 1.1477375460464634), - ('dress', 1.1403494446079012), - ('german', 1.1403494446079012), - ('g', 1.1400041324991679), - ('you', 1.1197848310740541), - ('convention', 1.1170552756570524), - ('anecdotes', 1.1113491241476279), - ('deed', 1.10266861521132), - ('east', 1.10266861521132), - ('medium', 1.10266861521132), - ('spurious', 1.10266861521132), - ('stranger', 1.10266861521132), - ('atkinson', 1.1026686152113196), - ('comply', 1.1026686152113196), - ('witness', 1.0987403589682891), - ('rock', 1.0980116268282147), - ('biographical', 1.0936719125309864), - ('agent', 1.0936719125309862), - ('charter', 1.0936719125309862), - ('distribute', 1.0936719125309862), - ('_life_', 1.0861326250716679), - ('mississippi', 1.0861326250716679), - ('her', 1.0744523982065441), - ('james', 1.0718364842031898), - ('road', 1.0678271889746043), - ('january', 1.06299555570871), - ('plaintiff', 1.0622990427339003), - ('cents', 1.0601542260041765), - ('philadelphia', 1.054457748248602), - ('trailor', 1.054457748248602), - ('news', 1.0544577482486015), - ('guilty', 1.0523002937359087), - ('whitneya', 1.0523002937359087), - ('limited', 1.0523002937359083), - ('fees', 1.050421450259024), - ('f', 1.0470121250222224), - ('votes', 1.0462712423302567), - ('domain', 1.0459885068374677), - ('gentry', 1.0459885068374677), - ('grandfather', 1.0459885068374677), - ('voted', 1.0459885068374677), - ('speeches', 1.0440910909593955), - ('johnston', 1.0350643207520633), - ('swett', 1.0337988457068894), - ('john', 1.029145368980953), - ('note', 1.0290759889993701), - ('new', 1.0285274933806043), - ('d', 1.0276105644209155), - ('surveyor', 1.0234220417885176), - ('letter', 1.0221155682246605), - ('anecdote', 1.0217461799727077), - 
('dungee', 1.0175064885113527), - ('notes', 1.015958543336191), - ('charles', 1.0118735044527019)] - - -When this option is used, it is possible to calculate a threshold -automatically from the number of blocks - - - -.. code-block:: default - - print(mz_keywords(text,scores=True,weighted=False,threshold="auto")) - - - - - -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - [('gutenberg', 3.813054848640599), - ('project', 3.573855036862196), - ('tm', 3.5734630161654266), - ('co', 3.188187179789419), - ('foundation', 2.9349504275296248), - ('dogskin', 2.767166394411781), - ('electronic', 2.712759445340285), - ('donations', 2.5598097474452906), - ('foxboro', 2.552819829558231), - ('access', 2.534996621584064), - ('gloves', 2.534996621584064), - ('_works_', 2.519083905903437), - ('iv', 2.4068950059833725), - ('v', 2.376066199199476), - ('license', 2.32674033665853), - ('works', 2.320294093790008), - ('replacement', 2.297629530050557), - ('e', 2.1840002559354215), - ('coon', 2.1754936158294536), - ('volunteers', 2.1754936158294536), - ('york', 2.172102058646223), - ('ii', 2.143421998464259), - ('edited', 2.110161739139703), - ('refund', 2.100145067024387), - ('iii', 2.052633589900031), - ('bounded', 1.9832369322912882), - ('format', 1.9832369322912882), - ('jewelry', 1.9832369322912882), - ('metzker', 1.9832369322912882), - ('millions', 1.9832369322912882), - ('ragsdale', 1.9832369322912882), - ('specie', 1.9832369322912882), - ('archive', 1.9430792440279312), - ('reminiscences', 1.9409656357162346), - ('agreement', 1.933113430461269), - ('bonds', 1.90404582584515), - ('ebooks', 1.90404582584515), - ('jewelersa', 1.90404582584515), - ('brokaw', 1.9027974079098768), - ('ebook', 1.8911101680056084), - ('trademark', 1.8911101680056084), - ('parker', 1.8903494446079012), - ('almanac', 1.8267945764711788), - ('ross', 1.771449419244092), - ('work', 1.7368893093546554), - ('college', 1.72245395873311), - ('scott', 1.6666549709515948), - ('rothschild', 1.6615406993510273), - ('pglaf', 1.6528326283716357), - ('ana', 1.6345239955037414), - ('green', 1.634270040746932), - ('forquer', 1.6183315401308644), - ('improvementa', 1.6183315401308644), - ('hardin', 1.5967140500447887), - ('copyright', 1.5827844444400303), - ('houghton', 1.5827785818223203), - ('clair', 1.5757014351631946), - ('claya', 1.5757014351631946), - ('displaying', 1.5757014351631946), - ('fisher', 1.5757014351631946), - ('forgery', 1.5757014351631946), - ('holder', 1.5757014351631946), - ('ninea', 1.5757014351631946), - ('posted', 1.5757014351631946), - ('radford', 1.5757014351631946), - ('university', 1.5757014351631946), - ('wore', 1.5757014351631946), - ('_via_', 1.5752258220302042), - ('admissibility', 1.5752258220302042), - ('attire', 1.5752258220302042), - ('berries', 1.5752258220302042), - ('borrows', 1.5752258220302042), - ('breeches', 1.5752258220302042), - ('cline', 1.5752258220302042), - ('continuance', 1.5752258220302042), - ('currents', 1.5752258220302042), - ('daguerreotype', 1.5752258220302042), - ('disclaimer', 1.5752258220302042), - ('enrolled', 1.5752258220302042), - ('fool', 1.5752258220302042), - ('guineas', 1.5752258220302042), - ('hatchet', 1.5752258220302042), - ('instruct', 1.5752258220302042), - ('liability', 1.5752258220302042), - ('paullin', 1.5752258220302042), - ('performing', 1.5752258220302042), - ('polite', 1.5752258220302042), - ('religion', 1.5752258220302042), - ('rulings', 1.5752258220302042), - ('scammon', 1.5752258220302042), - ('tilda', 1.5752258220302042), - ('toma', 
1.5752258220302042), - ('user', 1.5752258220302042), - ('wake', 1.5752258220302042), - ('warranties', 1.5752258220302042), - ('boston', 1.5614599080219351), - ('barrett', 1.5467512742732095), - ('lamon', 1.5401992915219354), - ('attitude', 1.5396869613721145), - ('life_', 1.5325431231066866), - ('chiniquy', 1.517252207711791), - ('bridge', 1.4987002321451297), - ('london', 1.4959606690277452), - ('pair', 1.4859741220167577), - ('banks', 1.4859741220167575), - ('abraham', 1.4788865317609083), - ('org', 1.4762084064880483), - ('literary', 1.4661381734947168), - ('bank', 1.460987504878338), - ('copy', 1.447991916287799), - ('railroad', 1.447589893332354), - ('armstrong', 1.4466729287651239), - ('rr', 1.414281759111378), - ('island', 1.410485371800411), - ('paragraph', 1.4097636251568062), - ('axe', 1.4028326283716357), - ('fence', 1.4028326283716357), - ('genuine', 1.4028326283716357), - ('journalism', 1.4028326283716357), - ('copies', 1.3883829009256057), - ('copper', 1.3883829009256057), - ('delegates', 1.3883829009256057), - ('distributing', 1.3883829009256057), - ('mifflin', 1.3883829009256057), - ('weekly_', 1.3883829009256057), - ('mother', 1.3721178797155553), - ('terms', 1.3614959149155839), - ('http', 1.3614628722331044), - ('historical', 1.3605563596000985), - ('publication', 1.3605563596000985), - ('provide', 1.360556359600098), - ('nicolay', 1.342899579830354), - ('p', 1.3384146299403934), - ('buckskin', 1.3266789355958883), - ('circular', 1.3266789355958883), - ('spink', 1.3266789355958883), - ('trunks', 1.3266789355958883), - ('generosity', 1.3223622526418946), - ('sells', 1.3183507586865963), - ('sons', 1.3183507586865963), - ('compliance', 1.3011906621704081), - ('crawford', 1.3011906621704081), - ('currency', 1.3011906621704081), - ('distribution', 1.3011906621704081), - ('frederick', 1.3011906621704081), - ('harvey', 1.3011906621704081), - ('individual', 1.3011906621704081), - ('massachusetts', 1.3011906621704081), - ('preacher', 1.3011906621704081), - ('priest', 1.3011906621704081), - ('scripps', 1.3011906621704081), - ('wona', 1.3011906621704081), - ('fee', 1.2951177274528036), - ('volumes', 1.2881294518121198), - ('baker', 1.2868805045464513), - ('river', 1.2845212649561222), - ('voyage', 1.2735521297403745), - ('tarbell', 1.2734860800899708), - ('browne', 1.2673814449958232), - ('herndon', 1.2611515180923591), - ('captain', 1.2566120240054834), - ('including', 1.2566120240054834), - ('she', 1.2523227962342451), - ('chicago', 1.2369612208874359), - ('company', 1.2280833162965425), - ('trade', 1.227264049589322), - ('publishing', 1.2222105265071501), - ('j', 1.20951426463863), - ('hanks', 1.2063558506421344), - ('cartwright', 1.2016275690670342), - ('judd', 1.2016275690670342), - ('mcclure', 1.2016275690670342), - ('permission', 1.2016275690670342), - ('sarah', 1.2016275690670342), - ('_the', 1.1993246703295348), - ('thomas', 1.192162263570947), - ('father', 1.182378500488939), - ('_weekly_', 1.1719588078321554), - ('_womana', 1.1719588078321554), - ('argue', 1.1719588078321554), - ('baddeley', 1.1719588078321554), - ('companion_', 1.1719588078321554), - ('copying', 1.1719588078321554), - ('crafton', 1.1719588078321554), - ('defect', 1.1719588078321554), - ('donate', 1.1719588078321554), - ('draft', 1.1719588078321554), - ('easier', 1.1719588078321554), - ('editions', 1.1719588078321554), - ('hammond', 1.1719588078321554), - ('hawley', 1.1719588078321554), - ('jake', 1.1719588078321554), - ('lightning', 1.1719588078321554), - ('paragraphs', 1.1719588078321554), - ('pg', 
1.1719588078321554), - ('pork', 1.1719588078321554), - ('retains', 1.1719588078321554), - ('rod', 1.1719588078321554), - ('royalty', 1.1719588078321554), - ('securities', 1.1719588078321554), - ('shorter', 1.1719588078321554), - ('trousers', 1.1719588078321554), - ('unpublished', 1.1719588078321554), - ('agree', 1.1685160987957408), - ('moore', 1.1638374407328813), - ('brooks', 1.1590654105620253), - ('_early', 1.1547587616319834), - ('tarbella', 1.1547587616319834), - ('harrison', 1.1477375460464634), - ('kentucky', 1.1477375460464634), - ('dress', 1.1403494446079012), - ('german', 1.1403494446079012), - ('g', 1.1400041324991679), - ('you', 1.1197848310740541), - ('convention', 1.1170552756570524), - ('anecdotes', 1.1113491241476279), - ('deed', 1.10266861521132), - ('east', 1.10266861521132), - ('medium', 1.10266861521132), - ('spurious', 1.10266861521132), - ('stranger', 1.10266861521132), - ('atkinson', 1.1026686152113196), - ('comply', 1.1026686152113196), - ('witness', 1.0987403589682891), - ('rock', 1.0980116268282147), - ('biographical', 1.0936719125309864), - ('agent', 1.0936719125309862), - ('charter', 1.0936719125309862), - ('distribute', 1.0936719125309862), - ('_life_', 1.0861326250716679), - ('mississippi', 1.0861326250716679), - ('her', 1.0744523982065441), - ('james', 1.0718364842031898), - ('road', 1.0678271889746043), - ('january', 1.06299555570871), - ('plaintiff', 1.0622990427339003), - ('cents', 1.0601542260041765), - ('philadelphia', 1.054457748248602), - ('trailor', 1.054457748248602), - ('news', 1.0544577482486015), - ('guilty', 1.0523002937359087), - ('whitneya', 1.0523002937359087), - ('limited', 1.0523002937359083), - ('fees', 1.050421450259024), - ('f', 1.0470121250222224), - ('votes', 1.0462712423302567), - ('domain', 1.0459885068374677), - ('gentry', 1.0459885068374677), - ('grandfather', 1.0459885068374677), - ('voted', 1.0459885068374677), - ('speeches', 1.0440910909593955), - ('johnston', 1.0350643207520633), - ('swett', 1.0337988457068894), - ('john', 1.029145368980953), - ('note', 1.0290759889993701), - ('new', 1.0285274933806043), - ('d', 1.0276105644209155), - ('surveyor', 1.0234220417885176), - ('letter', 1.0221155682246605), - ('anecdote', 1.0217461799727077), - ('dungee', 1.0175064885113527), - ('notes', 1.015958543336191), - ('charles', 1.0118735044527019), - ('counterfeit', 0.999988304284928), - ('xvi', 0.999988304284928), - ('store', 0.9994804834557804), - ('_amount_', 0.9963302125628715), - ('_black', 0.9963302125628715), - ('_magazine', 0.9963302125628715), - ('_sun_', 0.9963302125628715), - ('adjourning', 0.9963302125628715), - ('advertiser', 0.9963302125628715), - ('advertisers', 0.9963302125628715), - ('agnosticism', 0.9963302125628715), - ('animals', 0.9963302125628715), - ('apparel', 0.9963302125628715), - ('appoints', 0.9963302125628715), - ('arbitrations', 0.9963302125628715), - ('ascii', 0.9963302125628715), - ('aspirants', 0.9963302125628715), - ('atrocious', 0.9963302125628715), - ('attracts', 0.9963302125628715), - ('authorsa', 0.9963302125628715), - ('band', 0.9963302125628715), - ('bargained', 0.9963302125628715), - ('battles', 0.9963302125628715), - ('bets', 0.9963302125628715), - ('bleeding', 0.9963302125628715), - ('boats', 0.9963302125628715), - ('book_', 0.9963302125628715), - ('boss', 0.9963302125628715), - ('bull', 0.9963302125628715), - ('calf', 0.9963302125628715), - ('chase', 0.9963302125628715), - ('chicanery', 0.9963302125628715), - ('coach', 0.9963302125628715), - ('comet', 0.9963302125628715), - ('computer', 
0.9963302125628715), - ('computers', 0.9963302125628715), - ('concentration', 0.9963302125628715), - ('conquering', 0.9963302125628715), - ('conservator', 0.9963302125628715), - ('copied', 0.9963302125628715), - ('cord', 0.9963302125628715), - ('cornell', 0.9963302125628715), - ('countenance', 0.9963302125628715), - ('counting', 0.9963302125628715), - ('countryman', 0.9963302125628715), - ('creeks', 0.9963302125628715), - ('davy', 0.9963302125628715), - ('decatur', 0.9963302125628715), - ('deer', 0.9963302125628715), - ('defa', 0.9963302125628715), - ('delegations', 0.9963302125628715), - ('deliveries', 0.9963302125628715), - ('demurrer', 0.9963302125628715), - ('describing', 0.9963302125628715), - ('desires', 0.9963302125628715), - ('directors', 0.9963302125628715), - ('disallows', 0.9963302125628715), - ('disgracing', 0.9963302125628715), - ('doctoring', 0.9963302125628715), - ('dogskina', 0.9963302125628715), - ('effectively', 0.9963302125628715), - ('elections', 0.9963302125628715), - ('electronically', 0.9963302125628715), - ('employees', 0.9963302125628715), - ('emulates', 0.9963302125628715), - ('enrolling', 0.9963302125628715), - ('errands', 0.9963302125628715), - ('faded', 0.9963302125628715), - ('fergus', 0.9963302125628715), - ('flatboat', 0.9963302125628715), - ('forehead', 0.9963302125628715), - ('fort', 0.9963302125628715), - ('generals', 0.9963302125628715), - ('goose', 0.9963302125628715), - ('greed', 0.9963302125628715), - ('groomsman', 0.9963302125628715), - ('hagerty', 0.9963302125628715), - ('hans', 0.9963302125628715), - ('harvard', 0.9963302125628715), - ('haute', 0.9963302125628715), - ('heel', 0.9963302125628715), - ('history_', 0.9963302125628715), - ('homestead', 0.9963302125628715), - ('hut', 0.9963302125628715), - ('ice', 0.9963302125628715), - ('ida', 0.9963302125628715), - ('identical', 0.9963302125628715), - ('imperialist', 0.9963302125628715), - ('irons', 0.9963302125628715), - ('janet', 0.9963302125628715), - ('jr', 0.9963302125628715), - ('justification', 0.9963302125628715), - ('lambs', 0.9963302125628715), - ('latin', 0.9963302125628715), - ('linen', 0.9963302125628715), - ('louder', 0.9963302125628715), - ('mad', 0.9963302125628715), - ('madison', 0.9963302125628715), - ('maid', 0.9963302125628715), - ('martyr', 0.9963302125628715), - ('metaphysical', 0.9963302125628715), - ('mit', 0.9963302125628715), - ('monthlies', 0.9963302125628715), - ('moods', 0.9963302125628715), - ('moorea', 0.9963302125628715), - ('naed', 0.9963302125628715), - ('nest', 0.9963302125628715), - ('nigger', 0.9963302125628715), - ('package', 0.9963302125628715), - ('pan', 0.9963302125628715), - ('parentage', 0.9963302125628715), - ('partly', 0.9963302125628715), - ('passengers', 0.9963302125628715), - ('pastimes', 0.9963302125628715), - ('pla', 0.9963302125628715), - ('playful', 0.9963302125628715), - ('pony', 0.9963302125628715), - ('population', 0.9963302125628715), - ('postponed', 0.9963302125628715), - ('postponement', 0.9963302125628715), - ('premise', 0.9963302125628715), - ('pressure', 0.9963302125628715), - ('presumption', 0.9963302125628715), - ('preventing', 0.9963302125628715), - ('puffsa', 0.9963302125628715), - ('quart', 0.9963302125628715), - ('quincy', 0.9963302125628715), - ('quorum', 0.9963302125628715), - ('reckoneda', 0.9963302125628715), - ('redistribution', 0.9963302125628715), - ('registered', 0.9963302125628715), - ('remit', 0.9963302125628715), - ('rifle', 0.9963302125628715), - ('rothschild_', 0.9963302125628715), - ('rowa', 0.9963302125628715), - 
('rubbish', 0.9963302125628715), - ('sacrifices', 0.9963302125628715), - ('scroll', 0.9963302125628715), - ('shade', 0.9963302125628715), - ('shed', 0.9963302125628715), - ('sigh', 0.9963302125628715), - ('silk', 0.9963302125628715), - ('sinewy', 0.9963302125628715), - ('sock', 0.9963302125628715), - ('solicit', 0.9963302125628715), - ('solvent', 0.9963302125628715), - ('sonny', 0.9963302125628715), - ('specified', 0.9963302125628715), - ('startling', 0.9963302125628715), - ('steals', 0.9963302125628715), - ('stevenson', 0.9963302125628715), - ('subpa', 0.9963302125628715), - ('subsequently', 0.9963302125628715), - ('surface', 0.9963302125628715), - ('tanned', 0.9963302125628715), - ('tea', 0.9963302125628715), - ('terre', 0.9963302125628715), - ('theosophy', 0.9963302125628715), - ('tight', 0.9963302125628715), - ('tis', 0.9963302125628715), - ('tour', 0.9963302125628715), - ('trailors', 0.9963302125628715), - ('vanilla', 0.9963302125628715), - ('vol', 0.9963302125628715), - ('warranty', 0.9963302125628715), - ('watkinsa', 0.9963302125628715), - ('wayne', 0.9963302125628715), - ('weekly', 0.9963302125628715), - ('whip', 0.9963302125628715), - ('woodcut', 0.9963302125628715), - ('wright', 0.9963302125628715)] - - -The complexity of the algorithm is **O**\ (\ *Nw*\ ), where *N* is the number -of words in the document and *w* is the number of unique words. - - - -.. rst-class:: sphx-glr-timing - - **Total running time of the script:** ( 0 minutes 16.214 seconds) - -**Estimated memory usage:** 15 MB - - -.. _sphx_glr_download_auto_examples_tutorials_run_summarization.py: - - -.. only :: html - - .. container:: sphx-glr-footer - :class: sphx-glr-footer-example - - - - .. container:: sphx-glr-download - - :download:`Download Python source code: run_summarization.py ` - - - - .. container:: sphx-glr-download - - :download:`Download Jupyter notebook: run_summarization.ipynb ` - - -.. only:: html - - .. 
rst-class:: sphx-glr-signature - - `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/src/auto_examples/tutorials/sg_execution_times.rst b/docs/src/auto_examples/tutorials/sg_execution_times.rst index 8d1b5d70c6..af55f3f18a 100644 --- a/docs/src/auto_examples/tutorials/sg_execution_times.rst +++ b/docs/src/auto_examples/tutorials/sg_execution_times.rst @@ -7,22 +7,16 @@ Computation times ================= **00:07.863** total execution time for **auto_examples_tutorials** files: -+-----------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` (``run_doc2vec_lee.py``) | 00:07.863 | 37.1 MB | -+-----------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``) | 00:00.000 | 0.0 MB | -+-----------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_distance_metrics.py` (``run_distance_metrics.py``) | 00:00.000 | 0.0 MB | -+-----------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` (``run_fasttext.py``) | 00:00.000 | 0.0 MB | -+-----------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` (``run_lda.py``) | 00:00.000 | 0.0 MB | -+-----------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_pivoted_doc_norm.py` (``run_pivoted_doc_norm.py``) | 00:00.000 | 0.0 MB | -+-----------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_summarization.py` (``run_summarization.py``) | 00:00.000 | 0.0 MB | -+-----------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` (``run_wmd.py``) | 00:00.000 | 0.0 MB | -+-----------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` (``run_word2vec.py``) | 00:00.000 | 0.0 MB | -+-----------------------------------------------------------------------------------------------+-----------+---------+ ++-----------------------------------------------------------------------------------------------+-----------+----------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``) | 14:40.672 | 752.8 MB | ++-----------------------------------------------------------------------------------------------+-----------+----------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` (``run_doc2vec_lee.py``) | 00:00.000 | 0.0 MB | ++-----------------------------------------------------------------------------------------------+-----------+----------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` (``run_fasttext.py``) | 00:00.000 | 0.0 MB | ++-----------------------------------------------------------------------------------------------+-----------+----------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` (``run_lda.py``) | 00:00.000 | 0.0 MB | 
++-----------------------------------------------------------------------------------------------+-----------+----------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` (``run_wmd.py``) | 00:00.000 | 0.0 MB | ++-----------------------------------------------------------------------------------------------+-----------+----------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` (``run_word2vec.py``) | 00:00.000 | 0.0 MB | ++-----------------------------------------------------------------------------------------------+-----------+----------+ diff --git a/docs/src/conf.py b/docs/src/conf.py index 75fa293f14..13d26e254b 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -256,10 +256,8 @@ def sort_key(source_dir): 'run_fasttext.py', 'run_annoy.py', 'run_lda.py', - 'run_distance_metrics.py', 'run_wmd.py', 'run_summarization.py', - 'run_pivoted_doc_norm.py', ] howto_order = [ diff --git a/docs/src/gallery/other/README.txt b/docs/src/gallery/other/README.txt index d48ee9023d..5b23b3c20f 100644 --- a/docs/src/gallery/other/README.txt +++ b/docs/src/gallery/other/README.txt @@ -19,7 +19,6 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from - ? `Colouring words by topic in a document, print words in a topics `__ - ? `Topic Coherence, a metric that correlates that human judgement on topic quality. `__ - - ? `Compare topics and documents using Jaccard, Kullback-Leibler and Hellinger similarities `__ - ? `America's Next Topic Model slides `__ - How to choose your next topic model, presented at Pydata Berlin 10 August 2016 by Lev Konstantinovsky - ? `Dynamic Topic Modeling and Dynamic Influence Model Tutorial `__ diff --git a/docs/src/gallery/tutorials/run_distance_metrics.py b/docs/src/gallery/tutorials/run_distance_metrics.py deleted file mode 100644 index 30567500ba..0000000000 --- a/docs/src/gallery/tutorials/run_distance_metrics.py +++ /dev/null @@ -1,337 +0,0 @@ -r""" -Distance Metrics -================ - -Introduces the concept of distance between document representations, and demonstrates its calculation using Gensim. - -""" - -import logging -logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - -############################################################################### -# If you simply want to calculate the similarity between documents, then you -# may want to check out the `Similarity Queries Tutorial -# `_ and the `API reference for similarities -# `_. The current -# tutorial shows the building block of these larger methods, which are a small -# suite of distance metrics. -# -# Here's a brief summary of this tutorial: -# -# 1. Set up a small corpus consisting of documents belonging to one of two topics -# 2. Train an LDA model to distinguish between the two topics -# 3. Use the model to obtain distributions for some sample words -# 4. Compare the distributions to each other using a variety of distance metrics: -# -# * Hellinger distance -# * Jaccard coefficient -# -# 5. 
Discuss the concept of distance metrics in slightly more detail -# -from gensim.corpora import Dictionary - -# you can use any corpus, this is just illustratory -texts = [ - ['bank', 'river', 'shore', 'water'], - ['river', 'water', 'flow', 'fast', 'tree'], - ['bank', 'water', 'fall', 'flow'], - ['bank', 'bank', 'water', 'rain', 'river'], - ['river', 'water', 'mud', 'tree'], - ['money', 'transaction', 'bank', 'finance'], - ['bank', 'borrow', 'money'], - ['bank', 'finance'], - ['finance', 'money', 'sell', 'bank'], - ['borrow', 'sell'], - ['bank', 'loan', 'sell'], -] - -dictionary = Dictionary(texts) -corpus = [dictionary.doc2bow(text) for text in texts] - -import numpy -numpy.random.seed(1) # setting random seed to get the same results each time. - -from gensim.models import ldamodel -model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2, minimum_probability=1e-8) -model.show_topics() - -############################################################################### -# Let's call the 1st topic the **water** topic and the second topic the **finance** topic. -# -# Let's take a few sample documents and get them ready to test our distance functions. -# -doc_water = ['river', 'water', 'shore'] -doc_finance = ['finance', 'money', 'sell'] -doc_bank = ['finance', 'bank', 'tree', 'water'] - -# Now let's transform these into a bag of words format. -bow_water = model.id2word.doc2bow(doc_water) -bow_finance = model.id2word.doc2bow(doc_finance) -bow_bank = model.id2word.doc2bow(doc_bank) - -# We can now get the LDA topic distributions for these. -lda_bow_water = model[bow_water] -lda_bow_finance = model[bow_finance] -lda_bow_bank = model[bow_bank] - -############################################################################### -# Hellinger -# --------- -# -# We're now ready to apply our distance metrics. -# These metrics return a value between 0 and 1, where values closer to 0 indicate a -# smaller distance and therefore a larger similarity. -# -# Let's start with the popular Hellinger distance. -# -# The Hellinger distance metric is symmetric and gives an output in the range [0,1] -# for two probability distributions. Values closer to 0 mean "more similar". -# -from gensim.matutils import hellinger -print(hellinger(lda_bow_water, lda_bow_finance)) -print(hellinger(lda_bow_finance, lda_bow_bank)) - -############################################################################### -# Makes sense, right? In the first example, Document 1 and Document 2 are hardly similar, so we get a value of roughly 0.5. -# -# In the second case, the documents are a lot more semantically similar, so their distance is lower. -# - -############################################################################### -# -# In our previous examples we saw that there were lower distance values between -# ``bank`` and ``finance`` than for ``bank`` and ``water``, even if it wasn't by a huge margin. -# What does this mean? -# -# The ``bank`` document is a combination of both water and finance related -# terms - but as bank in this context is likely to belong to the finance topic, -# the distance values are less between the finance and bank bows. -# - -# just to confirm our suspicion that the bank bow is more to do with finance: -model.get_document_topics(bow_bank) - -############################################################################### -# -# It's evident that while it isn't too skewed, it it more towards the finance topic. 
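For readers who want the arithmetic behind the hellinger() calls in the removed tutorial above, here is a minimal standalone sketch of the Hellinger formula over two dense probability vectors. It is an illustration added alongside the patch, not part of the original file; the hellinger_dense name and the vectors are made up, not output of the LDA model trained above.

import numpy as np

def hellinger_dense(p, q):
    # Hellinger distance between two discrete probability distributions:
    # sqrt(0.5 * sum_i (sqrt(p_i) - sqrt(q_i))**2), always in [0, 1].
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))

# Two made-up topic distributions over the same two topics (water, finance).
water_heavy = [0.9, 0.1]
finance_heavy = [0.2, 0.8]

print(hellinger_dense(water_heavy, finance_heavy))  # dissimilar documents -> distance well above 0
print(hellinger_dense(water_heavy, water_heavy))    # identical distributions -> 0.0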
-# - -############################################################################### -# Jaccard coefficient -# ------------------- -# -# Let's now look at the `Jaccard Distance -# `_ (also Jaccard index, Jaccard coefficient) -# for calculating the similarity between two documents represented as two bags-of-words vectors. -# -from gensim.matutils import jaccard - -print(jaccard(bow_water, bow_bank)) -print(jaccard(doc_water, doc_bank)) -print(jaccard(['word'], ['word'])) - -############################################################################### -# The three examples above feature 2 different input methods. -# -# In the first case, we present document vectors already in bag of -# words format. The distance can be defined as 1 minus the size of the -# intersection upon the size of the union of the vectors. -# -# We can see (on manual inspection as well), that the distance is likely to be -# high - and it is. -# -# The last two examples illustrate the ability for Jaccard distance to accept even lists -# of words (i.e, documents) as inputs. -# -# In the last case, because they are the same vectors, so the value returned is 0 -# - this means the distance is 0 and the two documents are identical. -# - -############################################################################### -# -# Distance Metrics for Topic Distributions -# ---------------------------------------- -# -# While there are already standard methods to identify similarity of documents, -# our distance metrics has one more interesting use-case: topic distributions. -# -# Let's say we want to find out how similar our two topics are, ``water`` and ``finance``. -# -topic_water, topic_finance = model.show_topics() - -# Preprocess to get the topics in a format accepted by our distance metric functions. - -def parse_topic_string(topic): - """Split a string returned by model.show_topics() into topics and their probabilities.""" - topic = topic.split('+') - topic_bow = [] - for word in topic: - # split the probability from word - prob, word = word.split('*') - # get rid of spaces and quote marks - word = word.replace(" ", "").replace('"', '') - # convert the word (string) to its dictionary index (int) - word = model.id2word.token2id[word] - topic_bow.append((word, float(prob))) - return topic_bow - -finance_distribution = parse_topic_string(topic_finance[1]) -water_distribution = parse_topic_string(topic_water[1]) - -# the finance topic in the bag-of-words format looks like this: -print(finance_distribution) - -############################################################################### -# Now that we've got our topics in a format acceptable by our functions, -# let's use a Distance metric to see how similar the word distributions in the -# topics are. -# -print(hellinger(water_distribution, finance_distribution)) - -############################################################################### -# Our value of roughly 0.36 means that the topics are not TOO distant with -# respect to their word distributions. -# -# This makes sense again, because of overlapping words like ``bank`` and a -# small size dictionary. -# - - -############################################################################### -# What are Distance Metrics? -# -------------------------- -# -# Having seen the practical usages of these measures (i.e, to find similarity), -# let's learn a little about what exactly Distance Measures and Metrics are. -# -# There -# are 4 conditons for for a distance measure to be a metric: -# -# 1. d(x,y) >= 0 -# 2. 
d(x,y) = 0 <=> x = y -# 3. d(x,y) = d(y,x) -# 4. d(x,z) <= d(x,y) + d(y,z) -# -# That is: it must be non-negative; if x and y are the same, distance must be -# zero; it must be symmetric; and it must obey the triangle inequality law. -# -# Simple enough, right? -# -# Let's test these out for our measures. -# - -# ormal Hellinger distance. -a = hellinger(water_distribution, finance_distribution) -b = hellinger(finance_distribution, water_distribution) -print(a) -print(b) -print(a == b) - -# If we pass the same values, it is zero. -print(hellinger(water_distribution, water_distribution)) - -# For triangle inequality let's use LDA document distributions. -print(hellinger(lda_bow_finance, lda_bow_bank)) - -# Triangle inequality works too! -print(hellinger(lda_bow_finance, lda_bow_water) + hellinger(lda_bow_water, lda_bow_bank)) - - -# For a nice review of the mathematical differences between the Hellinger distance and -# Kullback-Leibler divergence, see for example `here -# `__. -# - - -############################################################################### -# Visualizing Distance Metrics -# ---------------------------- -# -# Let's plot a graph of our toy dataset using the popular `networkx -# `_ library. -# -# Each node will be a document, where the color of the node will be its topic -# according to the LDA model. Edges will connect documents to each other, where -# the *weight* of the edge will be inversely proportional to the Jaccard -# similarity between two documents. We will also annotate the edges to further -# aid visualization: **strong** edges will connect similar documents, and -# **weak (dashed)** edges will connect dissimilar documents. -# -# In summary, similar documents will be closer together, different documents -# will be further apart. -# -import itertools -import networkx as nx - -def get_most_likely_topic(doc): - bow = model.id2word.doc2bow(doc) - topics, probabilities = zip(*model.get_document_topics(bow)) - max_p = max(probabilities) - topic = topics[probabilities.index(max_p)] - return topic - -def get_node_color(i): - return 'skyblue' if get_most_likely_topic(texts[i]) == 0 else 'pink' - -G = nx.Graph() -for i, _ in enumerate(texts): - G.add_node(i) - -for (i1, i2) in itertools.combinations(range(len(texts)), 2): - bow1, bow2 = texts[i1], texts[i2] - distance = jaccard(bow1, bow2) - G.add_edge(i1, i2, weight=1/distance) - -# -# https://networkx.github.io/documentation/networkx-1.9/examples/drawing/weighted_graph.html -# -pos = nx.spring_layout(G) - -threshold = 1.25 -elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > threshold] -esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= threshold] - -node_colors = [get_node_color(i) for (i, _) in enumerate(texts)] -nx.draw_networkx_nodes(G, pos, node_size=700, node_color=node_colors) -nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2) -nx.draw_networkx_edges(G, pos, edgelist=esmall, width=2, alpha=0.2, edge_color='b', style='dashed') -nx.draw_networkx_labels(G, pos, font_size=20, font_family='sans-serif') - -############################################################################### -# We can make several observations from this graph. -# -# First, the graph consists of two connected components (if you ignore the weak edges). -# Nodes 0, 1, 2, 3, 4 (which all belong to the water topic) form the first connected component. -# The other nodes, which all belong to the finance topic, form the second connected component. 
-# -# Second, the LDA model didn't do a very good job of classifying our documents into topics. -# There were many misclassifications, as you can confirm in the summary below: -# -print('id\ttopic\tdoc') -for i, t in enumerate(texts): - print(f'{i}\t{get_most_likely_topic(t)}\t{" ".join(t)}') - -############################################################################### -# This is mostly because the corpus used to train the LDA model is so small. -# Using a larger corpus should hopefully give better results, but that is beyond -# the scope of this tutorial. -# -# Conclusion -# ---------- -# -# That brings us to the end of this small tutorial. -# To recap, here's what we covered: -# -# 1. Set up a small corpus consisting of documents belonging to one of two topics -# 2. Train an LDA model to distinguish between the two topics -# 3. Use the model to obtain distributions for some sample words -# 4. Compare the distributions to each other using the distance metrics of Hellinger distance and Jaccard index -# 5. Discuss the concept of distance metrics in slightly more detail -# -# The scope for adding new similarity metrics is large, as there exist an even -# larger suite of metrics and methods to add to the matutils.py file. -# For more details, see `Similarity Measures for Text Document Clustering -# `_ -# by A. Huang. - diff --git a/docs/src/gallery/tutorials/run_pivoted_doc_norm.py b/docs/src/gallery/tutorials/run_pivoted_doc_norm.py deleted file mode 100644 index 9570595a54..0000000000 --- a/docs/src/gallery/tutorials/run_pivoted_doc_norm.py +++ /dev/null @@ -1,243 +0,0 @@ -r""" -Pivoted Document Length Normalization -===================================== - -This tutorial demonstrates using Pivoted Document Length Normalization to -counter the effect of short document bias when working with TfIdf, thereby -increasing classification accuracy. -""" - -############################################################################### -# In many cases, normalizing the tfidf weights for each term favors weights of terms -# of the documents with shorter length. The *pivoted document length normalization* scheme -# counters the effect of this bias for short documents, by making tfidf independent of the document length. -# -# This is achieved by *tilting* the normalization curve along a pivot point and slope, which -# must be defined by the user. -# -# Roughly following the equation: -# -# ``pivoted_norm = (1 - slope) * pivot + slope * old_norm`` -# -# This scheme is proposed in the paper `Pivoted Document Length Normalization `_ -# by Singhal, Buckley and Mitra. -# -# Overall this approach can increase the accuracy of the model where document lengths are hugely varying across the corpus. -# -# Introduction -# ------------ -# -# This guide demonstrates how to perform pivoted document length normalization. -# -# We will train a logistic regression model to distinguish between text from two different newsgroups. -# -# Our results will show that using pivoted document length normalization yields a better model (higher classification accuracy). -# - -# -# Download our dataset -# -import gensim.downloader as api -nws = api.load("20-newsgroups") - -# -# Pick texts from relevant newsgroups, split into training and test set. -# -cat1, cat2 = ('sci.electronics', 'sci.space') - -# -# X_* contain the actual texts as strings. 
-# Y_* contain labels, 0 for cat1 (sci.electronics) and 1 for cat2 (sci.space) -# -X_train = [] -X_test = [] -y_train = [] -y_test = [] - -for i in nws: - if i["set"] == "train" and i["topic"] == cat1: - X_train.append(i["data"]) - y_train.append(0) - elif i["set"] == "train" and i["topic"] == cat2: - X_train.append(i["data"]) - y_train.append(1) - elif i["set"] == "test" and i["topic"] == cat1: - X_test.append(i["data"]) - y_test.append(0) - elif i["set"] == "test" and i["topic"] == cat2: - X_test.append(i["data"]) - y_test.append(1) - -############################################################################### -# Preprocess the data -# -from gensim.parsing.preprocessing import preprocess_string -from gensim.corpora import Dictionary - -id2word = Dictionary([preprocess_string(doc) for doc in X_train]) -train_corpus = [id2word.doc2bow(preprocess_string(doc)) for doc in X_train] -test_corpus = [id2word.doc2bow(preprocess_string(doc)) for doc in X_test] - -print(len(X_train), len(X_test)) - -# We perform our analysis on top k documents which is almost top 10% most scored documents -k = len(X_test) // 10 - -############################################################################### -# Prepare our evaluation function -# -from gensim.sklearn_api.tfidf import TfIdfTransformer -from sklearn.linear_model import LogisticRegression -from gensim.matutils import corpus2csc - -def get_tfidf_scores(kwargs): - """ - Return a model's accuracy along with individual document probability values, using - Gensim's TfIdfTransformer and sklearn's LogisticRegression. - - """ - tfidf_transformer = TfIdfTransformer(**kwargs).fit(train_corpus) - - X_train_tfidf = corpus2csc(tfidf_transformer.transform(train_corpus), num_terms=len(id2word)).T - X_test_tfidf = corpus2csc(tfidf_transformer.transform(test_corpus), num_terms=len(id2word)).T - - clf = LogisticRegression().fit(X_train_tfidf, y_train) - - model_accuracy = clf.score(X_test_tfidf, y_test) - doc_scores = clf.decision_function(X_test_tfidf) - - return model_accuracy, doc_scores - -############################################################################### -# Get TFIDF scores for corpus without pivoted document length normalisation -# ------------------------------------------------------------------------- -# -params = {} -model_accuracy, doc_scores = get_tfidf_scores(params) -print(model_accuracy) - -############################################################################### -# Examine the bias towards shorter documents -import numpy as np - -# Sort the document scores by their scores and return a sorted list -# of document score and corresponding document lengths. -def sort_length_by_score(doc_scores, X_test): - doc_scores = sorted(enumerate(doc_scores), key=lambda x: x[1]) - doc_leng = np.empty(len(doc_scores)) - - ds = np.empty(len(doc_scores)) - - for i, _ in enumerate(doc_scores): - doc_leng[i] = len(X_test[_[0]]) - ds[i] = _[1] - - return ds, doc_leng - - -print( - f"Normal cosine normalisation favors short documents as our top {k} docs have a smaller " - f"mean doc length of {sort_length_by_score(doc_scores, X_test)[1][:k].mean():.3f} " - f"compared to the corpus mean doc length of {sort_length_by_score(doc_scores, X_test)[1].mean():.3f}" -) - -############################################################################### -# Get TFIDF scores for corpus with pivoted document length normalisation -# ---------------------------------------------------------------------- -# -# Test various values of alpha (slope) and pick the best one. 
-best_model_accuracy = 0 -optimum_slope = 0 -for slope in np.arange(0, 1.1, 0.1): - params = {"pivot": 10, "slope": slope} - - model_accuracy, doc_scores = get_tfidf_scores(params) - - if model_accuracy > best_model_accuracy: - best_model_accuracy = model_accuracy - optimum_slope = slope - - print(f"Score for slope {slope} is {model_accuracy}") - -print(f"We get best score of {best_model_accuracy} at slope {optimum_slope}") - -############################################################################### -# Evaluate the model with optimum slope -# -params = {"pivot": 10, "slope": optimum_slope} -model_accuracy, doc_scores = get_tfidf_scores(params) -print(model_accuracy) - -print( - f"With pivoted normalisation top {k} docs have a mean length of " - f"{sort_length_by_score(doc_scores, X_test)[1][:k].mean():.3f} which is much " - f"closer to the corpus mean doc length of {sort_length_by_score(doc_scores, X_test)[1].mean():.3f}" -) - -############################################################################### -# -# Visualizing the pivoted normalization -# ------------------------------------- -# -# From the plot we can see that when the slope was 1 (i.e. when pivoted normalisation -# was not applied at all), short documents with length of around 500 had very good scores. -# This is a bias for short documents. As we varied the value of slope from 1 to 0 -# we introdcued a new bias for long documents to counter the bias caused by -# cosine normalisation. At a certain point we got an optimum value of -# slope (0.5 here) where the overall accuracy of the model was maximized. -# -import matplotlib.pyplot as py - -best_model_accuracy = 0 -optimum_slope = 0 - -w = 2 -h = 2 -f, axarr = py.subplots(h, w, figsize=(15, 7)) - -it = 0 -for slope in [1, 0.2]: - params = {"pivot": 10, "slope": slope} - - model_accuracy, doc_scores = get_tfidf_scores(params) - - if model_accuracy > best_model_accuracy: - best_model_accuracy = model_accuracy - optimum_slope = slope - - doc_scores, doc_leng = sort_length_by_score(doc_scores, X_test) - - y = abs(doc_scores[:k, np.newaxis]) - x = doc_leng[:k, np.newaxis] - - py.subplot(1, 2, it+1).bar(x, y, width=20, linewidth=0) - py.title(f"Slope = {slope} Model accuracy = {model_accuracy}") - py.ylim([0, 4.5]) - py.xlim([0, 3200]) - py.xlabel("document length") - py.ylabel("confidence score") - - it += 1 - -py.tight_layout() -py.show() - -############################################################################### -# The above histogram plot helps us visualize the effect of ``slope``. For top -# k documents we have document length on the x axis and their respective scores -# of belonging to a specific class on y axis. -# -# As we decrease the slope the density of bins is shifted from low document -# length (around ~250-500) to over ~500 document length. This suggests that the -# positive biasness which was seen at ``slope=1`` (or when regular tfidf was -# used) for short documents is now reduced. We get the optimum slope or the max -# model accuracy when slope is 0.2. 
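Before the tutorial's conclusion, a small standalone sketch may help make the ``pivoted_norm = (1 - slope) * pivot + slope * old_norm`` equation quoted earlier concrete. This is an illustrative sketch only, not the internals of the TfIdfTransformer used above; the pivot, slope and vectors below are invented values.

import numpy as np

def pivoted_normalize(tfidf_vec, pivot, slope):
    # Divide by a norm "tilted" around the pivot instead of the vector's own
    # (cosine) norm; slope=1.0 reduces to plain cosine normalization.
    old_norm = np.linalg.norm(tfidf_vec)
    pivoted_norm = (1 - slope) * pivot + slope * old_norm
    return tfidf_vec / pivoted_norm

short_doc = np.array([3.0, 1.0])            # norm ~3.2, below the pivot
long_doc = np.array([6.0, 8.0, 5.0, 7.0])   # norm ~13.2, above the pivot

# With slope < 1, the short document is divided by more than its own norm
# (damping the short-document advantage) and the long document by less.
print(pivoted_normalize(short_doc, pivot=10, slope=0.2))
print(pivoted_normalize(long_doc, pivot=10, slope=0.2))
print(pivoted_normalize(long_doc, pivot=10, slope=1.0))  # same result as plain cosine normalization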
-# -# Conclusion -# ========== -# -# Using pivoted document normalization improved the classification accuracy a little bit: -# -# * Before (slope=1, identical to default cosine normalization): 0.9682 -# * After (slope=0.2): 0.9771 -# diff --git a/docs/src/gallery/tutorials/run_summarization.py b/docs/src/gallery/tutorials/run_summarization.py deleted file mode 100644 index e5281e1a9b..0000000000 --- a/docs/src/gallery/tutorials/run_summarization.py +++ /dev/null @@ -1,243 +0,0 @@ -r""" -Text Summarization -================== - -Demonstrates summarizing text by extracting the most important sentences from it. - -""" -import logging -logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - -############################################################################### -# This module automatically summarizes the given text, by extracting one or -# more important sentences from the text. In a similar way, it can also extract -# keywords. This tutorial will teach you to use this summarization module via -# some examples. First, we will try a small example, then we will try two -# larger ones, and then we will review the performance of the summarizer in -# terms of speed. -# -# This summarizer is based on the , from an `"TextRank" algorithm by Mihalcea -# et al `_. -# This algorithm was later improved upon by `Barrios et al. -# `_, -# by introducing something called a "BM25 ranking function". -# -# .. important:: -# Gensim's summarization only works for English for now, because the text -# is pre-processed so that stopwords are removed and the words are stemmed, -# and these processes are language-dependent. -# -# Small example -# ------------- -# -# First of all, we import the :py:func:`gensim.summarization.summarize` function. - - -from pprint import pprint as print -from gensim.summarization import summarize - -############################################################################### -# We will try summarizing a small toy example; later we will use a larger piece of text. In reality, the text is too small, but it suffices as an illustrative example. -# - - -text = ( - "Thomas A. Anderson is a man living two lives. By day he is an " - "average computer programmer and by night a hacker known as " - "Neo. Neo has always questioned his reality, but the truth is " - "far beyond his imagination. Neo finds himself targeted by the " - "police when he is contacted by Morpheus, a legendary computer " - "hacker branded a terrorist by the government. Morpheus awakens " - "Neo to the real world, a ravaged wasteland where most of " - "humanity have been captured by a race of machines that live " - "off of the humans' body heat and electrochemical energy and " - "who imprison their minds within an artificial reality known as " - "the Matrix. As a rebel against the machines, Neo must return to " - "the Matrix and confront the agents: super-powerful computer " - "programs devoted to snuffing out Neo and the entire human " - "rebellion. " -) -print(text) - -############################################################################### -# To summarize this text, we pass the **raw string data** as input to the -# function "summarize", and it will return a summary. -# -# Note: make sure that the string does not contain any newlines where the line -# breaks in a sentence. A sentence with a newline in it (i.e. a carriage -# return, "\n") will be treated as two sentences. 
-# - -print(summarize(text)) - -############################################################################### -# -# Use the "split" option if you want a list of strings instead of a single string. -# -print(summarize(text, split=True)) - -############################################################################### -# -# You can adjust how much text the summarizer outputs via the "ratio" parameter -# or the "word_count" parameter. Using the "ratio" parameter, you specify what -# fraction of sentences in the original text should be returned as output. -# Below we specify that we want 50% of the original text (the default is 20%). -# - -print(summarize(text, ratio=0.5)) - -############################################################################### -# -# Using the "word_count" parameter, we specify the maximum amount of words we -# want in the summary. Below we have specified that we want no more than 50 -# words. -# -print(summarize(text, word_count=50)) - -############################################################################### -# As mentioned earlier, this module also supports **keyword** extraction. -# Keyword extraction works in the same way as summary generation (i.e. sentence -# extraction), in that the algorithm tries to find words that are important or -# seem representative of the entire text. They keywords are not always single -# words; in the case of multi-word keywords, they are typically all nouns. -# - -from gensim.summarization import keywords -print(keywords(text)) - -############################################################################### -# Larger example -# -------------- -# -# Let us try an example with a larger piece of text. We will be using a -# synopsis of the movie "The Matrix", which we have taken from `this -# `_ IMDb page. -# -# In the code below, we read the text file directly from a web-page using -# "requests". Then we produce a summary and some keywords. -# - - -import requests - -text = requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text -print(text) - -############################################################################### -# First, the summary -# -print(summarize(text, ratio=0.01)) - - -############################################################################### -# And now, the keywords: -# -print(keywords(text, ratio=0.01)) - -############################################################################### -# If you know this movie, you see that this summary is actually quite good. We -# also see that some of the most important characters (Neo, Morpheus, Trinity) -# were extracted as keywords. -# -# Another example -# --------------- -# -# Let's try an example similar to the one above. This time, we will use the IMDb synopsis -# `The Big Lebowski `_. -# -# Again, we download the text and produce a summary and some keywords. -# - - -text = requests.get('http://rare-technologies.com/the_big_lebowski_synopsis.txt').text -print(text) -print(summarize(text, ratio=0.01)) -print(keywords(text, ratio=0.01)) - -############################################################################### -# This time around, the summary is not of high quality, as it does not tell us -# much about the movie. In a way, this might not be the algorithms fault, -# rather this text simply doesn't contain one or two sentences that capture the -# essence of the text as in "The Matrix" synopsis. -# -# The keywords, however, managed to find some of the main characters. 
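The summarize() and keywords() calls in the two synopses above come from the summarizer this patch removes. Purely to illustrate the TextRank idea the tutorial names (a sentence-similarity graph ranked with PageRank), here is a self-contained toy sketch; it substitutes plain word-overlap similarity for the BM25 weighting of Barrios et al., and every name and the example text are invented for the demo.

import re
import numpy as np

def toy_textrank_summary(text, top_n=2, damping=0.85, iterations=50):
    # Split into sentences and tokenize very crudely.
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    words = [set(re.findall(r'\w+', s.lower())) for s in sentences]
    n = len(sentences)

    # Symmetric similarity matrix: normalized word overlap stands in for the
    # BM25-based sentence similarity used by the removed implementation.
    sim = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            overlap = len(words[i] & words[j])
            if overlap:
                sim[i, j] = sim[j, i] = overlap / (np.log(len(words[i]) + 1) + np.log(len(words[j]) + 1))

    # Weighted PageRank via power iteration over the row-normalized matrix.
    row_sums = sim.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0
    transition = sim / row_sums
    scores = np.full(n, 1.0 / n)
    for _ in range(iterations):
        scores = (1 - damping) / n + damping * transition.T.dot(scores)

    # Return the top-ranked sentences in their original order.
    best = sorted(np.argsort(scores)[-top_n:])
    return ' '.join(sentences[i] for i in best)

toy_text = (
    "Neo is a hacker who senses something is wrong with the world. "
    "Morpheus contacts Neo and offers to show him the truth. "
    "The truth is that machines imprison humanity inside the Matrix. "
    "Neo joins the rebellion against the machines."
)
print(toy_textrank_summary(toy_text, top_n=2))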
-# -# Performance -# ----------- -# -# We will test how the speed of the summarizer scales with the size of the -# dataset. These tests were run on an Intel Core i5 4210U CPU @ 1.70 GHz x 4 -# processor. Note that the summarizer does **not** support multithreading -# (parallel processing). -# -# The tests were run on the book "Honest Abe" by Alonzo Rothschild. Download -# the book in plain-text `here `__. -# -# In the **plot below** , we see the running times together with the sizes of -# the datasets. To create datasets of different sizes, we have simply taken -# prefixes of text; in other words we take the first **n** characters of the -# book. The algorithm seems to be **quadratic in time** , so one needs to be -# careful before plugging a large dataset into the summarizer. - -import matplotlib.pyplot as plt -import matplotlib.image as mpimg -img = mpimg.imread('summarization_tutorial_plot.png') -imgplot = plt.imshow(img) -plt.axis('off') -plt.show() - -############################################################################### -# Text-content dependent running times -# ------------------------------------ -# -# The running time is not only dependent on the size of the dataset. For -# example, summarizing "The Matrix" synopsis (about 36,000 characters) takes -# about 3.1 seconds, while summarizing 35,000 characters of this book takes -# about 8.5 seconds. So the former is **more than twice as fast**. -# -# One reason for this difference in running times is the data structure that is -# used. The algorithm represents the data using a graph, where vertices (nodes) -# are sentences, and then constructs weighted edges between the vertices that -# represent how the sentences relate to each other. This means that every piece -# of text will have a different graph, thus making the running times different. -# The size of this data structure is **quadratic in the worst case** (the worst -# case is when each vertex has an edge to every other vertex). -# -# Another possible reason for the difference in running times is that the -# problems converge at different rates, meaning that the error drops slower for -# some datasets than for others. -# -# Montemurro and Zanette's entropy based keyword extraction algorithm -# ------------------------------------------------------------------- -# -# `This paper `__ describes a technique to -# identify words that play a significant role in the large-scale structure of a -# text. These typically correspond to the major themes of the text. The text is -# divided into blocks of ~1000 words, and the entropy of each word's -# distribution amongst the blocks is caclulated and compared with the expected -# entropy if the word were distributed randomly. -# - - -import requests -from gensim.summarization import mz_keywords - -text=requests.get("http://www.gutenberg.org/files/49679/49679-0.txt").text -print(mz_keywords(text,scores=True,threshold=0.001)) - -############################################################################### -# By default, the algorithm weights the entropy by the overall frequency of the -# word in the document. 
We can remove this weighting by setting weighted=False -# -print(mz_keywords(text,scores=True,weighted=False,threshold=1.0)) - -############################################################################### -# When this option is used, it is possible to calculate a threshold -# automatically from the number of blocks -# -print(mz_keywords(text,scores=True,weighted=False,threshold="auto")) - -############################################################################### -# The complexity of the algorithm is **O**\ (\ *Nw*\ ), where *N* is the number -# of words in the document and *w* is the number of unique words. -# diff --git a/docs/src/summarization/bm25.rst b/docs/src/summarization/bm25.rst deleted file mode 100644 index 2889788ee4..0000000000 --- a/docs/src/summarization/bm25.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`summarization.bm25` -- BM25 ranking function -========================================================= - -.. automodule:: gensim.summarization.bm25 - :synopsis: BM25 ranking function - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/summarization/commons.rst b/docs/src/summarization/commons.rst deleted file mode 100644 index b131002dd0..0000000000 --- a/docs/src/summarization/commons.rst +++ /dev/null @@ -1,8 +0,0 @@ -:mod:`summarization.commons` -- Graph functions used in TextRank summarization -============================================================================== - -.. automodule:: gensim.summarization.commons - :synopsis: Common graph functions used in TextRank summarization - :members: - :inherited-members: - :undoc-members: diff --git a/docs/src/summarization/graph.rst b/docs/src/summarization/graph.rst deleted file mode 100644 index 29167cc377..0000000000 --- a/docs/src/summarization/graph.rst +++ /dev/null @@ -1,8 +0,0 @@ -:mod:`summarization.graph` -- Graph used in TextRank summarization -================================================================== - -.. automodule:: gensim.summarization.graph - :synopsis: Graph utilities used in the TextRank summarization algorithm - :members: - :inherited-members: - :undoc-members: diff --git a/docs/src/summarization/keywords.rst b/docs/src/summarization/keywords.rst deleted file mode 100644 index 041c5dd10b..0000000000 --- a/docs/src/summarization/keywords.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`summarization.keywords` -- Keywords for TextRank summarization algorithm -============================================================================== - -.. automodule:: gensim.summarization.keywords - :synopsis: Keywords for TextRank summarization algorithm - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/summarization/mz_entropy.rst b/docs/src/summarization/mz_entropy.rst deleted file mode 100644 index 31222ca6ab..0000000000 --- a/docs/src/summarization/mz_entropy.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`summarization.mz_entropy` -- Keywords for the Montemurro and Zanette entropy algorithm -============================================================================================ - -.. 
automodule:: gensim.summarization.mz_entropy - :synopsis: Keywords for the Montemurro and Zanette entropy algorithm - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/summarization/pagerank_weighted.rst b/docs/src/summarization/pagerank_weighted.rst deleted file mode 100644 index 0dd9638679..0000000000 --- a/docs/src/summarization/pagerank_weighted.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`summarization.pagerank_weighted` -- Weighted PageRank algorithm -===================================================================== - -.. automodule:: gensim.summarization.pagerank_weighted - :synopsis: Weighted PageRank algorithm - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/summarization/summariser.rst b/docs/src/summarization/summariser.rst deleted file mode 100644 index 15c0fa08f0..0000000000 --- a/docs/src/summarization/summariser.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`summarization.summarizer` -- TextRank Summarizer -====================================================== - -.. automodule:: gensim.summarization.summarizer - :synopsis: TextRank Summarizer - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/summarization/syntactic_unit.rst b/docs/src/summarization/syntactic_unit.rst deleted file mode 100644 index 5e20ec5a3e..0000000000 --- a/docs/src/summarization/syntactic_unit.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`summarization.syntactic_unit` -- Syntactic Unit class -=========================================================== - -.. automodule:: gensim.summarization.syntactic_unit - :synopsis: Syntactic Unit class - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/summarization/textcleaner.rst b/docs/src/summarization/textcleaner.rst deleted file mode 100644 index d667fd04f7..0000000000 --- a/docs/src/summarization/textcleaner.rst +++ /dev/null @@ -1,10 +0,0 @@ -:mod:`summarization.textcleaner` -- Preprocessing for TextRank summarization -============================================================================ - -.. 
automodule:: gensim.summarization.textcleaner - :synopsis: Preprocessing used in the TextRank summarization - :members: - :inherited-members: - :undoc-members: - :show-inheritance: - diff --git a/gensim/__init__.py b/gensim/__init__.py index e2ce0959df..3bc3e2c756 100644 --- a/gensim/__init__.py +++ b/gensim/__init__.py @@ -6,7 +6,7 @@ import logging -from gensim import parsing, corpora, matutils, interfaces, models, similarities, summarization, utils # noqa:F401 +from gensim import parsing, corpora, matutils, interfaces, models, similarities, utils # noqa:F401 __version__ = '4.0.0.dev0' diff --git a/gensim/summarization/__init__.py b/gensim/summarization/__init__.py deleted file mode 100644 index 2d1d959a29..0000000000 --- a/gensim/summarization/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ - -# bring model classes directly into package namespace, to save some typing -from .summarizer import summarize, summarize_corpus # noqa:F401 -from .keywords import keywords # noqa:F401 -from .mz_entropy import mz_keywords # noqa:F401 diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py deleted file mode 100644 index f3dc67c77a..0000000000 --- a/gensim/summarization/bm25.py +++ /dev/null @@ -1,375 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""This module contains function of computing rank scores for documents in -corpus and helper class `BM25` used in calculations. Original algorithm -descibed in [1]_, also you may check Wikipedia page [2]_. - - -.. [1] Robertson, Stephen; Zaragoza, Hugo (2009). The Probabilistic Relevance Framework: BM25 and Beyond, - http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf -.. [2] Okapi BM25 on Wikipedia, https://en.wikipedia.org/wiki/Okapi_BM25 - - -Examples --------- - -.. sourcecode:: pycon - - >>> from gensim.summarization.bm25 import get_bm25_weights - >>> corpus = [ - ... ["black", "cat", "white", "cat"], - ... ["cat", "outer", "space"], - ... ["wag", "dog"] - ... ] - >>> result = get_bm25_weights(corpus, n_jobs=-1) - -""" - -import logging -import math -from functools import partial -from multiprocessing import Pool - -from ..utils import effective_n_jobs - -PARAM_K1 = 1.5 -PARAM_B = 0.75 -EPSILON = 0.25 - -logger = logging.getLogger(__name__) - - -class BM25(): - """Implementation of the BM25 (Best Matching 25) ranking function. - - Attributes - ---------- - corpus_size : int - Size of corpus (number of documents). - avgdl : float - Average length of document in `corpus`. - doc_freqs : list of dicts of int - Dictionary with terms frequencies for each document in `corpus`. Words used as keys and frequencies as values. - idf : dict - Dictionary with inversed documents frequencies for whole `corpus`. Words used as keys and frequencies as values. - doc_len : list of int - List of document lengths. - - """ - - def __init__(self, corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON): - """ - Parameters - ---------- - corpus : list of list of str - Given corpus. - k1 : float - Constant used for influencing the term frequency saturation. After saturation is reached, additional - presence for the term adds a significantly less additional score. According to [1]_, experiments suggest - that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as - the type of documents or queries. - b : float - Constant used for influencing the effects of different document lengths relative to average document length. 
- When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to - [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value - depends on factors such as the type of documents or queries. - epsilon : float - Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts - negative idf values. Negative idf implies that adding a very common term to a document penalize the overall - score (with 'very common' meaning that it is present in more than half of the documents). That can be - undesirable as it means that an identical document would score less than an almost identical one (by - removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among - different documents) to receive an extra score. - - """ - - self.k1 = k1 - self.b = b - self.epsilon = epsilon - - self.corpus_size = 0 - self.avgdl = 0 - self.doc_freqs = [] - self.idf = {} - self.doc_len = [] - self._initialize(corpus) - - def _initialize(self, corpus): - """Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies.""" - nd = {} # word -> number of documents with word - num_doc = 0 - for document in corpus: - self.corpus_size += 1 - self.doc_len.append(len(document)) - num_doc += len(document) - - frequencies = {} - for word in document: - if word not in frequencies: - frequencies[word] = 0 - frequencies[word] += 1 - self.doc_freqs.append(frequencies) - - for word, freq in frequencies.items(): - if word not in nd: - nd[word] = 0 - nd[word] += 1 - - self.avgdl = float(num_doc) / self.corpus_size - # collect idf sum to calculate an average idf for epsilon value - idf_sum = 0 - # collect words with negative idf to set them a special epsilon value. - # idf can be negative if word is contained in more than half of documents - negative_idfs = [] - for word, freq in nd.items(): - idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5) - self.idf[word] = idf - idf_sum += idf - if idf < 0: - negative_idfs.append(word) - self.average_idf = float(idf_sum) / len(self.idf) - - if self.average_idf < 0: - logger.warning( - 'Average inverse document frequency is less than zero. Your corpus of {} documents' - ' is either too small or it does not originate from natural text. BM25 may produce' - ' unintuitive results.'.format(self.corpus_size) - ) - - eps = self.epsilon * self.average_idf - for word in negative_idfs: - self.idf[word] = eps - - def get_score(self, document, index): - """Computes BM25 score of given `document` in relation to item of corpus selected by `index`. - - Parameters - ---------- - document : list of str - Document to be scored. - index : int - Index of document in corpus selected to score with `document`. - - Returns - ------- - float - BM25 score. - - """ - score = 0.0 - doc_freqs = self.doc_freqs[index] - numerator_constant = self.k1 + 1 - denominator_constant = self.k1 * (1 - self.b + self.b * self.doc_len[index] / self.avgdl) - for word in document: - if word in doc_freqs: - df = self.doc_freqs[index][word] - idf = self.idf[word] - score += (idf * df * numerator_constant) / (df + denominator_constant) - return score - - def get_scores(self, document): - """Computes and returns BM25 scores of given `document` in relation to - every item in corpus. - - Parameters - ---------- - document : list of str - Document to be scored. - - Returns - ------- - list of float - BM25 scores. 
- - """ - scores = [self.get_score(document, index) for index in range(self.corpus_size)] - return scores - - def get_scores_bow(self, document): - """Computes and returns BM25 scores of given `document` in relation to - every item in corpus. - - Parameters - ---------- - document : list of str - Document to be scored. - - Returns - ------- - list of float - BM25 scores. - - """ - scores = [] - for index in range(self.corpus_size): - score = self.get_score(document, index) - if score > 0: - scores.append((index, score)) - return scores - - -def _get_scores_bow(bm25, document): - """Helper function for retrieving bm25 scores of given `document` in parallel - in relation to every item in corpus. - - Parameters - ---------- - bm25 : BM25 object - BM25 object fitted on the corpus where documents are retrieved. - document : list of str - Document to be scored. - - Returns - ------- - list of (index, float) - BM25 scores in a bag of weights format. - - """ - return bm25.get_scores_bow(document) - - -def _get_scores(bm25, document): - """Helper function for retrieving bm25 scores of given `document` in parallel - in relation to every item in corpus. - - Parameters - ---------- - bm25 : BM25 object - BM25 object fitted on the corpus where documents are retrieved. - document : list of str - Document to be scored. - - Returns - ------- - list of float - BM25 scores. - - """ - return bm25.get_scores(document) - - -def iter_bm25_bow(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON): - """Yield BM25 scores (weights) of documents in corpus. - Each document has to be weighted with every document in given corpus. - - Parameters - ---------- - corpus : list of list of str - Corpus of documents. - n_jobs : int - The number of processes to use for computing bm25. - k1 : float - Constant used for influencing the term frequency saturation. After saturation is reached, additional - presence for the term adds a significantly less additional score. According to [1]_, experiments suggest - that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as - the type of documents or queries. - b : float - Constant used for influencing the effects of different document lengths relative to average document length. - When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to - [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value - depends on factors such as the type of documents or queries. - epsilon : float - Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts - negative idf values. Negative idf implies that adding a very common term to a document penalize the overall - score (with 'very common' meaning that it is present in more than half of the documents). That can be - undesirable as it means that an identical document would score less than an almost identical one (by - removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among - different documents) to receive an extra score. - - Yields - ------- - list of (index, float) - BM25 scores in bag of weights format. - - Examples - -------- - .. sourcecode:: pycon - - >>> from gensim.summarization.bm25 import iter_bm25_weights - >>> corpus = [ - ... ["black", "cat", "white", "cat"], - ... ["cat", "outer", "space"], - ... ["wag", "dog"] - ... 
] - >>> result = iter_bm25_weights(corpus, n_jobs=-1) - - """ - bm25 = BM25(corpus, k1, b, epsilon) - - n_processes = effective_n_jobs(n_jobs) - if n_processes == 1: - for doc in corpus: - yield bm25.get_scores_bow(doc) - return - - get_score = partial(_get_scores_bow, bm25) - pool = Pool(n_processes) - - for bow in pool.imap(get_score, corpus): - yield bow - pool.close() - pool.join() - - -def get_bm25_weights(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON): - """Returns BM25 scores (weights) of documents in corpus. - Each document has to be weighted with every document in given corpus. - - Parameters - ---------- - corpus : list of list of str - Corpus of documents. - n_jobs : int - The number of processes to use for computing bm25. - k1 : float - Constant used for influencing the term frequency saturation. After saturation is reached, additional - presence for the term adds a significantly less additional score. According to [1]_, experiments suggest - that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as - the type of documents or queries. - b : float - Constant used for influencing the effects of different document lengths relative to average document length. - When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to - [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value - depends on factors such as the type of documents or queries. - epsilon : float - Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts - negative idf values. Negative idf implies that adding a very common term to a document penalize the overall - score (with 'very common' meaning that it is present in more than half of the documents). That can be - undesirable as it means that an identical document would score less than an almost identical one (by - removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among - different documents) to receive an extra score. - - Returns - ------- - list of list of float - BM25 scores. - - Examples - -------- - .. sourcecode:: pycon - - >>> from gensim.summarization.bm25 import get_bm25_weights - >>> corpus = [ - ... ["black", "cat", "white", "cat"], - ... ["cat", "outer", "space"], - ... ["wag", "dog"] - ... ] - >>> result = get_bm25_weights(corpus, n_jobs=-1) - - """ - bm25 = BM25(corpus, k1, b, epsilon) - - n_processes = effective_n_jobs(n_jobs) - if n_processes == 1: - weights = [bm25.get_scores(doc) for doc in corpus] - return weights - - get_score = partial(_get_scores, bm25) - pool = Pool(n_processes) - weights = pool.map(get_score, corpus) - pool.close() - pool.join() - return weights diff --git a/gensim/summarization/commons.py b/gensim/summarization/commons.py deleted file mode 100644 index cdb0693bcf..0000000000 --- a/gensim/summarization/commons.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""This module provides functions of creating graph from sequence of values and removing of unreachable nodes. - - -Examples --------- - -Create simple graph and add edges. Let's take a look at nodes. - -.. 
sourcecode:: pycon - - >>> gg = build_graph(['Felidae', 'Lion', 'Tiger', 'Wolf']) - >>> gg.add_edge(("Felidae", "Lion")) - >>> gg.add_edge(("Felidae", "Tiger")) - >>> sorted(gg.nodes()) - ['Felidae', 'Lion', 'Tiger', 'Wolf'] - -Remove nodes with no edges. - -.. sourcecode:: pycon - - >>> remove_unreachable_nodes(gg) - >>> sorted(gg.nodes()) - ['Felidae', 'Lion', 'Tiger'] - -""" - -from gensim.summarization.graph import Graph - - -def build_graph(sequence): - """Creates and returns undirected graph with given sequence of values. - - Parameters - ---------- - sequence : list of hashable - Sequence of values. - - Returns - ------- - :class:`~gensim.summarization.graph.Graph` - Created graph. - - """ - graph = Graph() - for item in sequence: - if not graph.has_node(item): - graph.add_node(item) - return graph - - -def remove_unreachable_nodes(graph): - """Removes unreachable nodes (nodes with no edges), inplace. - - Parameters - ---------- - graph : :class:`~gensim.summarization.graph.Graph` - Given graph. - - """ - - for node in graph.nodes(): - if all(graph.edge_weight((node, other)) == 0 for other in graph.neighbors(node)): - graph.del_node(node) diff --git a/gensim/summarization/graph.py b/gensim/summarization/graph.py deleted file mode 100644 index 9622ef7c7f..0000000000 --- a/gensim/summarization/graph.py +++ /dev/null @@ -1,401 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""This module contains abstract class IGraph represents graphs interface and -class Graph (based on IGraph) which implements undirected graph. - -Examples --------- - -Create simple graph with 4 nodes. - -.. sourcecode:: pycon - - >>> g = Graph() - >>> g.add_node('Felidae') - >>> g.add_node('Lion') - >>> g.add_node('Tiger') - >>> g.add_node('Wolf') - >>> sorted(g.nodes()) - ['Felidae', 'Lion', 'Tiger', 'Wolf'] - -Add some edges and check neighbours. - -.. sourcecode:: pycon - - >>> g.add_edge(("Felidae", "Lion")) - >>> g.add_edge(("Felidae", "Tiger")) - >>> g.neighbors("Felidae") - ['Lion', 'Tiger'] - -One node has no neighbours. - -.. sourcecode:: pycon - - >>> g.neighbors("Wolf") - [] - -""" - -from abc import ABCMeta, abstractmethod - - -class IGraph(object): - """Represents the interface or contract that the graph for TextRank - should implement. - """ - __metaclass__ = ABCMeta - - @abstractmethod - def __len__(self): - """Returns number of nodes in graph""" - pass - - @abstractmethod - def nodes(self): - """Returns all nodes of graph. - - Returns - ------- - list of hashable - Nodes of graph. - - """ - pass - - @abstractmethod - def edges(self): - """Returns all edges of graph. - - Returns - ------- - list of (hashable, hashable) - Edges of graph. - - """ - pass - - @abstractmethod - def neighbors(self, node): - """Return all nodes that are directly accessible from given node. - - Parameters - ---------- - node : hashable - Given node identifier. - - Returns - ------- - list of hashable - Nodes directly accessible from given `node`. - - """ - pass - - @abstractmethod - def has_node(self, node): - """Returns whether the requested node exists. - - Parameters - ---------- - node : hashable - Given node identifier. - - Returns - ------- - bool - True if `node` exists, False otherwise. - - """ - pass - - @abstractmethod - def add_node(self, node): - """Adds given node to the graph. 
- - Note - ---- - While nodes can be of any type, it's strongly recommended to use only numbers and single-line strings - as node identifiers if you intend to use write(). - - Parameters - ---------- - node : hashable - Given node - - """ - pass - - @abstractmethod - def add_edge(self, edge, wt=1): - """Adds an edge to the graph connecting two nodes. An edge, here, - is a tuple of two nodes. - - Parameters - ---------- - edge : (hashable, hashable) - Given edge. - wt : float, optional - Weight of new edge. - - """ - pass - - @abstractmethod - def has_edge(self, edge): - """Returns whether an edge exists. - - Parameters - ---------- - edge : (hashable, hashable) - Given edge. - - Returns - ------- - bool - True if `edge` exists, False otherwise. - - """ - pass - - @abstractmethod - def edge_weight(self, edge): - """Returns weigth of given edge. - - Parameters - ---------- - edge : (hashable, hashable) - Given edge. - - Returns - ------- - float - Edge weight. - - """ - pass - - @abstractmethod - def del_node(self, node): - """Removes node and its edges from the graph. - - Parameters - ---------- - node : hashable - Node to delete. - - """ - pass - - -class Graph(IGraph): - """ - Implementation of an undirected graph, based on IGraph. - - Attributes - ---------- - Graph.DEFAULT_WEIGHT : float - Weight set by default. - - """ - - DEFAULT_WEIGHT = 0 - - def __init__(self): - """Initializes object.""" - # Pairing and metadata about edges - # Mapping: Node-> - # Dict mapping of Neighbor -> weight - self.node_neighbors = {} - - def __len__(self): - """Returns number of nodes in graph""" - return len(self.node_neighbors) - - def has_edge(self, edge): - """Returns whether an edge exists. - - Parameters - ---------- - edge : (hashable, hashable) - Given edge. - - Returns - ------- - bool - True if `edge` exists, False otherwise. - - """ - u, v = edge - return (u in self.node_neighbors - and v in self.node_neighbors - and v in self.node_neighbors[u] - and u in self.node_neighbors[v]) - - def edge_weight(self, edge): - """Returns weight of given edge. - - Parameters - ---------- - edge : (hashable, hashable) - Given edge. - - Returns - ------- - float - Edge weight. - - """ - u, v = edge - return self.node_neighbors.get(u, {}).get(v, self.DEFAULT_WEIGHT) - - def neighbors(self, node): - """Returns all nodes that are directly accessible from given node. - - Parameters - ---------- - node : hashable - Given node identifier. - - Returns - ------- - list of hashable - Nodes directly accessible from given `node`. - - """ - return list(self.node_neighbors[node]) - - def has_node(self, node): - """Returns whether the requested node exists. - - Parameters - ---------- - node : hashable - Given node. - - Returns - ------- - bool - True if `node` exists, False otherwise. - - """ - return node in self.node_neighbors - - def add_edge(self, edge, wt=1): - """Adds an edge to the graph connecting two nodes. - - Parameters - ---------- - edge : (hashable, hashable) - Given edge. - wt : float, optional - Weight of new edge. - - Raises - ------ - ValueError - If `edge` already exists in graph. 
- - """ - if wt == 0.0: - # empty edge is similar to no edge at all or removing it - if self.has_edge(edge): - self.del_edge(edge) - return - u, v = edge - if v not in self.node_neighbors[u] and u not in self.node_neighbors[v]: - self.node_neighbors[u][v] = wt - if u != v: - self.node_neighbors[v][u] = wt - else: - raise ValueError("Edge (%s, %s) already in graph" % (u, v)) - - def add_node(self, node): - """Adds given node to the graph. - - Note - ---- - While nodes can be of any type, it's strongly recommended - to use only numbers and single-line strings as node identifiers if you - intend to use write(). - - Parameters - ---------- - node : hashable - Given node. - - Raises - ------ - ValueError - If `node` already exists in graph. - - """ - if node in self.node_neighbors: - raise ValueError("Node %s already in graph" % node) - - self.node_neighbors[node] = {} - - def nodes(self): - """Returns all nodes of the graph. - - Returns - ------- - list of hashable - Nodes of graph. - - """ - return list(self.node_neighbors) - - def edges(self): - """Returns all edges of the graph. - - Returns - ------- - list of (hashable, hashable) - Edges of graph. - - """ - return list(self.iter_edges()) - - def iter_edges(self): - """Returns iterator of all edges of the graph. - - Yields - ------- - (hashable, hashable) - Edges of graph. - - """ - for u in self.node_neighbors: - for v in self.node_neighbors[u]: - yield (u, v) - - def del_node(self, node): - """Removes given node and its edges from the graph. - - Parameters - ---------- - node : hashable - Given node. - - """ - for each in self.neighbors(node): - if each != node: - self.del_edge((each, node)) - del self.node_neighbors[node] - - def del_edge(self, edge): - """Removes given edges from the graph. - - Parameters - ---------- - edge : (hashable, hashable) - Given edge. - - """ - u, v = edge - del self.node_neighbors[u][v] - if u != v: - del self.node_neighbors[v][u] diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py deleted file mode 100644 index d622480196..0000000000 --- a/gensim/summarization/keywords.py +++ /dev/null @@ -1,547 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""This module contains functions to find keywords within a text. - -Examples --------- - -.. sourcecode:: pycon - - >>> from gensim.summarization import keywords - >>> text = '''Challenges in natural language processing frequently involve - ... speech recognition, natural language understanding, natural language - ... generation (frequently from formal, machine-readable logical forms), - ... connecting language and machine perception, dialog systems, or some - ... combination thereof.''' - >>> keywords(text).split('\\n') - [u'natural language', u'machine', u'frequently'] - -""" - -from itertools import combinations as _combinations -from queue import Queue - -from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank -from gensim.summarization.textcleaner import clean_text_by_word as _clean_text_by_word -from gensim.summarization.textcleaner import tokenize_by_word as _tokenize_by_word -from gensim.summarization.commons import build_graph as _build_graph -from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes -from gensim.utils import to_unicode - - -# Number of consecutive tokens in processing. 
-WINDOW_SIZE = 2 - -# POS tags from http://www.clips.ua.ac.be/pages/mbsp-tags -# Use only the first two letters here. -INCLUDING_FILTER = ['NN', 'JJ'] -EXCLUDING_FILTER = [] - - -def _get_pos_filters(): - """Get default including and excluding filters as frozen sets. - - Returns - ------- - (frozenset of str, frozenset of str) - Including and excluding filters. - - """ - return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER) - - -def _get_words_for_graph(tokens, pos_filter=None): - """Filters given dictionary of tokens using provided part of speech filters. - - Parameters - ---------- - tokens : dict - Original units (words) as keys and processed units (tokens) as values. - pos_filter : iterable - Part of speech filters, optional. If `None` - using :func:`_get_pos_filters`. - - Returns - ------- - list of str - Filtered tokens. - - Raises - ------ - ValueError - If include and exclude filters ar not empty at the same time. - - """ - if pos_filter is None: - include_filters, exclude_filters = _get_pos_filters() - else: - include_filters = set(pos_filter) - exclude_filters = frozenset([]) - if include_filters and exclude_filters: - raise ValueError("Can't use both include and exclude filters, should use only one") - - result = [] - for word, unit in tokens.items(): - if exclude_filters and unit.tag in exclude_filters: - continue - if not include_filters or not unit.tag or unit.tag in include_filters: - result.append(unit.token) - return result - - -def _get_first_window(split_text): - """Get first :const:`~gensim.parsing.keywords.WINDOW_SIZE` tokens from given `split_text`. - - Parameters - ---------- - split_text : list of str - Splitted text. - - Returns - ------- - list of str - First :const:`~gensim.parsing.keywords.WINDOW_SIZE` tokens. - - """ - return split_text[:WINDOW_SIZE] - - -def _set_graph_edge(graph, tokens, word_a, word_b): - """Sets an edge between nodes named word_a and word_b if they exists in `tokens` and `graph`, inplace. - - Parameters - ---------- - graph : :class:~gensim.summarization.graph.Graph - Given graph. - tokens : dict - Original units (words) as keys and processed units (tokens) as values. - word_a : str - First word, name of first node. - word_b : str - Second word, name of second node. - - """ - if word_a in tokens and word_b in tokens: - lemma_a = tokens[word_a].token - lemma_b = tokens[word_b].token - edge = (lemma_a, lemma_b) - - if graph.has_node(lemma_a) and graph.has_node(lemma_b) and not graph.has_edge(edge): - graph.add_edge(edge) - - -def _process_first_window(graph, tokens, split_text): - """Sets an edges between nodes taken from first :const:`~gensim.parsing.keywords.WINDOW_SIZE` - words of `split_text` if they exist in `tokens` and `graph`, inplace. - - Parameters - ---------- - graph : :class:~gensim.summarization.graph.Graph - Given graph. - tokens : dict - Original units (words) as keys and processed units (tokens) as values. - split_text : list of str - Splitted text. - - """ - first_window = _get_first_window(split_text) - for word_a, word_b in _combinations(first_window, 2): - _set_graph_edge(graph, tokens, word_a, word_b) - - -def _init_queue(split_text): - """Initialize queue by first words from `split_text`. - - Parameters - ---------- - split_text : list of str - Splitted text. - - Returns - ------- - Queue - Initialized queue. 
- - """ - queue = Queue() - first_window = _get_first_window(split_text) - for word in first_window[1:]: - queue.put(word) - return queue - - -def _process_word(graph, tokens, queue, word): - """Sets edge between `word` and each element in queue in `graph` if such nodes - exist in `tokens` and `graph`. - - Parameters - ---------- - graph : :class:`~gensim.summarization.graph.Graph` - Given graph. - tokens : dict - Original units (words) as keys and processed units (tokens) as values. - queue : Queue - Given queue. - word : str - Word, possible `node` in graph and item in `tokens`. - - """ - for word_to_compare in _queue_iterator(queue): - _set_graph_edge(graph, tokens, word, word_to_compare) - - -def _update_queue(queue, word): - """Updates given `queue` (removes last item and puts `word`). - - Parameters - ---------- - queue : Queue - Given queue. - word : str - Word to be added to queue. - - """ - queue.get() - queue.put(word) - assert queue.qsize() == (WINDOW_SIZE - 1) - - -def _process_text(graph, tokens, split_text): - """Process `split_text` by updating given `graph` with new eges between nodes - if they exists in `tokens` and `graph`. - Words are taken from `split_text` with window size :const:`~gensim.parsing.keywords.WINDOW_SIZE`. - - Parameters - ---------- - graph : :class:`~gensim.summarization.graph.Graph` - Given graph. - tokens : dict - Original units (words) as keys and processed units (tokens) as values. - split_text : list of str - Splitted text. - - """ - queue = _init_queue(split_text) - for i in range(WINDOW_SIZE, len(split_text)): - word = split_text[i] - _process_word(graph, tokens, queue, word) - _update_queue(queue, word) - - -def _queue_iterator(queue): - """Represents iterator of the given queue. - - Parameters - ---------- - queue : Queue - Given queue. - - Yields - ------ - str - Current item of queue. - - """ - iterations = queue.qsize() - for _ in range(iterations): - var = queue.get() - yield var - queue.put(var) - - -def _set_graph_edges(graph, tokens, split_text): - """Updates given `graph` by setting eges between nodes if they exists in `tokens` and `graph`. - Words are taken from `split_text` with window size :const:`~gensim.parsing.keywords.WINDOW_SIZE`. - - Parameters - ---------- - graph : :class:~gensim.summarization.graph.Graph - Given graph. - tokens : dict - Original units (words) as keys and processed units (tokens) as values. - split_text : list of str - Splitted text. - - """ - _process_first_window(graph, tokens, split_text) - _process_text(graph, tokens, split_text) - - -def _extract_tokens(lemmas, scores, ratio, words): - """Extracts tokens from provided lemmas. Most scored lemmas are used if `words` not provided. - - Parameters - ---------- - lemmas : list of str - Given lemmas. - scores : dict - Dictionary with lemmas and its scores. - ratio : float - Proportion of lemmas used for final result. - words : int - Number of used words. If no "words" option is selected, the number of - sentences is reduced by the provided ratio, else, the ratio is ignored. - - Returns - ------- - list of (float, str) - Scores and corresponded lemmas. - - """ - lemmas.sort(key=lambda s: scores[s], reverse=True) - length = len(lemmas) * ratio if words is None else min(words, len(lemmas)) - return [(scores[lemmas[i]], lemmas[i],) for i in range(int(length))] - - -def _lemmas_to_words(tokens): - """Get words and lemmas from given tokens. Produces "reversed" `tokens`. 
- - Parameters - ---------- - tokens : dict - Original units (words) as keys and processed units (tokens) as values. - - Returns - ------- - dict - Lemmas as keys and lists corresponding words as values. - - """ - lemma_to_word = {} - for word, unit in tokens.items(): - lemma = unit.token - if lemma in lemma_to_word: - lemma_to_word[lemma].append(word) - else: - lemma_to_word[lemma] = [word] - return lemma_to_word - - -def _get_keywords_with_score(extracted_lemmas, lemma_to_word): - """Get words of `extracted_lemmas` and its scores, words contains in `lemma_to_word`. - - Parameters - ---------- - extracted_lemmas : list of (float, str) - Given lemmas with scores - lemma_to_word : dict - Lemmas and corresponding words. - - Returns - ------- - dict - Keywords as keys and its scores as values. - - """ - - keywords = {} - for score, lemma in extracted_lemmas: - keyword_list = lemma_to_word[lemma] - for keyword in keyword_list: - keywords[keyword] = score - return keywords - - -def _strip_word(word): - """Get cleaned `word`. - - Parameters - ---------- - word : str - Given word. - - Returns - ------- - str - Cleaned word. - """ - stripped_word_list = list(_tokenize_by_word(word)) - return stripped_word_list[0] if stripped_word_list else "" - - -def _get_combined_keywords(_keywords, split_text): - """Get most scored words (`_keywords`) contained in `split_text` and it's combinations. - - Parameters - ---------- - _keywords : dict - Keywords as keys and its scores as values. - split_text : list of str - Splitted text. - - Returns - ------- - list of str - Keywords and/or its combinations. - - """ - result = [] - _keywords = _keywords.copy() - len_text = len(split_text) - for i in range(len_text): - word = _strip_word(split_text[i]) - if word in _keywords: - combined_word = [word] - if i + 1 == len_text: - result.append(word) # appends last word if keyword and doesn't iterate - for j in range(i + 1, len_text): - other_word = _strip_word(split_text[j]) - if other_word in _keywords and other_word == split_text[j] and other_word not in combined_word: - combined_word.append(other_word) - else: - for keyword in combined_word: - _keywords.pop(keyword) - result.append(" ".join(combined_word)) - break - return result - - -def _get_average_score(concept, _keywords): - """Get average score of words in `concept`. - - Parameters - ---------- - concept : str - Input text. - _keywords : dict - Keywords as keys and its scores as values. - - Returns - ------- - float - Average score. - - """ - word_list = concept.split() - word_counter = len(word_list) - total = float(sum(_keywords[word] for word in word_list)) - return total / word_counter - - -def _format_results(keywords, combined_keywords, split, scores): - """Format, sort and return `combined_keywords`. - - Parameters - ---------- - keywords : dict - Keywords as keys and its scores as values. - combined_keywords : list of str - Most ranked words and/or its combinations. - split : bool - Split result if True or return string otherwise, optional. - scores : bool - Whether return `combined_keywords` with scores, optional. If True - `split` is ignored. - - Returns - ------- - result: list of (str, float) - If `scores`, keywords with scores **OR** - result: list of str - If `split`, keywords only **OR** - result: str - Keywords, joined by newline character. 
- - """ - combined_keywords.sort(key=lambda w: _get_average_score(w, keywords), reverse=True) - if scores: - return [(word, _get_average_score(word, keywords)) for word in combined_keywords] - if split: - return combined_keywords - return "\n".join(combined_keywords) - - -def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), - lemmatize=False, deacc=True): - """Get the most ranked words of provided text and/or its combinations. - - Parameters - ---------- - - text : str - Input text. - ratio : float, optional - If no "words" option is selected, the number of sentences is reduced by the provided ratio, - else, the ratio is ignored. - words : int, optional - Number of returned words. - split : bool, optional - Whether split keywords if True. - scores : bool, optional - Whether score of keyword. - pos_filter : tuple, optional - Part of speech filters. - lemmatize : bool, optional - If True - lemmatize words. - deacc : bool, optional - If True - remove accentuation. - - Returns - ------- - result: list of (str, float) - If `scores`, keywords with scores **OR** - result: list of str - If `split`, keywords only **OR** - result: str - Keywords, joined by endl. - - """ - # Gets a dict of word -> lemma - text = to_unicode(text) - tokens = _clean_text_by_word(text, deacc=deacc) - split_text = list(_tokenize_by_word(text)) - - # Creates the graph and adds the edges - graph = _build_graph(_get_words_for_graph(tokens, pos_filter)) - _set_graph_edges(graph, tokens, split_text) - del split_text # It's no longer used - - _remove_unreachable_nodes(graph) - - if not any(True for _ in graph.iter_edges()): - return _format_results([], [], split, scores) - - # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score - pagerank_scores = _pagerank(graph) - - extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words) - - # The results can be polluted by many variations of the same word - if lemmatize: - lemmas_to_word = {} - for word, unit in tokens.items(): - lemmas_to_word[unit.token] = [word] - else: - lemmas_to_word = _lemmas_to_words(tokens) - - keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word) - - # text.split() to keep numbers and punctuation marks, so separeted concepts are not combined - combined_keywords = _get_combined_keywords(keywords, text.split()) - - return _format_results(keywords, combined_keywords, split, scores) - - -def get_graph(text): - """Creates and returns graph from given text, cleans and tokenize text before building graph. - - Parameters - ---------- - text : str - Sequence of values. - - Returns - ------- - :class:`~gensim.summarization.graph.Graph` - Created graph. 
- - """ - tokens = _clean_text_by_word(text) - split_text = list(_tokenize_by_word(text)) - - graph = _build_graph(_get_words_for_graph(tokens)) - _set_graph_edges(graph, tokens, split_text) - - return graph diff --git a/gensim/summarization/mz_entropy.py b/gensim/summarization/mz_entropy.py deleted file mode 100644 index 75d667e906..0000000000 --- a/gensim/summarization/mz_entropy.py +++ /dev/null @@ -1,151 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -from gensim.summarization.textcleaner import tokenize_by_word as _tokenize_by_word -from gensim.utils import to_unicode -import numpy as np -import scipy - - -def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, threshold=0.0): - """Extract keywords from text using the Montemurro and Zanette entropy algorithm. [1]_ - - Parameters - ---------- - text: str - Document for summarization. - blocksize: int, optional - Size of blocks to use in analysis. - scores: bool, optional - Whether to return score with keywords. - split: bool, optional - Whether to return results as list. - weighted: bool, optional - Whether to weight scores by word frequency. - False can useful for shorter texts, and allows automatic thresholding. - threshold: float or 'auto', optional - Minimum score for returned keywords, 'auto' calculates the threshold as n_blocks / (n_blocks + 1.0) + 1e-8, - use 'auto' with `weighted=False`. - - Returns - ------- - results: str - newline separated keywords if `split` == False **OR** - results: list(str) - list of keywords if `scores` == False **OR** - results: list(tuple(str, float)) - list of (keyword, score) tuples if `scores` == True - - Results are returned in descending order of score regardless of the format. - - Note - ---- - This algorithm looks for keywords that contribute to the structure of the - text on scales of `blocksize` words of larger. It is suitable for extracting - keywords representing the major themes of long texts. - - References - ---------- - .. [1] Marcello A Montemurro, Damian Zanette, "Towards the quantification of the semantic information encoded in - written language". Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153, - DOI: 10.1142/S0219525910002530, https://arxiv.org/abs/0907.1558 - - """ - text = to_unicode(text) - words = [word for word in _tokenize_by_word(text)] - vocab = sorted(set(words)) - word_counts = count_freqs_by_blocks(words, vocab, blocksize) - n_blocks = word_counts.shape[0] - totals = word_counts.sum(axis=0) - n_words = totals.sum() - p = word_counts / totals - log_p = np.log2(p) - h = np.nan_to_num(p * log_p).sum(axis=0) - analytic = __analytic_entropy(blocksize, n_blocks, n_words) - h += analytic(totals).astype('d', copy=False) - if weighted: - h *= totals / n_words - if threshold == 'auto': - threshold = n_blocks / (n_blocks + 1.0) + 1.0e-8 - weights = [(word, score) for (word, score) in zip(vocab, h) if score > threshold] - weights.sort(key=lambda x: -x[1]) - result = weights if scores else [word for (word, score) in weights] - if not (scores or split): - result = '\n'.join(result) - return result - - -def count_freqs_by_blocks(words, vocab, blocksize): - """Count word frequencies in chunks - - Parameters - ---------- - words: list(str) - List of all words. - vocab: list(str) - List of words in vocabulary. - blocksize: int - Size of blocks to use for count. 
- - Returns - ------- - results: numpy.array(list(double)) - Array of list of word frequencies in one chunk. - The order of word frequencies is the same as words in vocab. - """ - word2ind = {word: i for i, word in enumerate(vocab)} - - word_counts = [] - for i in range(0, len(words), blocksize): - counts = [0] * len(vocab) - for word in words[i: i + blocksize]: - counts[word2ind[word]] += 1 - word_counts.append(counts) - return np.array(word_counts, dtype=np.double) - - -def __log_combinations_inner(n, m): - """Calculates the logarithm of n!/m!(n-m)!""" - return -(np.log(n + 1) + scipy.special.betaln(n - m + 1, m + 1)) - - -__log_combinations = np.frompyfunc(__log_combinations_inner, 2, 1) - - -def __marginal_prob(blocksize, n_words): - - def marginal_prob(n, m): - """Marginal probability of a word that occurs n times in the document - occurring m times in a given block""" - - return np.exp( - __log_combinations(n, m) - + __log_combinations(n_words - n, blocksize - m) - - __log_combinations(n_words, blocksize) - ) - - return np.frompyfunc(marginal_prob, 2, 1) - - -def __analytic_entropy(blocksize, n_blocks, n_words): - marginal = __marginal_prob(blocksize, n_words) - cache = {1: 0.0} # special case - - def analytic_entropy(n): - """Predicted entropy for a word that occurs n times in the document""" - n = int(n) - if n in cache: - return cache[n] - m = np.arange(1, min(blocksize, n) + 1, dtype=np.double) - p = m / n - # m >= 1, so p > 0 and np.log2(p) != nan - elements = (p * np.log2(p)) * marginal(n, m) - result = -n_blocks * elements.sum() - - cache[n] = result - return result - - return np.frompyfunc(analytic_entropy, 1, 1) diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py deleted file mode 100644 index 50562eb032..0000000000 --- a/gensim/summarization/pagerank_weighted.py +++ /dev/null @@ -1,190 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""This module calculate PageRank [1]_ based on wordgraph. - - -.. [1] https://en.wikipedia.org/wiki/PageRank - -Examples --------- - -Calculate Pagerank for words - -.. sourcecode:: pycon - - >>> from gensim.summarization.keywords import get_graph - >>> from gensim.summarization.pagerank_weighted import pagerank_weighted - >>> graph = get_graph("The road to hell is paved with good intentions.") - >>> # result will looks like {'good': 0.70432858653171504, 'hell': 0.051128871128006126, ...} - >>> result = pagerank_weighted(graph) - -Build matrix from graph - -.. sourcecode:: pycon - - >>> from gensim.summarization.pagerank_weighted import build_adjacency_matrix - >>> build_adjacency_matrix(graph).todense() - matrix([[ 0., 0., 0., 0., 0.], - [ 0., 0., 1., 0., 0.], - [ 0., 1., 0., 0., 0.], - [ 0., 0., 0., 0., 0.], - [ 0., 0., 0., 0., 0.]]) - -""" - - -import numpy -from numpy import empty as empty_matrix -from scipy.linalg import eig -from scipy.sparse import csr_matrix -from scipy.sparse.linalg import eigs -from six.moves import range - -from gensim.utils import deprecated - - -def pagerank_weighted(graph, damping=0.85): - """Get dictionary of `graph` nodes and its ranks. - - Parameters - ---------- - graph : :class:`~gensim.summarization.graph.Graph` - Given graph. - damping : float - Damping parameter, optional - - Returns - ------- - dict - Nodes of `graph` as keys, its ranks as values. 
- - """ - coeff_adjacency_matrix = build_adjacency_matrix(graph, coeff=damping) - probabilities = (1 - damping) / float(len(graph)) - - pagerank_matrix = coeff_adjacency_matrix.toarray() - # trying to minimize memory allocations - pagerank_matrix += probabilities - - vec = principal_eigenvector(pagerank_matrix.T) - - # Because pagerank_matrix is positive, vec is always real (i.e. not complex) - return process_results(graph, vec.real) - - -def build_adjacency_matrix(graph, coeff=1): - """Get matrix representation of given `graph`. - - Parameters - ---------- - graph : :class:`~gensim.summarization.graph.Graph` - Given graph. - coeff : float - Matrix values coefficient, optonal. - - Returns - ------- - :class:`scipy.sparse.csr_matrix`, shape = [n, n] - Adjacency matrix of given `graph`, n is number of nodes. - - """ - row = [] - col = [] - data = [] - nodes = graph.nodes() - nodes2id = {v: i for i, v in enumerate(nodes)} - length = len(nodes) - - for i in range(length): - current_node = nodes[i] - neighbors = graph.neighbors(current_node) - neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in neighbors) - for neighbor in neighbors: - edge_weight = float(graph.edge_weight((current_node, neighbor))) - if edge_weight != 0.0: - row.append(i) - col.append(nodes2id[neighbor]) - data.append(coeff * edge_weight / neighbors_sum) - - return csr_matrix((data, (row, col)), shape=(length, length)) - - -@deprecated("Function will be removed in 4.0.0") -def build_probability_matrix(graph, coeff=1.0): - """Get square matrix of shape (n, n), where n is number of nodes of the - given `graph`. - - Parameters - ---------- - graph : :class:`~gensim.summarization.graph.Graph` - Given graph. - coeff : float - Matrix values coefficient, optonal. - - Returns - ------- - numpy.ndarray, shape = [n, n] - Eigenvector of matrix `a`, n is number of nodes of `graph`. - - """ - dimension = len(graph) - matrix = empty_matrix((dimension, dimension)) - - probability = coeff / float(dimension) - matrix.fill(probability) - - return matrix - - -def principal_eigenvector(a): - """Get eigenvector of square matrix `a`. - - Parameters - ---------- - a : numpy.ndarray, shape = [n, n] - Given matrix. - - Returns - ------- - numpy.ndarray, shape = [n, ] - Eigenvector of matrix `a`. - - """ - # Note that we prefer to use `eigs` even for dense matrix - # because we need only one eigenvector. See #441, #438 for discussion. - - # But it doesn't work for dim A < 3, so we just handle this special case - if len(a) < 3: - vals, vecs = eig(a) - ind = numpy.abs(vals).argmax() - return vecs[:, ind] - else: - vals, vecs = eigs(a, k=1) - return vecs[:, 0] - - -def process_results(graph, vec): - """Get `graph` nodes and corresponding absolute values of provided eigenvector. - This function is helper for :func:`~gensim.summarization.pagerank_weighted.pagerank_weighted` - - Parameters - ---------- - graph : :class:`~gensim.summarization.graph.Graph` - Given graph. - vec : numpy.ndarray, shape = [n, ] - Given eigenvector, n is number of nodes of `graph`. - - Returns - ------- - dict - Graph nodes as keys, corresponding elements of eigenvector as values. 
- - """ - scores = {} - for i, node in enumerate(graph.nodes()): - scores[node] = abs(vec[i]) - - return scores diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py deleted file mode 100644 index d0a905dd5d..0000000000 --- a/gensim/summarization/summarizer.py +++ /dev/null @@ -1,443 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""This module provides functions for summarizing texts. Summarizing is based on -ranks of text sentences using a variation of the TextRank algorithm [1]_. - -.. [1] Federico Barrios, Federico L´opez, Luis Argerich, Rosita Wachenchauzer (2016). - Variations of the Similarity Function of TextRank for Automated Summarization, - https://arxiv.org/abs/1602.03606 - -Example -------- - -.. sourcecode:: pycon - - >>> from gensim.summarization.summarizer import summarize - >>> text = '''Rice Pudding - Poem by Alan Alexander Milne - ... What is the matter with Mary Jane? - ... She's crying with all her might and main, - ... And she won't eat her dinner - rice pudding again - - ... What is the matter with Mary Jane? - ... What is the matter with Mary Jane? - ... I've promised her dolls and a daisy-chain, - ... And a book about animals - all in vain - - ... What is the matter with Mary Jane? - ... What is the matter with Mary Jane? - ... She's perfectly well, and she hasn't a pain; - ... But, look at her, now she's beginning again! - - ... What is the matter with Mary Jane? - ... What is the matter with Mary Jane? - ... I've promised her sweets and a ride in the train, - ... And I've begged her to stop for a bit and explain - - ... What is the matter with Mary Jane? - ... What is the matter with Mary Jane? - ... She's perfectly well and she hasn't a pain, - ... And it's lovely rice pudding for dinner again! - ... What is the matter with Mary Jane?''' - >>> print(summarize(text)) - And she won't eat her dinner - rice pudding again - - I've promised her dolls and a daisy-chain, - I've promised her sweets and a ride in the train, - And it's lovely rice pudding for dinner again! - -""" - -import logging -from math import log10 as _log10 - -from gensim.utils import deprecated -from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank -from gensim.summarization.textcleaner import clean_text_by_sentences as _clean_text_by_sentences -from gensim.summarization.commons import build_graph as _build_graph -from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes -from gensim.summarization.bm25 import iter_bm25_bow as _bm25_weights -from gensim.corpora import Dictionary - -# Minimum number of sentences in a text. Smaller texts will log a warning. -INPUT_MIN_LENGTH = 10 - -# Minimal weight of edge between graph nodes. Smaller weights set to zero. -WEIGHT_THRESHOLD = 1.e-3 - -logger = logging.getLogger(__name__) - - -def _set_graph_edge_weights(graph): - """Sets weights using BM25 algorithm. Leaves small weights as zeroes. If all weights are fairly small, - forces all weights to 1, inplace. - - Parameters - ---------- - graph : :class:`~gensim.summarization.graph.Graph` - Given graph. 
- - """ - documents = graph.nodes() - weights = _bm25_weights(documents) - - for i, doc_bow in enumerate(weights): - if i % 1000 == 0 and i > 0: - logger.info('PROGRESS: processing %s/%s doc (%s non zero elements)', i, len(documents), len(doc_bow)) - - for j, weight in doc_bow: - if i == j or weight < WEIGHT_THRESHOLD: - continue - - edge = (documents[i], documents[j]) - - if not graph.has_edge(edge): - graph.add_edge(edge, weight) - - # Handles the case in which all similarities are zero. - # The resultant summary will consist of random sentences. - if all(graph.edge_weight(edge) == 0 for edge in graph.iter_edges()): - _create_valid_graph(graph) - - -def _create_valid_graph(graph): - """Sets all weights of edges for different edges as 1, inplace. - - Parameters - ---------- - graph : :class:`~gensim.summarization.graph.Graph` - Given graph. - - """ - nodes = graph.nodes() - - for i in range(len(nodes)): - for j in range(len(nodes)): - if i == j: - continue - - edge = (nodes[i], nodes[j]) - - if graph.has_edge(edge): - graph.del_edge(edge) - - graph.add_edge(edge, 1) - - -@deprecated("Function will be removed in 4.0.0") -def _get_doc_length(doc): - """Get length of (tokenized) document. - - Parameters - ---------- - doc : list of (list of (tuple of int)) - Given document. - - Returns - ------- - int - Length of document. - - """ - return sum(item[1] for item in doc) - - -@deprecated("Function will be removed in 4.0.0") -def _get_similarity(doc1, doc2, vec1, vec2): - """Returns similarity of two documents. - - Parameters - ---------- - doc1 : list of (list of (tuple of int)) - First document. - doc2 : list of (list of (tuple of int)) - Second document. - vec1 : array - ? of first document. - vec1 : array - ? of secont document. - - Returns - ------- - float - Similarity of two documents. - - """ - numerator = vec1.dot(vec2.transpose()).toarray()[0][0] - length_1 = _get_doc_length(doc1) - length_2 = _get_doc_length(doc2) - - denominator = _log10(length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0 - - return numerator / denominator if denominator != 0 else 0 - - -def _build_corpus(sentences): - """Construct corpus from provided sentences. - - Parameters - ---------- - sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` - Given sentences. - - Returns - ------- - list of list of (int, int) - Corpus built from sentences. - - """ - split_tokens = [sentence.token.split() for sentence in sentences] - dictionary = Dictionary(split_tokens) - return [dictionary.doc2bow(token) for token in split_tokens] - - -def _get_important_sentences(sentences, corpus, important_docs): - """Get most important sentences. - - Parameters - ---------- - sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` - Given sentences. - corpus : list of list of (int, int) - Provided corpus. - important_docs : list of list of (int, int) - Most important documents of the corpus. - - Returns - ------- - list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` - Most important sentences. - - """ - hashable_corpus = _build_hasheable_corpus(corpus) - sentences_by_corpus = dict(zip(hashable_corpus, sentences)) - return [sentences_by_corpus[tuple(important_doc)] for important_doc in important_docs] - - -def _get_sentences_with_word_count(sentences, word_count): - """Get list of sentences. Total number of returned words close to specified `word_count`. 
- - Parameters - ---------- - sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` - Given sentences. - word_count : int or None - Number of returned words. If None full most important sentences will be returned. - - Returns - ------- - list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` - Most important sentences. - - """ - length = 0 - selected_sentences = [] - - # Loops until the word count is reached. - for sentence in sentences: - words_in_sentence = len(sentence.text.split()) - - # Checks if the inclusion of the sentence gives a better approximation - # to the word parameter. - if abs(word_count - length - words_in_sentence) > abs(word_count - length): - return selected_sentences - - selected_sentences.append(sentence) - length += words_in_sentence - - return selected_sentences - - -def _extract_important_sentences(sentences, corpus, important_docs, word_count): - """Get most important sentences of the `corpus`. - - Parameters - ---------- - sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` - Given sentences. - corpus : list of list of (int, int) - Provided corpus. - important_docs : list of list of (int, int) - Most important docs of the corpus. - word_count : int - Number of returned words. If None full most important sentences will be returned. - - Returns - ------- - list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` - Most important sentences. - - """ - important_sentences = _get_important_sentences(sentences, corpus, important_docs) - - # If no "word_count" option is provided, the number of sentences is - # reduced by the provided ratio. Else, the ratio is ignored. - return important_sentences \ - if word_count is None \ - else _get_sentences_with_word_count(important_sentences, word_count) - - -def _format_results(extracted_sentences, split): - """Returns `extracted_sentences` in desired format. - - Parameters - ---------- - extracted_sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit - Given sentences. - split : bool - If True sentences will be returned as list. Otherwise sentences will be merged and returned as string. - - Returns - ------- - list of str - If `split` **OR** - str - Formatted result. - - """ - if split: - return [sentence.text for sentence in extracted_sentences] - return "\n".join(sentence.text for sentence in extracted_sentences) - - -def _build_hasheable_corpus(corpus): - """Hashes and get `corpus`. - - Parameters - ---------- - corpus : list of list of (int, int) - Given corpus. - - Returns - ------- - list of list of (int, int) - Hashable corpus. - - """ - return [tuple(doc) for doc in corpus] - - -def summarize_corpus(corpus, ratio=0.2): - """Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_. - Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer` - - Note - ---- - The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary - to make sense. - - - Parameters - ---------- - corpus : list of list of (int, int) - Given corpus. - ratio : float, optional - Number between 0 and 1 that determines the proportion of the number of - sentences of the original text to be chosen for the summary, optional. - - Returns - ------- - list of str - Most important documents of given `corpus` sorted by the document score, highest first. 
- - """ - hashable_corpus = _build_hasheable_corpus(corpus) - - # If the corpus is empty, the function ends. - if len(corpus) == 0: - logger.warning("Input corpus is empty.") - return [] - - # Warns the user if there are too few documents. - if len(corpus) < INPUT_MIN_LENGTH: - logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH) - - logger.info('Building graph') - graph = _build_graph(hashable_corpus) - - logger.info('Filling graph') - _set_graph_edge_weights(graph) - - logger.info('Removing unreachable nodes of graph') - _remove_unreachable_nodes(graph) - - # Cannot calculate eigenvectors if number of unique documents in corpus < 3. - # Warns user to add more text. The function ends. - if len(graph.nodes()) < 3: - logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") - return [] - - logger.info('Pagerank graph') - pagerank_scores = _pagerank(graph) - - logger.info('Sorting pagerank scores') - hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) - - return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]] - - -def summarize(text, ratio=0.2, word_count=None, split=False): - """Get a summarized version of the given text. - - The output summary will consist of the most representative sentences - and will be returned as a string, divided by newlines. - - Note - ---- - The input should be a string, and must be longer than :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` - sentences for the summary to make sense. - The text will be split into sentences using the split_sentences method in the :mod:`gensim.summarization.texcleaner` - module. Note that newlines divide sentences. - - - Parameters - ---------- - text : str - Given text. - ratio : float, optional - Number between 0 and 1 that determines the proportion of the number of - sentences of the original text to be chosen for the summary. - word_count : int or None, optional - Determines how many words will the output contain. - If both parameters are provided, the ratio will be ignored. - split : bool, optional - If True, list of sentences will be returned. Otherwise joined - strings will bwe returned. - - Returns - ------- - list of str - If `split` **OR** - str - Most representative sentences of given the text. - - """ - # Gets a list of processed sentences. - sentences = _clean_text_by_sentences(text) - - # If no sentence could be identified, the function ends. - if len(sentences) == 0: - logger.warning("Input text is empty.") - return [] if split else u"" - - # If only one sentence is present, the function raises an error (Avoids ZeroDivisionError). - if len(sentences) == 1: - raise ValueError("input must have more than one sentence") - - # Warns if the text is too short. - if len(sentences) < INPUT_MIN_LENGTH: - logger.warning("Input text is expected to have at least %d sentences.", INPUT_MIN_LENGTH) - - corpus = _build_corpus(sentences) - - most_important_docs = summarize_corpus(corpus, ratio=ratio if word_count is None else 1) - - # If couldn't get important docs, the algorithm ends. - if not most_important_docs: - logger.warning("Couldn't get relevant sentences.") - return [] if split else u"" - - # Extracts the most important sentences with the selected criterion. - extracted_sentences = _extract_important_sentences(sentences, corpus, most_important_docs, word_count) - - # Sorts the extracted sentences by apparition order in the original text. 
- extracted_sentences.sort(key=lambda s: s.index) - - return _format_results(extracted_sentences, split) diff --git a/gensim/summarization/syntactic_unit.py b/gensim/summarization/syntactic_unit.py deleted file mode 100644 index 2926e4fe1b..0000000000 --- a/gensim/summarization/syntactic_unit.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""This module contains implementation of SyntacticUnit class. It generally used while text cleaning. -:class:`~gensim.summarization.syntactic_unit.SyntacticUnit` represents printable version of provided text. - -""" - - -class SyntacticUnit(object): - """SyntacticUnit class. - - Attributes - ---------- - text : str - Input text. - token : str - Tokenized text. - tag : str - Tag of unit, optional. - index : int - Index of sytactic unit in corpus, optional. - score : float - Score of synctatic unit, optional. - - """ - - def __init__(self, text, token=None, tag=None, index=-1): - """ - - Parameters - ---------- - text : str - Input text. - token : str - Tokenized text, optional. - tag : str - Tag of unit, optional. - - """ - self.text = text - self.token = token - self.tag = tag[:2] if tag else None # Just first two letters of tag - self.index = index - self.score = -1 - - def __str__(self): - return "Original unit: '" + self.text + "' *-*-*-* " + "Processed unit: '" + self.token + "'" - - def __repr__(self): - return str(self) diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py deleted file mode 100644 index e6f7069c80..0000000000 --- a/gensim/summarization/textcleaner.py +++ /dev/null @@ -1,316 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""This module contains functions and processors used for processing text, -extracting sentences from text, working with acronyms and abbreviations. - -""" - -import re -import logging - -from gensim.summarization.syntactic_unit import SyntacticUnit -from gensim.parsing.preprocessing import preprocess_documents -from gensim.utils import tokenize, has_pattern - -logger = logging.getLogger(__name__) - -HAS_PATTERN = has_pattern() -if HAS_PATTERN: - from pattern.en import tag - -# Special separator used in abbreviations. -SEPARATOR = r'@' - -# Pattern to split text to sentences. -RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) - -# Pattern for detecting abbreviations (example: Sgt. Pepper). -AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE) - -# Pattern for detecting acronyms. -AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE) - -# Pattern for detecting acronyms (example: P.S. I love you). -AB_ACRONYM_LETTERS = re.compile(r'([a-zA-Z])\.([a-zA-Z])\.', re.UNICODE) - -# Like AB_SENIOR but with SEPARATOR between abbreviation and next word. -UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE) - -# Like AB_ACRONYM but with SEPARATOR between abbreviation and next word. -UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + SEPARATOR + r'(\w)', re.UNICODE) - - -def split_sentences(text): - """Split and get list of sentences from given text. It preserves abbreviations set in - :const:`~gensim.summarization.textcleaner.AB_SENIOR` and :const:`~gensim.summarization.textcleaner.AB_ACRONYM`. - - Parameters - ---------- - text : str - Input text. - - Returns - ------- - list of str - Sentences of given text. 
- - Example - ------- - .. sourcecode:: pycon - - >>> from gensim.summarization.textcleaner import split_sentences - >>> text = '''Beautiful is better than ugly. - ... Explicit is better than implicit. Simple is better than complex.''' - >>> split_sentences(text) - ['Beautiful is better than ugly.', - 'Explicit is better than implicit.', - 'Simple is better than complex.'] - - """ - processed = replace_abbreviations(text) - return [undo_replacement(sentence) for sentence in get_sentences(processed)] - - -def replace_abbreviations(text): - """Replace blank space to '@' separator after abbreviation and next word. - - Parameters - ---------- - text : str - Input sentence. - - Returns - ------- - str - Sentence with changed separator. - - Example - ------- - .. sourcecode:: pycon - - >>> replace_abbreviations("God bless you, please, Mrs. Robinson") - God bless you, please, Mrs.@Robinson - - """ - return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM]) - - -def undo_replacement(sentence): - """Replace `@` separator back to blank space after each abbreviation. - - Parameters - ---------- - sentence : str - Input sentence. - - Returns - ------- - str - Sentence with changed separator. - - Example - ------- - >>> undo_replacement("God bless you, please, Mrs.@Robinson") - God bless you, please, Mrs. Robinson - - """ - return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM]) - - -def replace_with_separator(text, separator, regexs): - """Get text with replaced separator if provided regular expressions were matched. - - Parameters - ---------- - text : str - Input text. - separator : str - The separator between words to be replaced. - regexs : list of `_sre.SRE_Pattern` - Regular expressions used in processing text. - - Returns - ------- - str - Text with replaced separators. - - """ - replacement = r"\1" + separator + r"\2" - result = text - for regex in regexs: - result = regex.sub(replacement, result) - return result - - -def get_sentences(text): - """Sentence generator from provided text. Sentence pattern set - in :const:`~gensim.summarization.textcleaner.RE_SENTENCE`. - - Parameters - ---------- - text : str - Input text. - - Yields - ------ - str - Single sentence extracted from text. - - Example - ------- - .. sourcecode:: pycon - - >>> text = "Does this text contains two sentences? Yes, it does." - >>> for sentence in get_sentences(text): - >>> print(sentence) - Does this text contains two sentences? - Yes, it does. - - """ - for match in RE_SENTENCE.finditer(text): - yield match.group() - - -def merge_syntactic_units(original_units, filtered_units, tags=None): - """Process given sentences and its filtered (tokenized) copies into - :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`. Also adds tags if they are provided to produced units. - - Parameters - ---------- - original_units : list - List of original sentences. - filtered_units : list - List of tokenized sentences. - tags : list of str, optional - List of strings used as tags for each unit. None as deafault. - - Returns - ------- - list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit - List of syntactic units (sentences). 
- - """ - units = [] - for i in range(len(original_units)): - if filtered_units[i] == '': - continue - - text = original_units[i] - token = filtered_units[i] - tag = tags[i][1] if tags else None - sentence = SyntacticUnit(text, token, tag, i) - - units.append(sentence) - - return units - - -def join_words(words, separator=" "): - """Concatenates `words` with `separator` between elements. - - Parameters - ---------- - words : list of str - Given words. - separator : str, optional - The separator between elements. - - Returns - ------- - str - String of merged words with separator between elements. - - """ - return separator.join(words) - - -def clean_text_by_sentences(text): - """Tokenize a given text into sentences, applying filters and lemmatize them. - - Parameters - ---------- - text : str - Given text. - - Returns - ------- - list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` - Sentences of the given text. - - """ - original_sentences = split_sentences(text) - filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)] - - return merge_syntactic_units(original_sentences, filtered_sentences) - - -def clean_text_by_word(text, deacc=True): - """Tokenize a given text into words, applying filters and lemmatize them. - - Parameters - ---------- - text : str - Given text. - deacc : bool, optional - Remove accentuation if True. - - Returns - ------- - dict - Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values. - - Example - ------- - .. sourcecode:: pycon - - >>> from gensim.summarization.textcleaner import clean_text_by_word - >>> clean_text_by_word("God helps those who help themselves") - {'god': Original unit: 'god' *-*-*-* Processed unit: 'god', - 'help': Original unit: 'help' *-*-*-* Processed unit: 'help', - 'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'} - - """ - text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) - original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc)) - filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)] - if HAS_PATTERN: - tags = tag(join_words(original_words)) # tag needs the context of the words in the text - else: - tags = None - units = merge_syntactic_units(original_words, filtered_words, tags) - return {unit.text: unit for unit in units} - - -def tokenize_by_word(text): - """Tokenize input text. Before tokenizing transforms text to lower case and removes accentuation and acronyms set - :const:`~gensim.summarization.textcleaner.AB_ACRONYM_LETTERS`. - - Parameters - ---------- - text : str - Given text. - - Returns - ------- - generator - Generator that yields sequence words of the given text. - - Example - ------- - .. sourcecode:: pycon - - >>> from gensim.summarization.textcleaner import tokenize_by_word - >>> g = tokenize_by_word('Veni. Vedi. 
Vici.') - >>> print(next(g)) - veni - >>> print(next(g)) - vedi - >>> print(next(g)) - vici - - """ - text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS]) - return tokenize(text_without_acronyms, to_lower=True, deacc=True) diff --git a/gensim/test/test_BM25.py b/gensim/test/test_BM25.py deleted file mode 100644 index eb63ddc328..0000000000 --- a/gensim/test/test_BM25.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -Automated tests for checking transformation algorithms (the models package). -""" - -import logging -import unittest - -from gensim.summarization.bm25 import get_bm25_weights, iter_bm25_bow, BM25 -from gensim.test.utils import common_texts - - -class TestBM25(unittest.TestCase): - def test_max_match_with_itself(self): - """ Document should show maximum matching with itself """ - weights = get_bm25_weights(common_texts) - for index, doc_weights in enumerate(weights): - expected = max(doc_weights) - predicted = doc_weights[index] - self.assertAlmostEqual(expected, predicted) - - def test_with_generator(self): - """ Check above function with input as generator """ - text_gen = (i for i in common_texts) - weights = get_bm25_weights(text_gen) - for index, doc_weights in enumerate(weights): - expected = max(doc_weights) - predicted = doc_weights[index] - self.assertAlmostEqual(expected, predicted) - - def test_nonnegative_weights(self): - """ All the weights for a partiular document should be non negative """ - weights = get_bm25_weights(common_texts) - for doc_weights in weights: - for weight in doc_weights: - self.assertTrue(weight >= 0.) - - def test_same_match_with_same_document(self): - """ A document should always get the same weight when matched with a particular document """ - corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']] - weights = get_bm25_weights(corpus) - self.assertAlmostEqual(weights[0][1], weights[0][2]) - - def test_disjoint_docs_if_weight_zero(self): - """ Two disjoint documents should have zero matching""" - corpus = [['cat', 'dog', 'lion'], ['goat', 'fish', 'tiger']] - weights = get_bm25_weights(corpus) - self.assertAlmostEqual(weights[0][1], 0) - self.assertAlmostEqual(weights[1][0], 0) - - def test_multiprocessing(self): - """ Result should be the same using different processes """ - weights1 = get_bm25_weights(common_texts) - weights2 = get_bm25_weights(common_texts, n_jobs=2) - weights3 = get_bm25_weights(common_texts, n_jobs=-1) - self.assertAlmostEqual(weights1, weights2) - self.assertAlmostEqual(weights1, weights3) - self.assertAlmostEqual(weights2, weights3) - - def test_k1(self): - """ Changing the k1 parameter should give consistent results """ - corpus = common_texts - index = 0 - doc = corpus[index] - first_k1 = 1.0 - second_k1 = 2.0 - - first_bm25 = BM25(corpus, k1=first_k1) - second_bm25 = BM25(corpus, k1=second_k1) - first_score = first_bm25.get_score(doc, index) - second_score = second_bm25.get_score(doc, index) - self.assertLess(first_score, second_score) - - first_iter = iter_bm25_bow(corpus, k1=first_k1) - second_iter = iter_bm25_bow(corpus, k1=second_k1) - first_score = dict(next(iter(first_iter)))[index] - second_score = dict(next(iter(second_iter)))[index] - self.assertLess(first_score, second_score) - - first_weights = get_bm25_weights(corpus, k1=first_k1) - second_weights = get_bm25_weights(corpus, k1=second_k1) - first_score = 
first_weights[index] - second_score = second_weights[index] - self.assertLess(first_score, second_score) - - def test_b(self): - """ Changing the b parameter should give consistent results """ - corpus = common_texts - index = 0 - doc = corpus[index] - first_b = 1.0 - second_b = 2.0 - - first_bm25 = BM25(corpus, b=first_b) - second_bm25 = BM25(corpus, b=second_b) - first_score = first_bm25.get_score(doc, index) - second_score = second_bm25.get_score(doc, index) - self.assertLess(first_score, second_score) - - first_iter = iter_bm25_bow(corpus, b=first_b) - second_iter = iter_bm25_bow(corpus, b=second_b) - first_score = dict(next(iter(first_iter)))[index] - second_score = dict(next(iter(second_iter)))[index] - self.assertLess(first_score, second_score) - - first_weights = get_bm25_weights(corpus, b=first_b) - second_weights = get_bm25_weights(corpus, b=second_b) - first_score = first_weights[index] - second_score = second_weights[index] - self.assertLess(first_score, second_score) - - def test_epsilon(self): - """ Changing the b parameter should give consistent results """ - corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']] - first_epsilon = 1.0 - second_epsilon = 2.0 - bm25 = BM25(corpus) - words_with_negative_idfs = set([ - word - for word, idf in bm25.idf.items() - if idf < 0 - ]) - index, doc = [ - (index, document) - for index, document - in enumerate(corpus) - if words_with_negative_idfs & set(document) - ][0] - - first_bm25 = BM25(corpus, epsilon=first_epsilon) - second_bm25 = BM25(corpus, epsilon=second_epsilon) - first_score = first_bm25.get_score(doc, index) - second_score = second_bm25.get_score(doc, index) - self.assertGreater(first_score, second_score) - - first_iter = iter_bm25_bow(corpus, epsilon=first_epsilon) - second_iter = iter_bm25_bow(corpus, epsilon=second_epsilon) - first_score = dict(next(iter(first_iter)))[index] - second_score = dict(next(iter(second_iter)))[index] - self.assertGreater(first_score, second_score) - - first_weights = get_bm25_weights(corpus, epsilon=first_epsilon) - second_weights = get_bm25_weights(corpus, epsilon=second_epsilon) - first_score = first_weights[index] - second_score = second_weights[index] - self.assertGreater(first_score, second_score) - - -if __name__ == '__main__': - logging.basicConfig(level=logging.DEBUG) - unittest.main() diff --git a/gensim/test/test_data/mihalcea_tarau.kw.txt b/gensim/test/test_data/mihalcea_tarau.kw.txt deleted file mode 100644 index b8ea0cabc3..0000000000 --- a/gensim/test/test_data/mihalcea_tarau.kw.txt +++ /dev/null @@ -1,21 +0,0 @@ -gilbert -hurricane -winds -coast -storm -saturday -flood -flooding -weather -alert -defense alerted -strong -people -pushed -puerto -cabral said -north -associated -south -domingo -residents diff --git a/gensim/test/test_data/mihalcea_tarau.kwpos.txt b/gensim/test/test_data/mihalcea_tarau.kwpos.txt deleted file mode 100644 index 7e14dfaae3..0000000000 --- a/gensim/test/test_data/mihalcea_tarau.kwpos.txt +++ /dev/null @@ -1,30 +0,0 @@ -gilbert -hurricane -coast -storm -saturday -winds heavy -flood -flooding -weather -alert -defense alerted -strong -pushed -people -puerto -cabral said -north -associated -south -domingo -residents -dominican -miles -southeast -san -civil -home -reached -juan -named diff --git a/gensim/test/test_data/mihalcea_tarau.summ.txt b/gensim/test/test_data/mihalcea_tarau.summ.txt deleted file mode 100644 index 58554ec78c..0000000000 --- a/gensim/test/test_data/mihalcea_tarau.summ.txt +++ /dev/null @@ -1,4 +0,0 @@ -Hurricane Gilbert swept 
toward the Dominican Republic Sunday, and the Civil Defense alerted its heavily populated south coast to prepare for high winds, heavy rains and high seas. -The National Hurricane Center in Miami reported its position at 2 a.m. Sunday at latitude 16.1 north, longitude 67.5 west, about 140 miles south of Ponce, Puerto Rico, and 200 miles southeast of Santo Domingo. -The National Weather Service in San Juan, Puerto Rico, said Gilbert was moving westward at 15 mph with a ``broad area of cloudiness and heavy weather'' rotating around the center of the storm. -Strong winds associated with the Gilbert brought coastal flooding, strong southeast winds and up to 12 feet feet to Puerto Rico's south coast. \ No newline at end of file diff --git a/gensim/test/test_data/mihalcea_tarau.txt b/gensim/test/test_data/mihalcea_tarau.txt deleted file mode 100644 index 6c90af2556..0000000000 --- a/gensim/test/test_data/mihalcea_tarau.txt +++ /dev/null @@ -1,24 +0,0 @@ -AP880911-0016 -AP-NR-09-11-88 0423EDT r i -BC-HurricaneGilbert 09-11 0339 -BC-Hurricane Gilbert,0348 -Hurricane Gilbert Heads Toward Dominican Coast -By RUDDY GONZALEZ -Associated Press Writer -SANTO DOMINGO, Dominican Republic (AP) -Hurricane Gilbert swept toward the Dominican Republic Sunday, and the Civil Defense alerted its heavily populated south coast to prepare for high winds, heavy rains and high seas. -The storm was approaching from the southeast with sustained winds of 75 mph gusting to 92 mph. -``There is no need for alarm,'' Civil Defense Director Eugenio Cabral said in a television alert shortly before midnight Saturday. -Cabral said residents of the province of Barahona should closely follow Gilbert's movement. -An estimated 100,000 people live in the province, including 70,000 in the city of Barahona, about 125 miles west of Santo Domingo. -Tropical Storm Gilbert formed in the eastern Caribbean and strengthened into a hurricane Saturday night. -The National Hurricane Center in Miami reported its position at 2 a.m. Sunday at latitude 16.1 north, longitude 67.5 west, about 140 miles south of Ponce, Puerto Rico, and 200 miles southeast of Santo Domingo. -The National Weather Service in San Juan, Puerto Rico, said Gilbert was moving westward at 15 mph with a ``broad area of cloudiness and heavy weather'' rotating around the center of the storm. -The weather service issued a flash flood watch for Puerto Rico and the Virgin Islands until at least 6 p.m. Sunday. -Strong winds associated with the Gilbert brought coastal flooding, strong southeast winds and up to 12 feet feet to Puerto Rico's south coast. -There were no reports of casualties. -San Juan, on the north coast, had heavy rains and gusts Saturday, but they subsided during the night. -On Saturday, Hurricane Florence was downgraded to a tropical storm and its remnants pushed inland from the U.S. Gulf Coast. -Residents returned home, happy to find little damage from 80 mph winds and sheets of rain. -Florence, the sixth named storm of the 1988 Atlantic storm season, was the second hurricane. -The first, Debby, reached minimal hurricane strength briefly before hitting the Mexican coast last month. \ No newline at end of file diff --git a/gensim/test/test_data/testlowdistinctwords.txt b/gensim/test/test_data/testlowdistinctwords.txt deleted file mode 100644 index 70e20fa3d3..0000000000 --- a/gensim/test/test_data/testlowdistinctwords.txt +++ /dev/null @@ -1,10 +0,0 @@ -here here. -there there. -here here. -there there. -here here. -there there. -here here. -there there. -here here. 
-there there. \ No newline at end of file diff --git a/gensim/test/test_data/testrepeatedkeywords.txt b/gensim/test/test_data/testrepeatedkeywords.txt deleted file mode 100644 index 76c4386b23..0000000000 --- a/gensim/test/test_data/testrepeatedkeywords.txt +++ /dev/null @@ -1 +0,0 @@ -Victor S. Sage Compare Sage 50c Editions Find accounting software that's right for your business Every product comes with anytime, anywhere online access; automatic updates; access to unlimited support; access to built-in credit card processing and payroll; and advanced reporting. Three solutions for your business 1 user From $249/year Buy now Free Trial 1-5 users From $299/year Buy now Free Trial 3-40 users From $1,199/year Buy now Free Trial Essential Accounting Accounts payable, accounts receivable, cash management check check check open check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check check Advanced Accounting Automated tasks, audit trail, budgeting, change order processing check check open check check check check check check check check check check check check check check check check check check check check check check check check check check check check In-depth Accounting Fast processing, industry-specific features, workflow management check open check check check check check check check check check check check Disclaimers open * This product is backed by a no-risk guarantee for first-time Sage 50 customers. If, within 60 days of purchase, you are not convinced that Sage 50 is the best accounting program for your business, we will refund your money (less and rebate you have received for this purchase). Dated proof of purchase and return of product is required. For details, call 877-481-0341. diff --git a/gensim/test/test_data/testsummarization_unrelated.txt b/gensim/test/test_data/testsummarization_unrelated.txt deleted file mode 100644 index 0a7e0dafbf..0000000000 --- a/gensim/test/test_data/testsummarization_unrelated.txt +++ /dev/null @@ -1,20 +0,0 @@ -River lake island mountain area. -Relay athletics metres freestyle hurdles. -Were court law government police. -Courcelles centimeters mattythewhite wine stamps. -Sysop iran pakistan ali arab. -Copyrighted northamerica rihanna cloudz knowles. -Israel sockpuppet jerusalem palestinian ifk. -Melbourne rovers australian wanderers dinamo. -Film series episode television. -Wrestling chateau ligue discus estonian. -Edits notability archived clearer speedy. -Admins acid molniya chemical compound. -India tamil singh temple kumar. -Bwebs malta hobart basa columella huon. -Rabbi bgwhite lebanese beirut caligari. -German berlin kategorie cross. -System power energy data. -Indonesia malaysia singapore greek jakarta. -Stakes webs futsal whitish thoroughbred racehorse. -Oblast uploaded nordland selsoviet halogaland. \ No newline at end of file diff --git a/gensim/test/test_keywords.py b/gensim/test/test_keywords.py deleted file mode 100644 index ffe2f32a8f..0000000000 --- a/gensim/test/test_keywords.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env python -# encoding: utf-8 -# -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -Automated test to reproduce the results of Mihalcea and Tarau (2004). - -Mihalcea and Tarau (2004) introduces the TextRank summarization algorithm. 
-As a validation of the gensim implementation we reproduced its results -in this test. - -""" - -import os.path -import logging -import unittest - -from gensim import utils -from gensim.summarization import keywords - - -class TestKeywordsTest(unittest.TestCase): - - def test_text_keywords(self): - pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - - with utils.open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f: - text = f.read() - - # calculate keywords - generated_keywords = keywords(text, split=True) - - # To be compared to the reference. - with utils.open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f: - kw = f.read().strip().split("\n") - - self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in kw}) - - def test_text_keywords_words(self): - pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - - with utils.open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f: - text = f.read() - - # request 15 keywords; the implementation may return slightly more - generated_keywords = keywords(text, words=15, split=True) - - self.assertEqual(len(generated_keywords), 16) - - def test_text_keywords_pos(self): - pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - - with utils.open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f: - text = f.read() - - # calculate keywords using only certain parts of speech - generated_keywords_nnvbjj = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True) - - # To be compared to the reference. - with utils.open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f: - kw = f.read().strip().split("\n") - - self.assertEqual({str(x) for x in generated_keywords_nnvbjj}, {str(x) for x in kw}) - - def test_text_summarization_raises_exception_on_short_input_text(self): - pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - - with utils.open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f: - text = f.read() - - # Keeps the first 8 sentences to make the text shorter. - text = "\n".join(text.split('\n')[:8]) - - self.assertTrue(keywords(text) is not None) - - def test_keywords_ratio(self): - pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - - with utils.open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f: - text = f.read() - - # Check ratio parameter is well behaved. Because length is taken on tokenized clean text - # we just check that ratio 20% is twice as long as ratio 10% - # Values of 10% and 20% were carefully selected for this test to avoid - # numerical instabilities when several keywords have almost the same score - selected_docs_12 = keywords(text, ratio=0.1, split=True) - selected_docs_21 = keywords(text, ratio=0.2, split=True) - - self.assertAlmostEqual(float(len(selected_docs_21)) / len(selected_docs_12), float(21) / 12, places=1) - - def test_text_keywords_with_small_graph(self): - # regression test, we get graph 2x2 on this text - text = 'IT: Utilities A look at five utilities to make your PCs more, efficient, effective, and efficacious' - kwds = keywords(text, words=1, split=True) - self.assertTrue(len(kwds)) - - def test_text_keywords_without_graph_edges(self): - # regression test, we get graph with no edges on this text - text = 'Sitio construcción. Estaremos línea.'
- kwds = keywords(text, deacc=False, scores=True) - self.assertFalse(len(kwds)) - - def test_keywords_with_words_greater_than_lemmas(self): - # words parameter is greater than number of words in text variable - text = 'Test string small length' - kwds = keywords(text, words=5, split=True) - self.assertIsNotNone(kwds) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main() diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py deleted file mode 100644 index c7ef335323..0000000000 --- a/gensim/test/test_summarization.py +++ /dev/null @@ -1,309 +0,0 @@ -#!/usr/bin/env python -# encoding: utf-8 -# -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -Automated test to reproduce the results of Mihalcea and Tarau (2004). - -Mihalcea and Tarau (2004) introduces the TextRank summarization algorithm. -As a validation of the gensim implementation we reproduced its results -in this test. - -""" - -import os.path -import logging -import unittest - -from gensim import utils -from gensim.corpora import Dictionary -from gensim.summarization import summarize, summarize_corpus, keywords, mz_keywords -from gensim.summarization.commons import remove_unreachable_nodes, build_graph -from gensim.summarization.graph import Graph - - -class TestGraph(unittest.TestCase): - - def _build_graph(self): - graph = build_graph(['a', 'b', 'c', 'd']) - graph.add_edge(('a', 'b')) - graph.add_edge(('b', 'c')) - graph.add_edge(('c', 'a')) - return graph - - def test_build_graph(self): - graph = self._build_graph() - - self.assertEqual(sorted(graph.nodes()), ['a', 'b', 'c', 'd']) - self.assertTrue(graph.has_edge(('a', 'b'))) - self.assertTrue(graph.has_edge(('b', 'c'))) - self.assertTrue(graph.has_edge(('c', 'a'))) - - graph = build_graph([]) - self.assertEqual(graph.nodes(), []) - - def test_remove_unreachable_nodes(self): - graph = self._build_graph() - self.assertTrue(graph.has_node('d')) - remove_unreachable_nodes(graph) - self.assertFalse(graph.has_node('d')) - - graph = self._build_graph() - graph.add_edge(('d', 'a'), wt=0.0) - graph.add_edge(('b', 'd'), wt=0) - self.assertTrue(graph.has_node('d')) - remove_unreachable_nodes(graph) - self.assertFalse(graph.has_node('d')) - - def test_graph_nodes(self): - graph = Graph() - - graph.add_node('a') - graph.add_node(1) - graph.add_node('b') - graph.add_node('qwe') - - self.assertTrue(graph.has_node('a')) - self.assertTrue(graph.has_node('b')) - self.assertTrue(graph.has_node('qwe')) - self.assertTrue(graph.has_node(1)) - self.assertFalse(graph.has_node(2)) - - graph.del_node(1) - self.assertEqual(sorted(graph.nodes()), ['a', 'b', 'qwe']) - - def test_graph_edges(self): - graph = Graph() - for node in ('a', 'b', 'c', 'd', 'e', 'foo', 'baz', 'qwe', 'rtyu'): - graph.add_node(node) - - edges = [ - (('a', 'b'), 3.0), - (('c', 'b'), 5.0), - (('d', 'e'), 0.5), - (('a', 'c'), 0.1), - (('foo', 'baz'), 0.11), - (('qwe', 'rtyu'), 0.0), - ] - for edge, weight in edges: - graph.add_edge(edge, weight) - - # check on edge weight first to exclude situation when touching will create an edge - self.assertEqual(graph.edge_weight(('qwe', 'rtyu')), 0.0) - self.assertEqual(graph.edge_weight(('rtyu', 'qwe')), 0.0) - self.assertFalse(graph.has_edge(('qwe', 'rtyu'))) - self.assertFalse(graph.has_edge(('rtyu', 'qwe'))) - - for (u, v), weight in edges: - if weight == 0: - continue - self.assertTrue(graph.has_edge((u, v))) - self.assertTrue(graph.has_edge((v, 
u))) - - edges_list = [(u, v) for (u, v), w in edges if w] - edges_list.extend((v, u) for (u, v), w in edges if w) - edges_list.sort() - - self.assertEqual(sorted(graph.iter_edges()), edges_list) - - ret_edges = graph.edges() - ret_edges.sort() - self.assertEqual(ret_edges, edges_list) - - for (u, v), weight in edges: - self.assertEqual(graph.edge_weight((u, v)), weight) - self.assertEqual(graph.edge_weight((v, u)), weight) - - self.assertEqual(sorted(graph.neighbors('a')), ['b', 'c']) - self.assertEqual(sorted(graph.neighbors('b')), ['a', 'c']) - self.assertEqual(graph.neighbors('d'), ['e']) - self.assertEqual(graph.neighbors('e'), ['d']) - self.assertEqual(graph.neighbors('foo'), ['baz']) - self.assertEqual(graph.neighbors('baz'), ['foo']) - self.assertEqual(graph.neighbors('foo'), ['baz']) - self.assertEqual(graph.neighbors('qwe'), []) - self.assertEqual(graph.neighbors('rtyu'), []) - - graph.del_edge(('a', 'b')) - self.assertFalse(graph.has_edge(('a', 'b'))) - self.assertFalse(graph.has_edge(('b', 'a'))) - - graph.add_edge(('baz', 'foo'), 0) - self.assertFalse(graph.has_edge(('foo', 'baz'))) - self.assertFalse(graph.has_edge(('baz', 'foo'))) - - graph.del_node('b') - self.assertFalse(graph.has_edge(('b', 'c'))) - self.assertFalse(graph.has_edge(('c', 'b'))) - - -class TestSummarizationTest(unittest.TestCase): - - def _get_text_from_test_data(self, file): - pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - with utils.open(os.path.join(pre_path, file), mode="r") as f: - return f.read() - - def test_text_summarization(self): - text = self._get_text_from_test_data("mihalcea_tarau.txt") - - # Makes a summary of the text. - generated_summary = summarize(text) - - # To be compared to the method reference. - summary = self._get_text_from_test_data("mihalcea_tarau.summ.txt") - - self.assertEqual(generated_summary, summary) - - def test_corpus_summarization(self): - text = self._get_text_from_test_data("mihalcea_tarau.txt") - - # Generate the corpus. - sentences = text.split("\n") - tokens = [sentence.split() for sentence in sentences] - dictionary = Dictionary(tokens) - corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens] - - # Extract the most important documents. - selected_documents = summarize_corpus(corpus) - - # They are compared to the method reference. - summary = self._get_text_from_test_data("mihalcea_tarau.summ.txt") - summary = summary.split('\n') - - # Each sentence in the document selection has to be in the model summary. - for doc_number, document in enumerate(selected_documents): - # Retrieves all words from the document. - words = [dictionary[token_id] for (token_id, count) in document] - - # Asserts that all of them are in a sentence from the model reference. - self.assertTrue(any(all(word in sentence for word in words) for sentence in summary)) - - def test_summary_from_unrelated_sentences(self): - # Tests that the summarization of a text with unrelated sentences is not empty string. - text = self._get_text_from_test_data("testsummarization_unrelated.txt") - generated_summary = summarize(text) - self.assertNotEqual(generated_summary, u"") - - def test_text_summarization_on_short_input_text_is_empty_string(self): - text = self._get_text_from_test_data("testsummarization_unrelated.txt") - - # Keeps the first 8 sentences to make the text shorter.
- text = "\n".join(text.split('\n')[:8]) - - self.assertNotEqual(summarize(text), u"") - - def test_text_summarization_raises_exception_on_single_input_sentence(self): - text = self._get_text_from_test_data("testsummarization_unrelated.txt") - - # Keeps the first sentence only. - text = text.split('\n')[0] - - self.assertRaises(ValueError, summarize, text) - - def test_corpus_summarization_is_not_empty_list_on_short_input_text(self): - text = self._get_text_from_test_data("testsummarization_unrelated.txt") - - # Keeps the first 8 sentences to make the text shorter. - sentences = text.split('\n')[:8] - - # Generate the corpus. - tokens = [sentence.split() for sentence in sentences] - dictionary = Dictionary(tokens) - corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens] - - self.assertNotEqual(summarize_corpus(corpus), []) - - def test_empty_text_summarization_is_empty_string(self): - self.assertEqual(summarize(""), u"") - - def test_empty_text_summarization_with_split_is_empty_list(self): - self.assertEqual(summarize("", split=True), []) - - def test_empty_corpus_summarization_is_empty_list(self): - self.assertEqual(summarize_corpus([]), []) - - def test_corpus_summarization_ratio(self): - text = self._get_text_from_test_data("mihalcea_tarau.txt") - - # Generate the corpus. - sentences = text.split('\n') - tokens = [sentence.split() for sentence in sentences] - dictionary = Dictionary(tokens) - corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens] - - # Makes summaries of the text using different ratio parameters. - for x in range(1, 10): - ratio = x / float(10) - selected_docs = summarize_corpus(corpus, ratio=ratio) - expected_summary_length = int(len(corpus) * ratio) - - self.assertEqual(len(selected_docs), expected_summary_length) - - def test_repeated_keywords(self): - text = self._get_text_from_test_data("testrepeatedkeywords.txt") - - kwds = keywords(text) - self.assertTrue(len(kwds.splitlines())) - - kwds_u = keywords(utils.to_unicode(text)) - self.assertTrue(len(kwds_u.splitlines())) - - kwds_lst = keywords(text, split=True) - self.assertTrue(len(kwds_lst)) - - def test_keywords_runs(self): - text = self._get_text_from_test_data("mihalcea_tarau.txt") - - kwds = keywords(text) - self.assertTrue(len(kwds.splitlines())) - - kwds_u = keywords(utils.to_unicode(text)) - self.assertTrue(len(kwds_u.splitlines())) - - kwds_lst = keywords(text, split=True) - self.assertTrue(len(kwds_lst)) - - def test_mz_keywords(self): - pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - - with utils.open(os.path.join(pre_path, "head500.noblanks.cor"), 'rb') as f: - text = utils.to_unicode(f.read()) - text = u' '.join(text.split()[:10240]) - kwds = mz_keywords(text) - self.assertTrue(kwds.startswith('autism')) - self.assertTrue(kwds.endswith('uk')) - self.assertTrue(len(kwds.splitlines())) - - kwds_lst = mz_keywords(text, split=True) - self.assertTrue(len(kwds_lst)) - # Automatic thresholding selects words with n_blocks / n_blocks+1 - # bits of entropy. For this text, n_blocks=10 - n_blocks = 10. - kwds_auto = mz_keywords(text, scores=True, weighted=False, threshold='auto') - self.assertTrue(kwds_auto[-1][1] > (n_blocks / (n_blocks + 1.))) - - def test_low_distinct_words_corpus_summarization_is_empty_list(self): - text = self._get_text_from_test_data("testlowdistinctwords.txt") - - # Generate the corpus. 
- sentences = text.split("\n") - tokens = [sentence.split() for sentence in sentences] - dictionary = Dictionary(tokens) - corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens] - - self.assertEqual(summarize_corpus(corpus), []) - - def test_low_distinct_words_summarization_is_empty_string(self): - text = self._get_text_from_test_data("testlowdistinctwords.txt") - self.assertEqual(summarize(text), u"") - - def test_low_distinct_words_summarization_with_split_is_empty_list(self): - text = self._get_text_from_test_data("testlowdistinctwords.txt") - self.assertEqual(summarize(text, split=True), []) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main()