From c6f8c38fb9a38f7c75ff494dd125af756de650c1 Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Sat, 18 Mar 2023 12:57:31 +0800 Subject: [PATCH 1/8] Use new smart_open compression parameter instead of ignore_ext when possible Keep compatibility with old versions of smart_open just in case. 1.8.1 is required by the deps, 5.1.0 introduced the compression parameter, 6.0.0 dropped the ignore_ext parameter. --- docs/src/gallery/howtos/run_doc2vec_imdb.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/src/gallery/howtos/run_doc2vec_imdb.py b/docs/src/gallery/howtos/run_doc2vec_imdb.py index 36e19ce898..6acca81d65 100644 --- a/docs/src/gallery/howtos/run_doc2vec_imdb.py +++ b/docs/src/gallery/howtos/run_doc2vec_imdb.py @@ -100,13 +100,20 @@ def download_dataset(url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v return fname # Download the file to local storage first. - with smart_open.open(url, "rb", ignore_ext=True) as fin: - with smart_open.open(fname, 'wb', ignore_ext=True) as fout: + try: + kwargs = { 'compression': smart_open.compression.NO_COMPRESSION } + fin = smart_open.open(url, "rb", **kwargs) + except (AttributeError, TypeError): + kwargs = { 'ignore_ext': True } + fin = smart_open.open(url, "rb", **kwargs) + if fin: + with smart_open.open(fname, 'wb', **kwargs) as fout: while True: buf = fin.read(io.DEFAULT_BUFFER_SIZE) if not buf: break fout.write(buf) + fin.close() return fname From 8844a23ef8467c3af0eb968749f2a41ca7082f84 Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Sat, 18 Mar 2023 11:43:23 +0800 Subject: [PATCH 2/8] Download the NLTK WordNet data before using the WordNet lemmatizer Fixes the docs build for the run_lda.py and run_ensemblelda.py tutorials. --- docs/src/gallery/tutorials/run_ensemblelda.py | 3 +++ docs/src/gallery/tutorials/run_lda.py | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py index aa87d0ecd3..1cb34ec17e 100644 --- a/docs/src/gallery/tutorials/run_ensemblelda.py +++ b/docs/src/gallery/tutorials/run_ensemblelda.py @@ -29,6 +29,9 @@ from gensim.corpora import Dictionary from nltk.stem.wordnet import WordNetLemmatizer +from nltk import download +download('wordnet') + lemmatizer = WordNetLemmatizer() docs = api.load('text8') diff --git a/docs/src/gallery/tutorials/run_lda.py b/docs/src/gallery/tutorials/run_lda.py index 74956619a9..85c3aa6aca 100644 --- a/docs/src/gallery/tutorials/run_lda.py +++ b/docs/src/gallery/tutorials/run_lda.py @@ -126,6 +126,10 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' # easy to read is very desirable in topic modelling. # +# Download the WordNet data +from nltk import download +download('wordnet') + # Lemmatize the documents. from nltk.stem.wordnet import WordNetLemmatizer From 2249d56f7dc9c0d65f322528aca8312a097775f4 Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Sat, 18 Mar 2023 11:13:07 +0800 Subject: [PATCH 3/8] Add missing documentation build dependency scikit-learn The run_compare_lda.py howto and run_word2vec.py tutorial import it. It was removed from docs_testenv when the scikit-learn wrapper was removed. 
Fixes: commit a21d9cc768598640f38e4bd03d368f8712a9aa77 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 314ee87711..538fa681cd 100644 --- a/setup.py +++ b/setup.py @@ -323,6 +323,7 @@ def run(self): 'memory_profiler', 'annoy', 'Pyro4', + 'scikit-learn', 'nltk', 'testfixtures', 'statsmodels', From 8f199af7d9e0f0360842641e2998e8ea4a415203 Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Sun, 19 Mar 2023 13:03:45 +0800 Subject: [PATCH 4/8] Update the order in which the tutorials will be placed Add the ensemblelda and scm tutorials added in 2021. Remove the summarization tutorial as it was removed in 2020. Use the order from the existing prebuilt docs files. Without a defined order they will be placed non-deterministically, which means commits not changing docs will change prebuilt docs. Fixes: commit 76579b3fd33d6a59fb397e8d101bb4326951afa8 Fixes: commit ddeeb1274c996b6992e11f7cbe7ca44c776f6daf Fixes: commit 2dcaaf80f4fb8023acc2f118b0966d92fca9500e --- docs/src/conf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/src/conf.py b/docs/src/conf.py index da943a98f9..a1d94b812d 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -254,10 +254,11 @@ def sort_key(source_dir): 'run_word2vec.py', 'run_doc2vec_lee.py', 'run_fasttext.py', + 'run_ensemblelda.py', 'run_annoy.py', 'run_lda.py', 'run_wmd.py', - 'run_summarization.py', + 'run_scm.py', ] howto_order = [ From 1baa096cf98f0c9c5bfaeb6e066e68fd54543611 Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Sat, 18 Mar 2023 15:25:09 +0800 Subject: [PATCH 5/8] Improve the display of the stale prebuilt docs files Print the .md5 file when it is the stale file. Print the source path for each stale file. Print paths relative to the source tree. Print only one stale file pair per line. --- docs/src/check_gallery.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/src/check_gallery.py b/docs/src/check_gallery.py index d03726dabb..dc4c9d0bdf 100644 --- a/docs/src/check_gallery.py +++ b/docs/src/check_gallery.py @@ -20,6 +20,8 @@ def different(path1, path2): curr_dir = os.path.dirname(__file__) +docs_dir = os.path.dirname(curr_dir) +src_dir = os.path.dirname(docs_dir) stale = [] for root, dirs, files in os.walk(os.path.join(curr_dir, 'gallery')): for f in files: @@ -27,6 +29,9 @@ def different(path1, path2): source_path = os.path.join(root, f) cache_path = source_path.replace('docs/src/gallery/', 'docs/src/auto_examples/') + rel_source_path = os.path.relpath(source_path, src_dir) + rel_cache_path = os.path.relpath(cache_path, src_dir) + # # We check two things: # @@ -40,7 +45,7 @@ def different(path1, path2): # but we run them both because it's trivial. # if different(source_path, cache_path): - stale.append(cache_path) + stale.append(f"{rel_source_path} != {rel_cache_path}") continue actual_md5 = hashlib.md5() @@ -52,9 +57,10 @@ def different(path1, path2): expected_md5 = fin.read() if actual_md5.hexdigest() != expected_md5: - stale.append(cache_path) + stale.append(f"{rel_source_path} md5 != {rel_cache_path}.md5") if stale: + stale = '\n'.join(stale) print(f"""The gallery cache appears stale. Rebuild the documentation using the following commands from the gensim root subdirectory: @@ -64,6 +70,7 @@ def different(path1, path2): and then run `git add docs/src/auto_examples` to update the cache. 
-Stale files: {stale} +Stale files: +{stale} """, file=sys.stderr) sys.exit(1) From efc6f323d2b714596fc735440a10545ba18d254b Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Sat, 18 Mar 2023 09:51:13 +0800 Subject: [PATCH 6/8] Upload the changes to the generated docs to GitHub artifacts Building the docs often takes too long locally, so this allows pull request submitters to build on GitHub, download the changes and incorporate the changes to the generated docs in a commit, then update their pull request with the generated docs commit. Also check that the changes to the prebuilt docs are committed, except for docs that change for every single rebuild. --- .github/workflows/tests.yml | 23 +++++++++++++++++++++++ .gitignore | 1 + 2 files changed, 24 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b63b5d89b2..c5a71c31a7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -51,6 +51,29 @@ jobs: run: | python setup.py build_ext --inplace make -C docs/src clean html + + - name: Check changes to prebuilt docs + run: | + git config user.email "noreply@github.com" + git config user.name "Gensim Docs Build" + if ! git diff --quiet @ ; then + git add . + branch="$GITHUB_HEAD_REF ($GITHUB_REF_NAME)" + git commit -m "Import rebuilt documentation for branch $branch" + git format-patch @^ + git bundle create prebuilt-docs-changes.bundle @^...@ + git reset --mixed @^ + git diff --exit-code --stat @ + fi + + - name: Upload prebuilt docs changes + if: always() + uses: actions/upload-artifact@v3 + with: + name: prebuilt-docs-changes + path: | + *.patch + *.bundle # # FIXME: do we want to store the built documentation somewhere, or is # knowing that the docs built successfully enough? diff --git a/.gitignore b/.gitignore index 7ebc73745f..95e8669a28 100644 --- a/.gitignore +++ b/.gitignore @@ -75,6 +75,7 @@ data *.inv *.js docs/_images/ +docs/_downloads/ # # Generated by Cython From 1788ffdf9c49332bfca29b27e47fdbfb1ce7d46c Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Sat, 18 Mar 2023 15:15:05 +0800 Subject: [PATCH 7/8] Update duplicate copies of howtos and tutorials --- docs/src/auto_examples/howtos/run_doc2vec_imdb.py | 11 +++++++++-- docs/src/auto_examples/howtos/run_doc2vec_imdb.py.md5 | 2 +- docs/src/auto_examples/tutorials/run_ensemblelda.py | 3 +++ .../auto_examples/tutorials/run_ensemblelda.py.md5 | 2 +- docs/src/auto_examples/tutorials/run_lda.py | 4 ++++ docs/src/auto_examples/tutorials/run_lda.py.md5 | 2 +- 6 files changed, 19 insertions(+), 5 deletions(-) diff --git a/docs/src/auto_examples/howtos/run_doc2vec_imdb.py b/docs/src/auto_examples/howtos/run_doc2vec_imdb.py index 36e19ce898..6acca81d65 100644 --- a/docs/src/auto_examples/howtos/run_doc2vec_imdb.py +++ b/docs/src/auto_examples/howtos/run_doc2vec_imdb.py @@ -100,13 +100,20 @@ def download_dataset(url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v return fname # Download the file to local storage first. 
- with smart_open.open(url, "rb", ignore_ext=True) as fin: - with smart_open.open(fname, 'wb', ignore_ext=True) as fout: + try: + kwargs = { 'compression': smart_open.compression.NO_COMPRESSION } + fin = smart_open.open(url, "rb", **kwargs) + except (AttributeError, TypeError): + kwargs = { 'ignore_ext': True } + fin = smart_open.open(url, "rb", **kwargs) + if fin: + with smart_open.open(fname, 'wb', **kwargs) as fout: while True: buf = fin.read(io.DEFAULT_BUFFER_SIZE) if not buf: break fout.write(buf) + fin.close() return fname diff --git a/docs/src/auto_examples/howtos/run_doc2vec_imdb.py.md5 b/docs/src/auto_examples/howtos/run_doc2vec_imdb.py.md5 index 7de0204a7c..b2b072c7e3 100644 --- a/docs/src/auto_examples/howtos/run_doc2vec_imdb.py.md5 +++ b/docs/src/auto_examples/howtos/run_doc2vec_imdb.py.md5 @@ -1 +1 @@ -7020ef8545a05962fe2d7146b4b95f11 \ No newline at end of file +507b6c07ce76db341761559a96daa17d \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py b/docs/src/auto_examples/tutorials/run_ensemblelda.py index aa87d0ecd3..1cb34ec17e 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.py +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py @@ -29,6 +29,9 @@ from gensim.corpora import Dictionary from nltk.stem.wordnet import WordNetLemmatizer +from nltk import download +download('wordnet') + lemmatizer = WordNetLemmatizer() docs = api.load('text8') diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 index f09f123fba..620d90e6ee 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 @@ -1 +1 @@ -be0c32b18644ebb1a7826764b37ebc01 \ No newline at end of file +9f666b02b1eeac820f2a2200e6d14f6e \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_lda.py b/docs/src/auto_examples/tutorials/run_lda.py index 74956619a9..85c3aa6aca 100644 --- a/docs/src/auto_examples/tutorials/run_lda.py +++ b/docs/src/auto_examples/tutorials/run_lda.py @@ -126,6 +126,10 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' # easy to read is very desirable in topic modelling. # +# Download the WordNet data +from nltk import download +download('wordnet') + # Lemmatize the documents. from nltk.stem.wordnet import WordNetLemmatizer diff --git a/docs/src/auto_examples/tutorials/run_lda.py.md5 b/docs/src/auto_examples/tutorials/run_lda.py.md5 index 515f059550..7bb8b3eb9a 100644 --- a/docs/src/auto_examples/tutorials/run_lda.py.md5 +++ b/docs/src/auto_examples/tutorials/run_lda.py.md5 @@ -1 +1 @@ -0995a15406049093d95974700d471876 \ No newline at end of file +802d286d0c620260af50bf5ef0e08253 \ No newline at end of file From e684948185c58773658c88665fac3807f2b708b2 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 21 May 2023 09:08:22 +0900 Subject: [PATCH 8/8] move action to separate workflow --- .github/workflows/build-docs.yml | 68 ++++++++++++++++++++++++++++++++ .github/workflows/tests.yml | 27 ------------- 2 files changed, 68 insertions(+), 27 deletions(-) create mode 100644 .github/workflows/build-docs.yml diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml new file mode 100644 index 0000000000..f13a773d2e --- /dev/null +++ b/.github/workflows/build-docs.yml @@ -0,0 +1,68 @@ +# +# This workflow rebuilds documentation and stores the resulting patch as a +# workflow artifact. 
We can then download the artifact, apply the patch, and +# then push the changes. +# +# It's possible to do all this locally on a developer's machine, but it's not +# trivial, because it requires many pre-requisites. +# +name: Rebuild documentation +on: workflow_dispatch +jobs: + docs: + name: Rebuild documentation + timeout-minutes: 180 + runs-on: ubuntu-20.04 + defaults: + run: + shell: bash + + steps: + - uses: actions/checkout@v3 + - name: Setup up Python ${{ matrix.python }} + uses: actions/setup-python@v4 + with: + # + # We use Py3.8 here for historical reasons. + # + python-version: "3.8" + + - name: Update pip + run: python -m pip install -U pip + + - name: Install apt packages for LaTeX rendering + run: | + sudo apt-get -yq update + sudo apt-get -yq remove texlive-binaries --purge + sudo apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra texlive-latex-recommended texlive-latex-extra texlive-fonts-recommended latexmk + sudo apt-get -yq install build-essential python3.8-dev + - name: Install gensim and its dependencies + run: pip install -e .[docs] + + - name: Build documentation + run: | + python setup.py build_ext --inplace + make -C docs/src clean html + + - name: Check changes to prebuilt docs + run: | + git config user.email "noreply@github.com" + git config user.name "Gensim Docs Build" + if ! git diff --quiet @ ; then + git add . + branch="$GITHUB_HEAD_REF ($GITHUB_REF_NAME)" + git commit -m "Import rebuilt documentation for branch $branch" + git format-patch @^ + git bundle create prebuilt-docs-changes.bundle @^...@ + git reset --mixed @^ + git diff --exit-code --stat @ + fi + + - name: Upload prebuilt docs changes + if: always() + uses: actions/upload-artifact@v3 + with: + name: prebuilt-docs-changes + path: | + *.patch + *.bundle diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c5a71c31a7..b176c910a8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -52,33 +52,6 @@ jobs: python setup.py build_ext --inplace make -C docs/src clean html - - name: Check changes to prebuilt docs - run: | - git config user.email "noreply@github.com" - git config user.name "Gensim Docs Build" - if ! git diff --quiet @ ; then - git add . - branch="$GITHUB_HEAD_REF ($GITHUB_REF_NAME)" - git commit -m "Import rebuilt documentation for branch $branch" - git format-patch @^ - git bundle create prebuilt-docs-changes.bundle @^...@ - git reset --mixed @^ - git diff --exit-code --stat @ - fi - - - name: Upload prebuilt docs changes - if: always() - uses: actions/upload-artifact@v3 - with: - name: prebuilt-docs-changes - path: | - *.patch - *.bundle - # - # FIXME: do we want to store the built documentation somewhere, or is - # knowing that the docs built successfully enough? - # - tests: name: test ${{ matrix.os }} python ${{ matrix.python }} timeout-minutes: 30
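
A sketch of how a pull request submitter might consume the prebuilt-docs-changes artifact that the "Rebuild documentation" workflow above uploads. The artifact and bundle names come from the workflow definition; the downloaded zip name and the exact *.patch file name are assumptions, since GitHub serves artifacts as zip archives and `git format-patch` names its output after the commit subject.

    # Download the prebuilt-docs-changes artifact from the workflow run page,
    # then, from a checkout of the pull request branch:
    unzip prebuilt-docs-changes.zip    # assumed download name

    # Apply the "Import rebuilt documentation" commit produced by the workflow.
    git am *.patch

    # Alternatively, inspect the bundle; a bundle can also be fetched from
    # like a remote repository if applying the patch is not wanted.
    git bundle verify prebuilt-docs-changes.bundle
    git bundle list-heads prebuilt-docs-changes.bundle

    # Push the branch to update the pull request with the generated docs commit.
    git push

Either route ends with the rebuilt docs committed on the contributor's branch, which is the workflow described in the build-docs.yml comments above.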