Doc fixes and separate workflow for building docs via CI (#3462)
* Use new smart_open compression parameter instead of ignore_ext when possible

Keep compatibility with old versions of smart_open just in case.

The deps require smart_open >= 1.8.1; version 5.1.0 introduced the compression
parameter and 6.0.0 dropped the ignore_ext parameter.
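
For reference, the compatibility shim used in the affected scripts looks roughly
like this (condensed from the run_doc2vec_imdb.py change below; the url value is
an illustrative placeholder):

    import smart_open

    url = "http://example.com/dataset.tar.gz"  # illustrative placeholder

    try:
        # smart_open >= 5.1.0: disable transparent decompression via the new parameter.
        kwargs = {'compression': smart_open.compression.NO_COMPRESSION}
        fin = smart_open.open(url, "rb", **kwargs)
    except (AttributeError, TypeError):
        # Older smart_open: the compression module/parameter is missing, so fall
        # back to the ignore_ext parameter that 6.0.0 later dropped.
        kwargs = {'ignore_ext': True}
        fin = smart_open.open(url, "rb", **kwargs)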

* Download the NLTK WordNet data before using the WordNet lemmatizer

Fixes the docs build for the run_lda.py and run_ensemblelda.py tutorials.
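
The fix in both tutorials is a download call before the lemmatizer is first
used, along these lines:

    from nltk import download
    download('wordnet')  # fetch the WordNet corpus the lemmatizer relies on

    from nltk.stem.wordnet import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()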

* Add missing documentation build dependency scikit-learn

The run_compare_lda.py howto and run_word2vec.py tutorial import it.

It was removed from docs_testenv when the scikit-learn wrapper was removed.

Fixes: commit a21d9cc

* Update the order in which the tutorials will be placed

Add the ensemblelda and scm tutorials, which were introduced in 2021.

Drop the summarization tutorial, which was removed in 2020.

Use the order from the existing prebuilt docs files.

Without a defined order they are placed non-deterministically, which means
commits that do not touch the docs can still change the prebuilt docs.
A minimal sketch of how an explicit ordering works follows below.

Fixes: commit 76579b3
Fixes: commit ddeeb12
Fixes: commit 2dcaaf8
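
As a rough illustration of how an explicit ordering keeps the gallery
deterministic (a minimal sketch only, not the exact code in docs/src/conf.py,
with the lists abbreviated):

    import os

    tutorials_order = [
        'run_word2vec.py',
        'run_doc2vec_lee.py',
        'run_fasttext.py',
        'run_ensemblelda.py',
        # ... remaining tutorials in display order ...
    ]

    def sort_key(source_file):
        # Files in the list keep their position; anything unlisted sorts last.
        name = os.path.basename(source_file)
        try:
            return tutorials_order.index(name)
        except ValueError:
            return len(tutorials_order)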

* Improve the display of the stale prebuilt docs files

Print the .md5 file when it is the stale file.
Print the source path for each stale file.
Print paths relative to the source tree.
Print only one stale file pair per line.
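
With these changes a stale pair is reported one per line, roughly like this
(illustrative output only):

    docs/src/gallery/tutorials/run_lda.py != docs/src/auto_examples/tutorials/run_lda.py
    docs/src/gallery/tutorials/run_lda.py md5 != docs/src/auto_examples/tutorials/run_lda.py.md5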

* Upload the changes to the generated docs to GitHub artifacts

Building the docs often takes too long locally, so this allows pull request
submitters to build on GitHub, download the resulting changes to the
generated docs, incorporate them into a commit, and then update their pull
request with that commit.

Also check that the changes to the prebuilt docs are committed,
except for docs that change for every single rebuild.

* Update duplicate copies of howtos and tutorials

* Move the docs build action to a separate workflow

---------

Co-authored-by: Michael Penkov <[email protected]>
pabs3 and mpenkov authored May 25, 2023
1 parent eb98bf3 commit 3ae286e
Showing 15 changed files with 117 additions and 15 deletions.
68 changes: 68 additions & 0 deletions .github/workflows/build-docs.yml
@@ -0,0 +1,68 @@
#
# This workflow rebuilds documentation and stores the resulting patch as a
# workflow artifact. We can then download the artifact, apply the patch, and
# then push the changes.
#
# It's possible to do all this locally on a developer's machine, but it's not
# trivial, because it requires many pre-requisites.
#
name: Rebuild documentation
on: workflow_dispatch
jobs:
  docs:
    name: Rebuild documentation
    timeout-minutes: 180
    runs-on: ubuntu-20.04
    defaults:
      run:
        shell: bash

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python }}
        uses: actions/setup-python@v4
        with:
          #
          # We use Py3.8 here for historical reasons.
          #
          python-version: "3.8"

      - name: Update pip
        run: python -m pip install -U pip

      - name: Install apt packages for LaTeX rendering
        run: |
          sudo apt-get -yq update
          sudo apt-get -yq remove texlive-binaries --purge
          sudo apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra texlive-latex-recommended texlive-latex-extra texlive-fonts-recommended latexmk
          sudo apt-get -yq install build-essential python3.8-dev
      - name: Install gensim and its dependencies
        run: pip install -e .[docs]

      - name: Build documentation
        run: |
          python setup.py build_ext --inplace
          make -C docs/src clean html
      - name: Check changes to prebuilt docs
        run: |
          git config user.email "[email protected]"
          git config user.name "Gensim Docs Build"
          if ! git diff --quiet @ ; then
            git add .
            branch="$GITHUB_HEAD_REF ($GITHUB_REF_NAME)"
            git commit -m "Import rebuilt documentation for branch $branch"
            git format-patch @^
            git bundle create prebuilt-docs-changes.bundle @^...@
            git reset --mixed @^
            git diff --exit-code --stat @
          fi
      - name: Upload prebuilt docs changes
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: prebuilt-docs-changes
          path: |
            *.patch
            *.bundle
4 changes: 0 additions & 4 deletions .github/workflows/tests.yml
@@ -51,10 +51,6 @@ jobs:
         run: |
           python setup.py build_ext --inplace
           make -C docs/src clean html
-      #
-      # FIXME: do we want to store the built documentation somewhere, or is
-      # knowing that the docs built successfully enough?
-      #
   tests:
     name: test ${{ matrix.os }} python ${{ matrix.python }}
1 change: 1 addition & 0 deletions .gitignore
@@ -75,6 +75,7 @@ data
 *.inv
 *.js
 docs/_images/
+docs/_downloads/

 #
 # Generated by Cython
11 changes: 9 additions & 2 deletions docs/src/auto_examples/howtos/run_doc2vec_imdb.py
@@ -100,13 +100,20 @@ def download_dataset(url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v
         return fname

     # Download the file to local storage first.
-    with smart_open.open(url, "rb", ignore_ext=True) as fin:
-        with smart_open.open(fname, 'wb', ignore_ext=True) as fout:
+    try:
+        kwargs = { 'compression': smart_open.compression.NO_COMPRESSION }
+        fin = smart_open.open(url, "rb", **kwargs)
+    except (AttributeError, TypeError):
+        kwargs = { 'ignore_ext': True }
+        fin = smart_open.open(url, "rb", **kwargs)
+    if fin:
+        with smart_open.open(fname, 'wb', **kwargs) as fout:
             while True:
                 buf = fin.read(io.DEFAULT_BUFFER_SIZE)
                 if not buf:
                     break
                 fout.write(buf)
+        fin.close()

     return fname

2 changes: 1 addition & 1 deletion docs/src/auto_examples/howtos/run_doc2vec_imdb.py.md5
@@ -1 +1 @@
-7020ef8545a05962fe2d7146b4b95f11
+507b6c07ce76db341761559a96daa17d
3 changes: 3 additions & 0 deletions docs/src/auto_examples/tutorials/run_ensemblelda.py
@@ -29,6 +29,9 @@
 from gensim.corpora import Dictionary
 from nltk.stem.wordnet import WordNetLemmatizer

+from nltk import download
+download('wordnet')
+
 lemmatizer = WordNetLemmatizer()
 docs = api.load('text8')

2 changes: 1 addition & 1 deletion docs/src/auto_examples/tutorials/run_ensemblelda.py.md5
@@ -1 +1 @@
-be0c32b18644ebb1a7826764b37ebc01
+9f666b02b1eeac820f2a2200e6d14f6e
4 changes: 4 additions & 0 deletions docs/src/auto_examples/tutorials/run_lda.py
@@ -126,6 +126,10 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'
 # easy to read is very desirable in topic modelling.
 #

+# Download the WordNet data
+from nltk import download
+download('wordnet')
+
 # Lemmatize the documents.
 from nltk.stem.wordnet import WordNetLemmatizer

2 changes: 1 addition & 1 deletion docs/src/auto_examples/tutorials/run_lda.py.md5
@@ -1 +1 @@
-0995a15406049093d95974700d471876
+802d286d0c620260af50bf5ef0e08253
13 changes: 10 additions & 3 deletions docs/src/check_gallery.py
@@ -20,13 +20,18 @@ def different(path1, path2):


 curr_dir = os.path.dirname(__file__)
+docs_dir = os.path.dirname(curr_dir)
+src_dir = os.path.dirname(docs_dir)
 stale = []
 for root, dirs, files in os.walk(os.path.join(curr_dir, 'gallery')):
     for f in files:
         if f.endswith('.py'):
             source_path = os.path.join(root, f)
             cache_path = source_path.replace('docs/src/gallery/', 'docs/src/auto_examples/')

+            rel_source_path = os.path.relpath(source_path, src_dir)
+            rel_cache_path = os.path.relpath(cache_path, src_dir)
+
             #
             # We check two things:
             #
@@ -40,7 +45,7 @@ def different(path1, path2):
             # but we run them both because it's trivial.
             #
             if different(source_path, cache_path):
-                stale.append(cache_path)
+                stale.append(f"{rel_source_path} != {rel_cache_path}")
                 continue

             actual_md5 = hashlib.md5()
@@ -52,9 +57,10 @@ def different(path1, path2):
                 expected_md5 = fin.read()

             if actual_md5.hexdigest() != expected_md5:
-                stale.append(cache_path)
+                stale.append(f"{rel_source_path} md5 != {rel_cache_path}.md5")

 if stale:
+    stale = '\n'.join(stale)
     print(f"""The gallery cache appears stale.
 Rebuild the documentation using the following commands from the gensim root subdirectory:
@@ -64,6 +70,7 @@ def different(path1, path2):
 and then run `git add docs/src/auto_examples` to update the cache.
-Stale files: {stale}
+Stale files:
+{stale}
 """, file=sys.stderr)
     sys.exit(1)
3 changes: 2 additions & 1 deletion docs/src/conf.py
@@ -254,10 +254,11 @@ def sort_key(source_dir):
     'run_word2vec.py',
     'run_doc2vec_lee.py',
     'run_fasttext.py',
+    'run_ensemblelda.py',
     'run_annoy.py',
     'run_lda.py',
     'run_wmd.py',
-    'run_summarization.py',
+    'run_scm.py',
 ]

 howto_order = [
11 changes: 9 additions & 2 deletions docs/src/gallery/howtos/run_doc2vec_imdb.py
@@ -100,13 +100,20 @@ def download_dataset(url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v
         return fname

     # Download the file to local storage first.
-    with smart_open.open(url, "rb", ignore_ext=True) as fin:
-        with smart_open.open(fname, 'wb', ignore_ext=True) as fout:
+    try:
+        kwargs = { 'compression': smart_open.compression.NO_COMPRESSION }
+        fin = smart_open.open(url, "rb", **kwargs)
+    except (AttributeError, TypeError):
+        kwargs = { 'ignore_ext': True }
+        fin = smart_open.open(url, "rb", **kwargs)
+    if fin:
+        with smart_open.open(fname, 'wb', **kwargs) as fout:
             while True:
                 buf = fin.read(io.DEFAULT_BUFFER_SIZE)
                 if not buf:
                     break
                 fout.write(buf)
+        fin.close()

     return fname

3 changes: 3 additions & 0 deletions docs/src/gallery/tutorials/run_ensemblelda.py
@@ -29,6 +29,9 @@
 from gensim.corpora import Dictionary
 from nltk.stem.wordnet import WordNetLemmatizer

+from nltk import download
+download('wordnet')
+
 lemmatizer = WordNetLemmatizer()
 docs = api.load('text8')

4 changes: 4 additions & 0 deletions docs/src/gallery/tutorials/run_lda.py
@@ -126,6 +126,10 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'
 # easy to read is very desirable in topic modelling.
 #

+# Download the WordNet data
+from nltk import download
+download('wordnet')
+
 # Lemmatize the documents.
 from nltk.stem.wordnet import WordNetLemmatizer

1 change: 1 addition & 0 deletions setup.py
@@ -323,6 +323,7 @@ def run(self):
     'memory_profiler',
     'annoy',
     'Pyro4',
+    'scikit-learn',
     'nltk',
     'testfixtures',
     'statsmodels',
