Doc fixes and separate workflow for building docs via CI #3462

Merged (8 commits) on May 25, 2023
68 changes: 68 additions & 0 deletions .github/workflows/build-docs.yml
@@ -0,0 +1,68 @@
#
# This workflow rebuilds documentation and stores the resulting patch as a
# workflow artifact. We can then download the artifact, apply the patch, and
# then push the changes.
#
# It's possible to do all this locally on a developer's machine, but it's not
# trivial, because it requires many prerequisites.
#
name: Rebuild documentation
on: workflow_dispatch
jobs:
  docs:
    name: Rebuild documentation
    timeout-minutes: 180
    runs-on: ubuntu-20.04
    defaults:
      run:
        shell: bash

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.8
        uses: actions/setup-python@v4
        with:
          #
          # We use Py3.8 here for historical reasons.
          #
          python-version: "3.8"

      - name: Update pip
        run: python -m pip install -U pip

      - name: Install apt packages for LaTeX rendering
        run: |
          sudo apt-get -yq update
          sudo apt-get -yq remove texlive-binaries --purge
          sudo apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra texlive-latex-recommended texlive-fonts-recommended latexmk
          sudo apt-get -yq install build-essential python3.8-dev
      - name: Install gensim and its dependencies
        run: pip install -e .[docs]

      - name: Build documentation
        run: |
          python setup.py build_ext --inplace
          make -C docs/src clean html

      - name: Check changes to prebuilt docs
        run: |
          git config user.email "[email protected]"
          git config user.name "Gensim Docs Build"
          if ! git diff --quiet @ ; then
            git add .
            branch="$GITHUB_HEAD_REF ($GITHUB_REF_NAME)"
            git commit -m "Import rebuilt documentation for branch $branch"
            git format-patch @^
            git bundle create prebuilt-docs-changes.bundle @^...@
            git reset --mixed @^
            git diff --exit-code --stat @
          fi

      - name: Upload prebuilt docs changes
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: prebuilt-docs-changes
          path: |
            *.patch
            *.bundle
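For reference, consuming that artifact locally might look like the following (a sketch: the artifact is downloaded and unzipped from the workflow run first, and the exact patch filename is whatever `git format-patch` produced):

    # From the root of a gensim checkout, after downloading and unzipping
    # the "prebuilt-docs-changes" artifact:

    # Option 1: apply and commit the patch in one step.
    git am *.patch

    # Option 2: pull the commit out of the bundle instead.
    git bundle verify prebuilt-docs-changes.bundle
    git fetch prebuilt-docs-changes.bundle HEAD
    git cherry-pick FETCH_HEAD

    # Either way, push the result.
    git push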
4 changes: 0 additions & 4 deletions .github/workflows/tests.yml
@@ -51,10 +51,6 @@ jobs:
         run: |
           python setup.py build_ext --inplace
           make -C docs/src clean html
-          #
-          # FIXME: do we want to store the built documentation somewhere, or is
-          # knowing that the docs built successfully enough?
-          #
 
   tests:
     name: test ${{ matrix.os }} python ${{ matrix.python }}
1 change: 1 addition & 0 deletions .gitignore
@@ -75,6 +75,7 @@ data
 *.inv
 *.js
 docs/_images/
+docs/_downloads/
 
 #
 # Generated by Cython
11 changes: 9 additions & 2 deletions docs/src/auto_examples/howtos/run_doc2vec_imdb.py
@@ -100,13 +100,20 @@ def download_dataset(url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v
         return fname
 
     # Download the file to local storage first.
-    with smart_open.open(url, "rb", ignore_ext=True) as fin:
-        with smart_open.open(fname, 'wb', ignore_ext=True) as fout:
+    try:
+        kwargs = { 'compression': smart_open.compression.NO_COMPRESSION }
+        fin = smart_open.open(url, "rb", **kwargs)
+    except (AttributeError, TypeError):
+        kwargs = { 'ignore_ext': True }
+        fin = smart_open.open(url, "rb", **kwargs)
+    if fin:
+        with smart_open.open(fname, 'wb', **kwargs) as fout:
             while True:
                 buf = fin.read(io.DEFAULT_BUFFER_SIZE)
                 if not buf:
                     break
                 fout.write(buf)
+        fin.close()
 
     return fname
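The hunk above feature-detects the smart_open API: newer releases replaced the `ignore_ext` flag with a `compression` keyword, so the code tries the new spelling first and falls back on `AttributeError`/`TypeError`. As a standalone sketch of the same pattern (the helper name is illustrative):

    import smart_open

    def open_uncompressed(uri, mode='rb'):
        """Open `uri` with compression disabled, whichever smart_open is installed."""
        try:
            # Newer smart_open: the 'compression' keyword replaces 'ignore_ext'.
            return smart_open.open(uri, mode, compression=smart_open.compression.NO_COMPRESSION)
        except (AttributeError, TypeError):
            # Older smart_open: fall back to the deprecated flag.
            return smart_open.open(uri, mode, ignore_ext=True)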
2 changes: 1 addition & 1 deletion docs/src/auto_examples/howtos/run_doc2vec_imdb.py.md5
@@ -1 +1 @@
-7020ef8545a05962fe2d7146b4b95f11
+507b6c07ce76db341761559a96daa17d
3 changes: 3 additions & 0 deletions docs/src/auto_examples/tutorials/run_ensemblelda.py
@@ -29,6 +29,9 @@
 from gensim.corpora import Dictionary
 from nltk.stem.wordnet import WordNetLemmatizer
 
+from nltk import download
+download('wordnet')
+
 lemmatizer = WordNetLemmatizer()
 docs = api.load('text8')
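The added `download('wordnet')` call fetches the corpus the lemmatizer needs; without it, the first `lemmatize()` call raises a `LookupError` on a fresh machine. A minimal sketch (the example token is illustrative):

    from nltk import download
    from nltk.stem.wordnet import WordNetLemmatizer

    download('wordnet')  # a no-op if the corpus is already present
    print(WordNetLemmatizer().lemmatize('models'))  # -> 'model'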
2 changes: 1 addition & 1 deletion docs/src/auto_examples/tutorials/run_ensemblelda.py.md5
@@ -1 +1 @@
-be0c32b18644ebb1a7826764b37ebc01
+9f666b02b1eeac820f2a2200e6d14f6e
4 changes: 4 additions & 0 deletions docs/src/auto_examples/tutorials/run_lda.py
@@ -126,6 +126,10 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'
 # easy to read is very desirable in topic modelling.
 #
 
+# Download the WordNet data
+from nltk import download
+download('wordnet')
+
 # Lemmatize the documents.
 from nltk.stem.wordnet import WordNetLemmatizer
2 changes: 1 addition & 1 deletion docs/src/auto_examples/tutorials/run_lda.py.md5
@@ -1 +1 @@
-0995a15406049093d95974700d471876
+802d286d0c620260af50bf5ef0e08253
13 changes: 10 additions & 3 deletions docs/src/check_gallery.py
@@ -20,13 +20,18 @@ def different(path1, path2):
 
 
 curr_dir = os.path.dirname(__file__)
+docs_dir = os.path.dirname(curr_dir)
+src_dir = os.path.dirname(docs_dir)
 stale = []
 for root, dirs, files in os.walk(os.path.join(curr_dir, 'gallery')):
     for f in files:
         if f.endswith('.py'):
             source_path = os.path.join(root, f)
             cache_path = source_path.replace('docs/src/gallery/', 'docs/src/auto_examples/')
 
+            rel_source_path = os.path.relpath(source_path, src_dir)
+            rel_cache_path = os.path.relpath(cache_path, src_dir)
+
             #
             # We check two things:
             #
@@ -40,7 +45,7 @@
             # but we run them both because it's trivial.
             #
             if different(source_path, cache_path):
-                stale.append(cache_path)
+                stale.append(f"{rel_source_path} != {rel_cache_path}")
                 continue
 
             actual_md5 = hashlib.md5()
@@ -52,9 +57,10 @@
                 expected_md5 = fin.read()
 
             if actual_md5.hexdigest() != expected_md5:
-                stale.append(cache_path)
+                stale.append(f"{rel_source_path} md5 != {rel_cache_path}.md5")
 
 if stale:
+    stale = '\n'.join(stale)
     print(f"""The gallery cache appears stale.
 
 Rebuild the documentation using the following commands from the gensim root subdirectory:
@@ -64,6 +70,7 @@
 
 and then run `git add docs/src/auto_examples` to update the cache.
 
-Stale files: {stale}
+Stale files:
+{stale}
 """, file=sys.stderr)
     sys.exit(1)
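For context, the staleness check above boils down to comparing a gallery script's md5 against the hash cached next to its prebuilt counterpart, roughly like this (paths are illustrative):

    import hashlib

    with open('docs/src/gallery/tutorials/run_lda.py', 'rb') as fin:
        actual = hashlib.md5(fin.read()).hexdigest()
    with open('docs/src/auto_examples/tutorials/run_lda.py.md5') as fin:
        expected = fin.read().strip()
    print('stale' if actual != expected else 'up to date')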
3 changes: 2 additions & 1 deletion docs/src/conf.py
@@ -254,10 +254,11 @@ def sort_key(source_dir):
     'run_word2vec.py',
     'run_doc2vec_lee.py',
     'run_fasttext.py',
+    'run_ensemblelda.py',
     'run_annoy.py',
     'run_lda.py',
     'run_wmd.py',
-    'run_summarization.py',
+    'run_scm.py',
 ]
 
 howto_order = [
11 changes: 9 additions & 2 deletions docs/src/gallery/howtos/run_doc2vec_imdb.py
@@ -100,13 +100,20 @@ def download_dataset(url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v
         return fname
 
     # Download the file to local storage first.
-    with smart_open.open(url, "rb", ignore_ext=True) as fin:
-        with smart_open.open(fname, 'wb', ignore_ext=True) as fout:
+    try:
+        kwargs = { 'compression': smart_open.compression.NO_COMPRESSION }
+        fin = smart_open.open(url, "rb", **kwargs)
+    except (AttributeError, TypeError):
+        kwargs = { 'ignore_ext': True }
+        fin = smart_open.open(url, "rb", **kwargs)
+    if fin:
+        with smart_open.open(fname, 'wb', **kwargs) as fout:
             while True:
                 buf = fin.read(io.DEFAULT_BUFFER_SIZE)
                 if not buf:
                     break
                 fout.write(buf)
+        fin.close()
 
     return fname
3 changes: 3 additions & 0 deletions docs/src/gallery/tutorials/run_ensemblelda.py
@@ -29,6 +29,9 @@
 from gensim.corpora import Dictionary
 from nltk.stem.wordnet import WordNetLemmatizer
 
+from nltk import download
+download('wordnet')
+
 lemmatizer = WordNetLemmatizer()
 docs = api.load('text8')
4 changes: 4 additions & 0 deletions docs/src/gallery/tutorials/run_lda.py
@@ -126,6 +126,10 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'
 # easy to read is very desirable in topic modelling.
 #
 
+# Download the WordNet data
+from nltk import download
+download('wordnet')
+
 # Lemmatize the documents.
 from nltk.stem.wordnet import WordNetLemmatizer
1 change: 1 addition & 0 deletions setup.py
@@ -323,6 +323,7 @@ def run(self):
     'memory_profiler',
     'annoy',
     'Pyro4',
+    'scikit-learn',
     'nltk',
     'testfixtures',
     'statsmodels',