diff --git a/.github/workflows/build-docs.yml b/.github/workflows/build-docs.yml new file mode 100644 index 0000000000..f13a773d2e --- /dev/null +++ b/.github/workflows/build-docs.yml @@ -0,0 +1,68 @@ +# +# This workflow rebuilds documentation and stores the resulting patch as a +# workflow artifact. We can then download the artifact, apply the patch, and +# then push the changes. +# +# It's possible to do all this locally on a developer's machine, but it's not +# trivial, because it requires many pre-requisites. +# +name: Rebuild documentation +on: workflow_dispatch +jobs: + docs: + name: Rebuild documentation + timeout-minutes: 180 + runs-on: ubuntu-20.04 + defaults: + run: + shell: bash + + steps: + - uses: actions/checkout@v3 + - name: Setup up Python ${{ matrix.python }} + uses: actions/setup-python@v4 + with: + # + # We use Py3.8 here for historical reasons. + # + python-version: "3.8" + + - name: Update pip + run: python -m pip install -U pip + + - name: Install apt packages for LaTeX rendering + run: | + sudo apt-get -yq update + sudo apt-get -yq remove texlive-binaries --purge + sudo apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra texlive-latex-recommended texlive-latex-extra texlive-fonts-recommended latexmk + sudo apt-get -yq install build-essential python3.8-dev + - name: Install gensim and its dependencies + run: pip install -e .[docs] + + - name: Build documentation + run: | + python setup.py build_ext --inplace + make -C docs/src clean html + + - name: Check changes to prebuilt docs + run: | + git config user.email "noreply@github.com" + git config user.name "Gensim Docs Build" + if ! git diff --quiet @ ; then + git add . + branch="$GITHUB_HEAD_REF ($GITHUB_REF_NAME)" + git commit -m "Import rebuilt documentation for branch $branch" + git format-patch @^ + git bundle create prebuilt-docs-changes.bundle @^...@ + git reset --mixed @^ + git diff --exit-code --stat @ + fi + + - name: Upload prebuilt docs changes + if: always() + uses: actions/upload-artifact@v3 + with: + name: prebuilt-docs-changes + path: | + *.patch + *.bundle diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b63b5d89b2..b176c910a8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -51,10 +51,6 @@ jobs: run: | python setup.py build_ext --inplace make -C docs/src clean html - # - # FIXME: do we want to store the built documentation somewhere, or is - # knowing that the docs built successfully enough? - # tests: name: test ${{ matrix.os }} python ${{ matrix.python }} diff --git a/.gitignore b/.gitignore index 7ebc73745f..95e8669a28 100644 --- a/.gitignore +++ b/.gitignore @@ -75,6 +75,7 @@ data *.inv *.js docs/_images/ +docs/_downloads/ # # Generated by Cython diff --git a/docs/src/auto_examples/howtos/run_doc2vec_imdb.py b/docs/src/auto_examples/howtos/run_doc2vec_imdb.py index 36e19ce898..6acca81d65 100644 --- a/docs/src/auto_examples/howtos/run_doc2vec_imdb.py +++ b/docs/src/auto_examples/howtos/run_doc2vec_imdb.py @@ -100,13 +100,20 @@ def download_dataset(url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v return fname # Download the file to local storage first. - with smart_open.open(url, "rb", ignore_ext=True) as fin: - with smart_open.open(fname, 'wb', ignore_ext=True) as fout: + try: + kwargs = { 'compression': smart_open.compression.NO_COMPRESSION } + fin = smart_open.open(url, "rb", **kwargs) + except (AttributeError, TypeError): + kwargs = { 'ignore_ext': True } + fin = smart_open.open(url, "rb", **kwargs) + if fin: + with smart_open.open(fname, 'wb', **kwargs) as fout: while True: buf = fin.read(io.DEFAULT_BUFFER_SIZE) if not buf: break fout.write(buf) + fin.close() return fname diff --git a/docs/src/auto_examples/howtos/run_doc2vec_imdb.py.md5 b/docs/src/auto_examples/howtos/run_doc2vec_imdb.py.md5 index 7de0204a7c..b2b072c7e3 100644 --- a/docs/src/auto_examples/howtos/run_doc2vec_imdb.py.md5 +++ b/docs/src/auto_examples/howtos/run_doc2vec_imdb.py.md5 @@ -1 +1 @@ -7020ef8545a05962fe2d7146b4b95f11 \ No newline at end of file +507b6c07ce76db341761559a96daa17d \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py b/docs/src/auto_examples/tutorials/run_ensemblelda.py index aa87d0ecd3..1cb34ec17e 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.py +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py @@ -29,6 +29,9 @@ from gensim.corpora import Dictionary from nltk.stem.wordnet import WordNetLemmatizer +from nltk import download +download('wordnet') + lemmatizer = WordNetLemmatizer() docs = api.load('text8') diff --git a/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 index f09f123fba..620d90e6ee 100644 --- a/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 +++ b/docs/src/auto_examples/tutorials/run_ensemblelda.py.md5 @@ -1 +1 @@ -be0c32b18644ebb1a7826764b37ebc01 \ No newline at end of file +9f666b02b1eeac820f2a2200e6d14f6e \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_lda.py b/docs/src/auto_examples/tutorials/run_lda.py index 74956619a9..85c3aa6aca 100644 --- a/docs/src/auto_examples/tutorials/run_lda.py +++ b/docs/src/auto_examples/tutorials/run_lda.py @@ -126,6 +126,10 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' # easy to read is very desirable in topic modelling. # +# Download the WordNet data +from nltk import download +download('wordnet') + # Lemmatize the documents. from nltk.stem.wordnet import WordNetLemmatizer diff --git a/docs/src/auto_examples/tutorials/run_lda.py.md5 b/docs/src/auto_examples/tutorials/run_lda.py.md5 index 515f059550..7bb8b3eb9a 100644 --- a/docs/src/auto_examples/tutorials/run_lda.py.md5 +++ b/docs/src/auto_examples/tutorials/run_lda.py.md5 @@ -1 +1 @@ -0995a15406049093d95974700d471876 \ No newline at end of file +802d286d0c620260af50bf5ef0e08253 \ No newline at end of file diff --git a/docs/src/check_gallery.py b/docs/src/check_gallery.py index d03726dabb..dc4c9d0bdf 100644 --- a/docs/src/check_gallery.py +++ b/docs/src/check_gallery.py @@ -20,6 +20,8 @@ def different(path1, path2): curr_dir = os.path.dirname(__file__) +docs_dir = os.path.dirname(curr_dir) +src_dir = os.path.dirname(docs_dir) stale = [] for root, dirs, files in os.walk(os.path.join(curr_dir, 'gallery')): for f in files: @@ -27,6 +29,9 @@ def different(path1, path2): source_path = os.path.join(root, f) cache_path = source_path.replace('docs/src/gallery/', 'docs/src/auto_examples/') + rel_source_path = os.path.relpath(source_path, src_dir) + rel_cache_path = os.path.relpath(cache_path, src_dir) + # # We check two things: # @@ -40,7 +45,7 @@ def different(path1, path2): # but we run them both because it's trivial. # if different(source_path, cache_path): - stale.append(cache_path) + stale.append(f"{rel_source_path} != {rel_cache_path}") continue actual_md5 = hashlib.md5() @@ -52,9 +57,10 @@ def different(path1, path2): expected_md5 = fin.read() if actual_md5.hexdigest() != expected_md5: - stale.append(cache_path) + stale.append(f"{rel_source_path} md5 != {rel_cache_path}.md5") if stale: + stale = '\n'.join(stale) print(f"""The gallery cache appears stale. Rebuild the documentation using the following commands from the gensim root subdirectory: @@ -64,6 +70,7 @@ def different(path1, path2): and then run `git add docs/src/auto_examples` to update the cache. -Stale files: {stale} +Stale files: +{stale} """, file=sys.stderr) sys.exit(1) diff --git a/docs/src/conf.py b/docs/src/conf.py index a005ea5e76..25d4bc1c1e 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -254,10 +254,11 @@ def sort_key(source_dir): 'run_word2vec.py', 'run_doc2vec_lee.py', 'run_fasttext.py', + 'run_ensemblelda.py', 'run_annoy.py', 'run_lda.py', 'run_wmd.py', - 'run_summarization.py', + 'run_scm.py', ] howto_order = [ diff --git a/docs/src/gallery/howtos/run_doc2vec_imdb.py b/docs/src/gallery/howtos/run_doc2vec_imdb.py index 36e19ce898..6acca81d65 100644 --- a/docs/src/gallery/howtos/run_doc2vec_imdb.py +++ b/docs/src/gallery/howtos/run_doc2vec_imdb.py @@ -100,13 +100,20 @@ def download_dataset(url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v return fname # Download the file to local storage first. - with smart_open.open(url, "rb", ignore_ext=True) as fin: - with smart_open.open(fname, 'wb', ignore_ext=True) as fout: + try: + kwargs = { 'compression': smart_open.compression.NO_COMPRESSION } + fin = smart_open.open(url, "rb", **kwargs) + except (AttributeError, TypeError): + kwargs = { 'ignore_ext': True } + fin = smart_open.open(url, "rb", **kwargs) + if fin: + with smart_open.open(fname, 'wb', **kwargs) as fout: while True: buf = fin.read(io.DEFAULT_BUFFER_SIZE) if not buf: break fout.write(buf) + fin.close() return fname diff --git a/docs/src/gallery/tutorials/run_ensemblelda.py b/docs/src/gallery/tutorials/run_ensemblelda.py index aa87d0ecd3..1cb34ec17e 100644 --- a/docs/src/gallery/tutorials/run_ensemblelda.py +++ b/docs/src/gallery/tutorials/run_ensemblelda.py @@ -29,6 +29,9 @@ from gensim.corpora import Dictionary from nltk.stem.wordnet import WordNetLemmatizer +from nltk import download +download('wordnet') + lemmatizer = WordNetLemmatizer() docs = api.load('text8') diff --git a/docs/src/gallery/tutorials/run_lda.py b/docs/src/gallery/tutorials/run_lda.py index 74956619a9..85c3aa6aca 100644 --- a/docs/src/gallery/tutorials/run_lda.py +++ b/docs/src/gallery/tutorials/run_lda.py @@ -126,6 +126,10 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' # easy to read is very desirable in topic modelling. # +# Download the WordNet data +from nltk import download +download('wordnet') + # Lemmatize the documents. from nltk.stem.wordnet import WordNetLemmatizer diff --git a/setup.py b/setup.py index d47dc7e508..025d8f4af1 100644 --- a/setup.py +++ b/setup.py @@ -323,6 +323,7 @@ def run(self): 'memory_profiler', 'annoy', 'Pyro4', + 'scikit-learn', 'nltk', 'testfixtures', 'statsmodels',