Doc fixes and separate workflow for building docs via CI #3462

Merged (8 commits) on May 25, 2023
68 changes: 68 additions & 0 deletions .github/workflows/build-docs.yml
@@ -0,0 +1,68 @@
#
# This workflow rebuilds documentation and stores the resulting patch as a
# workflow artifact. We can then download the artifact, apply the patch, and
# then push the changes.
#
# It's possible to do all this locally on a developer's machine, but it's not
# trivial, because it requires many prerequisites.
#
name: Rebuild documentation
on: workflow_dispatch
jobs:
  docs:
    name: Rebuild documentation
    timeout-minutes: 180
    runs-on: ubuntu-20.04
    defaults:
      run:
        shell: bash

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.8
        uses: actions/setup-python@v4
        with:
          #
          # We use Py3.8 here for historical reasons.
          #
          python-version: "3.8"

      - name: Update pip
        run: python -m pip install -U pip

      - name: Install apt packages for LaTeX rendering
        run: |
          sudo apt-get -yq update
          sudo apt-get -yq remove texlive-binaries --purge
          sudo apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra texlive-latex-recommended texlive-fonts-recommended latexmk
          sudo apt-get -yq install build-essential python3.8-dev
      - name: Install gensim and its dependencies
        run: pip install -e .[docs]

      - name: Build documentation
        run: |
          python setup.py build_ext --inplace
          make -C docs/src clean html

      - name: Check changes to prebuilt docs
        run: |
          git config user.email "[email protected]"
          git config user.name "Gensim Docs Build"
          if ! git diff --quiet @ ; then
            git add .
            branch="$GITHUB_HEAD_REF ($GITHUB_REF_NAME)"
            git commit -m "Import rebuilt documentation for branch $branch"
            git format-patch @^
            git bundle create prebuilt-docs-changes.bundle @^...@
            git reset --mixed @^
            git diff --exit-code --stat @
          fi

      - name: Upload prebuilt docs changes
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: prebuilt-docs-changes
          path: |
            *.patch
            *.bundle
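For reference, consuming that artifact locally might look like the following (a sketch: the artifact is downloaded and unzipped from the workflow run first, and the exact patch filename is whatever `git format-patch` produced):

    # From the root of a gensim checkout, after downloading and unzipping
    # the "prebuilt-docs-changes" artifact:

    # Option 1: apply and commit the patch in one step.
    git am *.patch

    # Option 2: pull the commit out of the bundle instead.
    git bundle verify prebuilt-docs-changes.bundle
    git fetch prebuilt-docs-changes.bundle HEAD
    git cherry-pick FETCH_HEAD

    # Either way, push the result.
    git push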
4 changes: 0 additions & 4 deletions .github/workflows/tests.yml
@@ -51,10 +51,6 @@ jobs:
         run: |
           python setup.py build_ext --inplace
           make -C docs/src clean html
-          #
-          # FIXME: do we want to store the built documentation somewhere, or is
-          # knowing that the docs built successfully enough?
-          #
 
   tests:
     name: test ${{ matrix.os }} python ${{ matrix.python }}
1 change: 1 addition & 0 deletions .gitignore
@@ -75,6 +75,7 @@ data
 *.inv
 *.js
 docs/_images/
+docs/_downloads/
 
 #
 # Generated by Cython
11 changes: 9 additions & 2 deletions docs/src/auto_examples/howtos/run_doc2vec_imdb.py
@@ -100,13 +100,20 @@ def download_dataset(url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v
         return fname
 
     # Download the file to local storage first.
-    with smart_open.open(url, "rb", ignore_ext=True) as fin:
-        with smart_open.open(fname, 'wb', ignore_ext=True) as fout:
+    try:
+        kwargs = { 'compression': smart_open.compression.NO_COMPRESSION }
+        fin = smart_open.open(url, "rb", **kwargs)
+    except (AttributeError, TypeError):
+        kwargs = { 'ignore_ext': True }
+        fin = smart_open.open(url, "rb", **kwargs)
+    if fin:
+        with smart_open.open(fname, 'wb', **kwargs) as fout:
             while True:
                 buf = fin.read(io.DEFAULT_BUFFER_SIZE)
                 if not buf:
                     break
                 fout.write(buf)
+        fin.close()
 
     return fname
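The hunk above feature-detects the smart_open API: newer releases replaced the `ignore_ext` flag with a `compression` keyword, so the code tries the new spelling first and falls back on `AttributeError`/`TypeError`. As a standalone sketch of the same pattern (the helper name is illustrative):

    import smart_open

    def open_uncompressed(uri, mode='rb'):
        """Open `uri` with compression disabled, whichever smart_open is installed."""
        try:
            # Newer smart_open: the 'compression' keyword replaces 'ignore_ext'.
            return smart_open.open(uri, mode, compression=smart_open.compression.NO_COMPRESSION)
        except (AttributeError, TypeError):
            # Older smart_open: fall back to the deprecated flag.
            return smart_open.open(uri, mode, ignore_ext=True)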
2 changes: 1 addition & 1 deletion docs/src/auto_examples/howtos/run_doc2vec_imdb.py.md5
@@ -1 +1 @@
-7020ef8545a05962fe2d7146b4b95f11
+507b6c07ce76db341761559a96daa17d
3 changes: 3 additions & 0 deletions docs/src/auto_examples/tutorials/run_ensemblelda.py
@@ -29,6 +29,9 @@
 from gensim.corpora import Dictionary
 from nltk.stem.wordnet import WordNetLemmatizer
 
+from nltk import download
+download('wordnet')
+
 lemmatizer = WordNetLemmatizer()
 docs = api.load('text8')
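The added `download('wordnet')` call fetches the corpus the lemmatizer needs; without it, the first `lemmatize()` call raises a `LookupError` on a fresh machine. A minimal sketch (the example token is illustrative):

    from nltk import download
    from nltk.stem.wordnet import WordNetLemmatizer

    download('wordnet')  # a no-op if the corpus is already present
    print(WordNetLemmatizer().lemmatize('models'))  # -> 'model'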
2 changes: 1 addition & 1 deletion docs/src/auto_examples/tutorials/run_ensemblelda.py.md5
@@ -1 +1 @@
-be0c32b18644ebb1a7826764b37ebc01
+9f666b02b1eeac820f2a2200e6d14f6e
4 changes: 4 additions & 0 deletions docs/src/auto_examples/tutorials/run_lda.py
@@ -126,6 +126,10 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'
 # easy to read is very desirable in topic modelling.
 #
 
+# Download the WordNet data
+from nltk import download
+download('wordnet')
+
 # Lemmatize the documents.
 from nltk.stem.wordnet import WordNetLemmatizer
2 changes: 1 addition & 1 deletion docs/src/auto_examples/tutorials/run_lda.py.md5
@@ -1 +1 @@
-0995a15406049093d95974700d471876
+802d286d0c620260af50bf5ef0e08253
13 changes: 10 additions & 3 deletions docs/src/check_gallery.py
@@ -20,13 +20,18 @@ def different(path1, path2):
 
 
 curr_dir = os.path.dirname(__file__)
+docs_dir = os.path.dirname(curr_dir)
+src_dir = os.path.dirname(docs_dir)
 stale = []
 for root, dirs, files in os.walk(os.path.join(curr_dir, 'gallery')):
     for f in files:
         if f.endswith('.py'):
             source_path = os.path.join(root, f)
             cache_path = source_path.replace('docs/src/gallery/', 'docs/src/auto_examples/')
 
+            rel_source_path = os.path.relpath(source_path, src_dir)
+            rel_cache_path = os.path.relpath(cache_path, src_dir)
+
             #
             # We check two things:
             #
@@ -40,7 +45,7 @@
             # but we run them both because it's trivial.
             #
             if different(source_path, cache_path):
-                stale.append(cache_path)
+                stale.append(f"{rel_source_path} != {rel_cache_path}")
                 continue
 
             actual_md5 = hashlib.md5()
@@ -52,9 +57,10 @@
                 expected_md5 = fin.read()
 
             if actual_md5.hexdigest() != expected_md5:
-                stale.append(cache_path)
+                stale.append(f"{rel_source_path} md5 != {rel_cache_path}.md5")
 
 if stale:
+    stale = '\n'.join(stale)
     print(f"""The gallery cache appears stale.
 
 Rebuild the documentation using the following commands from the gensim root subdirectory:
@@ -64,6 +70,7 @@
 
 and then run `git add docs/src/auto_examples` to update the cache.
 
-Stale files: {stale}
+Stale files:
+{stale}
 """, file=sys.stderr)
     sys.exit(1)
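For context, the staleness check above boils down to comparing a gallery script's md5 against the hash cached next to its prebuilt counterpart, roughly like this (paths are illustrative):

    import hashlib

    with open('docs/src/gallery/tutorials/run_lda.py', 'rb') as fin:
        actual = hashlib.md5(fin.read()).hexdigest()
    with open('docs/src/auto_examples/tutorials/run_lda.py.md5') as fin:
        expected = fin.read().strip()
    print('stale' if actual != expected else 'up to date')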
3 changes: 2 additions & 1 deletion docs/src/conf.py
@@ -254,10 +254,11 @@ def sort_key(source_dir):
     'run_word2vec.py',
     'run_doc2vec_lee.py',
     'run_fasttext.py',
+    'run_ensemblelda.py',
     'run_annoy.py',
     'run_lda.py',
     'run_wmd.py',
-    'run_summarization.py',
+    'run_scm.py',
 ]
 
 howto_order = [
11 changes: 9 additions & 2 deletions docs/src/gallery/howtos/run_doc2vec_imdb.py
@@ -100,13 +100,20 @@ def download_dataset(url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v
         return fname
 
     # Download the file to local storage first.
-    with smart_open.open(url, "rb", ignore_ext=True) as fin:
-        with smart_open.open(fname, 'wb', ignore_ext=True) as fout:
+    try:
+        kwargs = { 'compression': smart_open.compression.NO_COMPRESSION }
+        fin = smart_open.open(url, "rb", **kwargs)
+    except (AttributeError, TypeError):
+        kwargs = { 'ignore_ext': True }
+        fin = smart_open.open(url, "rb", **kwargs)
+    if fin:
+        with smart_open.open(fname, 'wb', **kwargs) as fout:
             while True:
                 buf = fin.read(io.DEFAULT_BUFFER_SIZE)
                 if not buf:
                     break
                 fout.write(buf)
+        fin.close()
 
     return fname
3 changes: 3 additions & 0 deletions docs/src/gallery/tutorials/run_ensemblelda.py
@@ -29,6 +29,9 @@
 from gensim.corpora import Dictionary
 from nltk.stem.wordnet import WordNetLemmatizer
 
+from nltk import download
+download('wordnet')
+
 lemmatizer = WordNetLemmatizer()
 docs = api.load('text8')
4 changes: 4 additions & 0 deletions docs/src/gallery/tutorials/run_lda.py
@@ -126,6 +126,10 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'
 # easy to read is very desirable in topic modelling.
 #
 
+# Download the WordNet data
+from nltk import download
+download('wordnet')
+
 # Lemmatize the documents.
 from nltk.stem.wordnet import WordNetLemmatizer
1 change: 1 addition & 0 deletions setup.py
@@ -323,6 +323,7 @@ def run(self):
     'memory_profiler',
     'annoy',
     'Pyro4',
+    'scikit-learn',
     'nltk',
     'testfixtures',
     'statsmodels',