Skip to content

Commit

Permalink
Roman/optimize ingest ci (#1799)
Browse files Browse the repository at this point in the history
### Description
Currently the CI caches the CI dependencies but uses the hash of all
files in `requirements/`. This isn't completely accurate since the
ingest dependencies are installed in a later step and don't affect the
cached environment. As part of this PR:
* ingest dependencies were isolated into their own folder in
`requirements/ingest/`
* A new cache setup was introduced in the CI to restore the base cache
-> install ingest dependencies -> cache it with a new id
* new make target created to install all ingest dependencies via `pip
install -r ...`
* updates to Dockerfile to use `find ...` to install all dependencies,
avoiding the need to update this when new deps are added.
* update to pip-compile script to run over all `*.in` files in
`requirements/`
  • Loading branch information
rbiseck3 authored Oct 24, 2023
1 parent 37e8413 commit 4802332
Show file tree
Hide file tree
Showing 82 changed files with 501 additions and 418 deletions.
79 changes: 50 additions & 29 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -229,14 +229,56 @@ jobs:
tesseract --version
make test-extra-${{ matrix.extra }} CI=true
# Builds the ingest environment on top of the base environment cached by the
# `setup` job, then saves the result under a separate `unstructured-ingest-*`
# cache key so downstream jobs can restore it directly.
setup_ingest:
  strategy:
    matrix:
      python-version: ["3.8", "3.9", "3.10", "3.11"]
  runs-on: ubuntu-latest
  env:
    NLTK_DATA: ${{ github.workspace }}/nltk_data
  needs: [setup]
  steps:
    - uses: actions/checkout@v3
    # Restore-only step: pulls the base .venv / nltk_data without re-saving it.
    - uses: actions/cache/restore@v3
      id: base-virtualenv-cache
      with:
        path: |
          .venv
          nltk_data
        key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
    # Due to the dependency on setup, the cache should exist before this is run. Set failed if it wasn't found.
    - name: Setup base virtual environment (no cache hit)
      if: steps.base-virtualenv-cache.outputs.cache-hit != 'true'
      uses: actions/github-script@v3
      with:
        # The message contains an apostrophe, so the JavaScript literal must be
        # double-quoted; a single-quoted literal ends at "couldn" and the
        # script fails with a SyntaxError instead of the intended message.
        script: |
          core.setFailed("base cached environment couldn't be found")
    # Restores the ingest environment when it exists (replacing the base
    # paths above) and saves it at the end of the job on a miss.
    # NOTE(review): this key hashes only requirements/ingest/*.txt, so a change
    # to the base requirements alone does not invalidate the ingest cache.
    # Consider adding requirements/*.txt to the hash here and in every job that
    # restores this key.
    - uses: actions/cache@v3
      id: virtualenv-cache
      with:
        path: |
          .venv
          nltk_data
        key: unstructured-ingest-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/ingest/*.txt') }}
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
    # Only install when the ingest cache missed; on a hit the restored .venv
    # already contains the ingest dependencies.
    - name: Setup virtual environment (no cache hit)
      if: steps.virtualenv-cache.outputs.cache-hit != 'true'
      run: |
        python${{ matrix.python-version }} -m venv .venv
        source .venv/bin/activate
        make install-all-ingest
test_ingest:
strategy:
matrix:
python-version: ["3.8","3.9","3.10","3.11"]
runs-on: ubuntu-latest-m
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup, lint]
needs: [setup_ingest, lint]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
Expand All @@ -252,14 +294,14 @@ jobs:
path: |
.venv
nltk_data
key: unstructured-${{ runner.os }}-${{ steps.full-python-version.outputs.version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
- name: Setup virtual environment (no cache hit)
key: unstructured-ingest-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/ingest/*.txt') }}
# Due to the dependency on setup_ingest, the cache should exist before this is run. Set failed if it wasn't found.
- name: Setup base virtual environment (no cache hit)
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
run: |
python${{ matrix.python-version}} -m venv .venv
source .venv/bin/activate
mkdir "$NLTK_DATA"
make install-ci
uses: actions/github-script@v3
with:
  # The message contains an apostrophe, so the JavaScript literal must be
  # double-quoted; a single-quoted literal is cut short and the script
  # fails with a SyntaxError instead of the intended message.
  script: |
    core.setFailed("cached environment couldn't be found")
- name: Setup docker-compose
uses: KengoTODA/actions-setup-docker-compose@v1
with:
Expand Down Expand Up @@ -316,27 +358,6 @@ jobs:
sudo apt-get install -y tesseract-ocr-kor
sudo apt-get install diffstat
tesseract --version
make install-ingest-s3
make install-ingest-airtable
make install-ingest-azure
make install-ingest-box
make install-ingest-confluence
make install-ingest-discord
make install-ingest-elasticsearch
make install-ingest-dropbox
make install-ingest-gcs
make install-ingest-google-drive
make install-ingest-github
make install-ingest-gitlab
make install-ingest-jira
make install-ingest-onedrive
make install-ingest-outlook
make install-ingest-salesforce
make install-ingest-slack
make install-ingest-wikipedia
make install-ingest-notion
make install-ingest-delta-table
pip install "unstructured[openai]"
./test_unstructured_ingest/test-ingest.sh
test_unstructured_api_unit:
Expand Down
75 changes: 48 additions & 27 deletions .github/workflows/ingest-test-fixtures-update-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,26 +36,68 @@ jobs:
[ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
make install-ci
update-fixtures-and-pr:
runs-on: ubuntu-latest-m
setup_ingest:
strategy:
matrix:
python-version: [ "3.8","3.9","3.10","3.11" ]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup]
steps:
- uses: actions/checkout@v3
- uses: actions/cache/restore@v3
id: base-virtualenv-cache
with:
path: |
.venv
nltk_data
key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
# Due to the dependency on setup, the cache should exist before this is run. Set failed if it wasn't found.
- name: Setup base virtual environment (no cache hit)
  if: steps.base-virtualenv-cache.outputs.cache-hit != 'true'
  uses: actions/github-script@v3
  with:
    # The message contains an apostrophe, so the JavaScript literal must be
    # double-quoted; a single-quoted literal is cut short and the script
    # fails with a SyntaxError instead of the intended message.
    script: |
      core.setFailed("base cached environment couldn't be found")
- uses: actions/cache@v3
id: virtualenv-cache
with:
path: |
.venv
nltk_data
key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
key: unstructured-ingest-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/ingest/*.txt') }}
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Setup virtual environment (no cache hit)
run: |
python${{ env.PYTHON_VERSION }} -m venv .venv
python${{ matrix.python-version }} -m venv .venv
source .venv/bin/activate
mkdir "$NLTK_DATA"
make install-ci
make install-all-ingest
update-fixtures-and-pr:
runs-on: ubuntu-latest-m
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup_ingest]
steps:
- uses: actions/checkout@v3
# Restore the ingest environment saved by setup_ingest.
- uses: actions/cache/restore@v3
  id: virtualenv-cache
  with:
    path: |
      .venv
      nltk_data
    # This job declares no matrix, so `matrix.python-version` expands to an
    # empty string and the key can never match what setup_ingest saved.
    # NOTE(review): assumes PYTHON_VERSION is set in the workflow-level env
    # (the previous revision of this job keyed on it) and equals one of the
    # setup_ingest matrix versions - confirm.
    key: unstructured-ingest-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/ingest/*.txt') }}
# Due to the dependency on setup_ingest, the cache should exist before this is run. Set failed if it wasn't found.
- name: Setup base virtual environment (no cache hit)
  if: steps.virtualenv-cache.outputs.cache-hit != 'true'
  uses: actions/github-script@v3
  with:
    # The message contains an apostrophe, so the JavaScript literal must be
    # double-quoted; a single-quoted literal is cut short and the script
    # fails with a SyntaxError instead of the intended message.
    script: |
      core.setFailed("cached environment couldn't be found")
- name: Setup docker-compose
uses: KengoTODA/actions-setup-docker-compose@v1
with:
Expand Down Expand Up @@ -108,27 +150,6 @@ jobs:
sudo apt-get install -y tesseract-ocr
sudo apt-get install -y tesseract-ocr-kor
tesseract --version
make install-ingest-s3
make install-ingest-azure
make install-ingest-airtable
make install-ingest-box
make install-ingest-confluence
make install-ingest-discord
make install-ingest-elasticsearch
make install-ingest-dropbox
make install-ingest-gcs
make install-ingest-google-drive
make install-ingest-github
make install-ingest-gitlab
make install-ingest-jira
make install-ingest-onedrive
make install-ingest-outlook
make install-ingest-salesforce
make install-ingest-slack
make install-ingest-wikipedia
make install-ingest-notion
make install-ingest-delta-table
pip install "unstructured[openai]"
./test_unstructured_ingest/test-ingest.sh
- name: Save branch name to environment file
Expand Down
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.10.26-dev2
## 0.10.26-dev3

### Enhancements

Expand Down
31 changes: 1 addition & 30 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,36 +24,7 @@ COPY requirements requirements

RUN python3.10 -m pip install pip==${PIP_VERSION} && \
dnf -y groupinstall "Development Tools" && \
pip install --no-cache -r requirements/base.txt && \
pip install --no-cache -r requirements/test.txt && \
pip install --no-cache -r requirements/huggingface.txt && \
pip install --no-cache -r requirements/dev.txt && \
pip install --no-cache -r requirements/ingest-box.txt && \
pip install --no-cache -r requirements/ingest-confluence.txt && \
pip install --no-cache -r requirements/ingest-discord.txt && \
pip install --no-cache -r requirements/ingest-dropbox.txt && \
pip install --no-cache -r requirements/ingest-elasticsearch.txt && \
pip install --no-cache -r requirements/ingest-gcs.txt && \
pip install --no-cache -r requirements/ingest-github.txt && \
pip install --no-cache -r requirements/ingest-gitlab.txt && \
pip install --no-cache -r requirements/ingest-google-drive.txt && \
pip install --no-cache -r requirements/ingest-notion.txt && \
pip install --no-cache -r requirements/ingest-onedrive.txt && \
pip install --no-cache -r requirements/ingest-outlook.txt && \
pip install --no-cache -r requirements/ingest-reddit.txt && \
pip install --no-cache -r requirements/ingest-s3.txt && \
pip install --no-cache -r requirements/ingest-slack.txt && \
pip install --no-cache -r requirements/ingest-wikipedia.txt && \
pip install --no-cache -r requirements/extra-csv.txt && \
pip install --no-cache -r requirements/extra-docx.txt && \
pip install --no-cache -r requirements/extra-epub.txt && \
pip install --no-cache -r requirements/extra-markdown.txt && \
pip install --no-cache -r requirements/extra-msg.txt && \
pip install --no-cache -r requirements/extra-odt.txt && \
pip install --no-cache -r requirements/extra-pandoc.txt && \
pip install --no-cache -r requirements/extra-pdf-image.txt && \
pip install --no-cache -r requirements/extra-pptx.txt && \
pip install --no-cache -r requirements/extra-xlsx.txt && \
find requirements/ -type f -name "*.txt" -exec python3 -m pip install --no-cache -r '{}' ';' && \
dnf -y groupremove "Development Tools" && \
dnf clean all

Expand Down
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,11 @@ install-xlsx:
.PHONY: install-all-docs
install-all-docs: install-base install-csv install-docx install-epub install-odt install-pypandoc install-markdown install-msg install-pdf-image install-pptx install-xlsx

# Install every pinned requirements file found under requirements/ingest,
# one `pip install -r <file>` invocation per file.
# The file list is piped through xargs rather than `find -exec ... ';'`
# because find does not reflect a failing -exec command in its own exit
# status; xargs exits non-zero when any invocation fails, so a broken
# install now fails the target.
.PHONY: install-all-ingest
install-all-ingest:
	find requirements/ingest -type f -name "*.txt" -print0 | xargs -0 -n 1 python3 -m pip install -r


.PHONY: install-ingest-google-drive
install-ingest-google-drive:
python3 -m pip install -r requirements/ingest-google-drive.txt
Expand Down
3 changes: 0 additions & 3 deletions requirements/ingest-airtable.in

This file was deleted.

3 changes: 0 additions & 3 deletions requirements/ingest-azure-cognitive-search.in

This file was deleted.

4 changes: 0 additions & 4 deletions requirements/ingest-azure.in

This file was deleted.

5 changes: 0 additions & 5 deletions requirements/ingest-bedrock.in

This file was deleted.

3 changes: 0 additions & 3 deletions requirements/ingest-biomed.in

This file was deleted.

4 changes: 0 additions & 4 deletions requirements/ingest-box.in

This file was deleted.

3 changes: 0 additions & 3 deletions requirements/ingest-confluence.in

This file was deleted.

4 changes: 0 additions & 4 deletions requirements/ingest-delta-table.in

This file was deleted.

3 changes: 0 additions & 3 deletions requirements/ingest-discord.in

This file was deleted.

4 changes: 0 additions & 4 deletions requirements/ingest-dropbox.in

This file was deleted.

4 changes: 0 additions & 4 deletions requirements/ingest-elasticsearch.in

This file was deleted.

5 changes: 0 additions & 5 deletions requirements/ingest-gcs.in

This file was deleted.

3 changes: 0 additions & 3 deletions requirements/ingest-gitlab.in

This file was deleted.

3 changes: 0 additions & 3 deletions requirements/ingest-google-drive.in

This file was deleted.

3 changes: 0 additions & 3 deletions requirements/ingest-jira.in

This file was deleted.

4 changes: 0 additions & 4 deletions requirements/ingest-notion.in

This file was deleted.

5 changes: 0 additions & 5 deletions requirements/ingest-openai.in

This file was deleted.

3 changes: 0 additions & 3 deletions requirements/ingest-reddit.in

This file was deleted.

4 changes: 0 additions & 4 deletions requirements/ingest-s3.in

This file was deleted.

3 changes: 0 additions & 3 deletions requirements/ingest-salesforce.in

This file was deleted.

3 changes: 0 additions & 3 deletions requirements/ingest-slack.in

This file was deleted.

3 changes: 0 additions & 3 deletions requirements/ingest-wikipedia.in

This file was deleted.

3 changes: 3 additions & 0 deletions requirements/ingest/airtable.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-c ../constraints.in
-c ../base.txt
pyairtable
Loading

0 comments on commit 4802332

Please sign in to comment.