Skip to content

Commit

Permalink
Merge branch 'main' into chore/add-contributing
Browse files Browse the repository at this point in the history
  • Loading branch information
Coniferish authored Dec 17, 2024
2 parents 1b7a767 + b5ff79d commit 5c01047
Show file tree
Hide file tree
Showing 856 changed files with 36,851 additions and 64,982 deletions.
7 changes: 5 additions & 2 deletions .github/actions/base-cache/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,25 @@ runs:
lookup-only: ${{ inputs.check-only }}
- name: Set up Python ${{ inputs.python-version }}
if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ inputs.python-version }}
- name: Setup virtual environment (no cache hit)
if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
shell: bash
run: |
python${{ inputs.python-version }} -m pip install --upgrade virtualenv
python${{ inputs.python-version }} -m venv .venv
if [ ! -d ".venv" ]; then
python${{ inputs.python-version }} -m venv .venv
fi
source .venv/bin/activate
[ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
if [ "${{ inputs.python-version == '3.12' }}" == "true" ]; then
python -m ensurepip --upgrade
python -m pip install --upgrade setuptools
fi
make install-ci
make install-nltk-models
- name: Save Cache
if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
id: virtualenv-cache-save
Expand Down
25 changes: 15 additions & 10 deletions .github/actions/base-ingest-cache/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,30 @@ runs:
path: |
.venv
nltk_data
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }}
lookup-only: ${{ inputs.check-only }}
- name: Restore base virtual environment
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
uses: ./.github/actions/base-cache
with:
python-version: ${{ inputs.python-version }}
- name: Set up Python ${{ inputs.python-version }}
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ inputs.python-version }}
- name: Setup virtual environment (no cache hit)
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
shell: bash
run: |
python${{ inputs.python-version }} -m venv .venv
python${{ inputs.python-version }} -m pip install --upgrade virtualenv
if [ ! -d ".venv" ]; then
python${{ inputs.python-version }} -m venv .venv
fi
source .venv/bin/activate
make install-all-ingest
if [ "${{ inputs.python-version == '3.12' }}" == "true" ]; then
python -m ensurepip --upgrade
python -m pip install --upgrade setuptools
fi
make install-ci
make install-nltk-models
make install-all-docs
make install-ingest
- name: Save Ingest Cache
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
id: ingest-virtualenv-cache-save
Expand All @@ -45,5 +50,5 @@ runs:
path: |
.venv
nltk_data
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }}

175 changes: 11 additions & 164 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@ permissions:
id-token: write
contents: read

env:
NLTK_DATA: ${{ github.workspace }}/nltk_data

jobs:
setup:
strategy:
matrix:
python-version: ["3.9","3.10","3.11", "3.12"]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/base-cache
Expand Down Expand Up @@ -72,15 +73,12 @@ jobs:
- name: Install all doc and test dependencies
run: |
make install-ci
make install-all-ingest
make check-licenses
lint:
strategy:
matrix:
python-version: ["3.9","3.10","3.11"]
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
runs-on: ubuntu-latest
needs: [setup, changelog]
steps:
Expand All @@ -96,6 +94,7 @@ jobs:
- name: Lint
run: |
source .venv/bin/activate
make install-ci
make check
shellcheck:
Expand Down Expand Up @@ -153,39 +152,6 @@ jobs:
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
make check-coverage
test_chipper:
strategy:
matrix:
python-version: ["3.10"]
runs-on: ubuntu-latest
env:
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
needs: [setup, lint]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Setup virtual environment
uses: ./.github/actions/base-cache
with:
python-version: ${{ matrix.python-version }}
- name: Test
env:
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
run: |
source .venv/bin/activate
make install-nltk-models
sudo apt-get update
sudo apt-get install -y poppler-utils
make install-pandoc install-test
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get update
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version
make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
test_unit_no_extras:
strategy:
matrix:
Expand All @@ -207,6 +173,7 @@ jobs:
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
run: |
source .venv/bin/activate
make install-ci
make install-nltk-models
make test-no-extras CI=true
Expand All @@ -217,8 +184,6 @@ jobs:
python-version: ["3.10"]
extra: ["csv", "docx", "odt", "markdown", "pypandoc", "pdf-image", "pptx", "xlsx"]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup, lint, test_unit_no_extras]
steps:
- uses: actions/checkout@v4
Expand Down Expand Up @@ -252,48 +217,21 @@ jobs:
sudo apt-get update
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version
make install-${{ matrix.extra }}
make test-extra-${{ matrix.extra }} CI=true
setup_ingest:
strategy:
matrix:
python-version: [ "3.9","3.10" ]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup]
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/base-ingest-cache
with:
python-version: ${{ matrix.python-version }}

test_ingest_unit:
strategy:
matrix:
python-version: [ "3.9","3.10" ]
runs-on: ubuntu-latest
needs: [ setup_ingest, lint ]
steps:
# actions/checkout MUST come before auth
- uses: 'actions/checkout@v4'
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Get full Python version
id: full-python-version
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT
- name: Setup virtual environment
uses: ./.github/actions/base-ingest-cache
with:
python-version: ${{ matrix.python-version }}
- name: Test Ingest (unit)
run: |
source .venv/bin/activate
make install-nltk-models
PYTHONPATH=. pytest test_unstructured_ingest/unit
check-only: 'true'

test_ingest_src:
strategy:
Expand Down Expand Up @@ -365,9 +303,9 @@ jobs:
MXBAI_API_KEY: ${{secrets.MXBAI_API_KEY}}
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
CI: "true"
PYTHON: python${{ matrix.python-version }}
run: |
source .venv/bin/activate
make install-nltk-models
sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
make install-pandoc
Expand All @@ -377,109 +315,17 @@ jobs:
sudo apt-get install -y tesseract-ocr-kor
sudo apt-get install diffstat
tesseract --version
make install-all-docs
make install-ingest
./test_unstructured_ingest/test-ingest-src.sh
test_ingest_dest:
environment: ci
strategy:
matrix:
python-version: ["3.9","3.10"]
runs-on: ubuntu-latest-m
needs: [setup_ingest, lint]
steps:
# actions/checkout MUST come before auth
- uses: 'actions/checkout@v4'
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Get full Python version
id: full-python-version
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT
- name: Setup virtual environment
uses: ./.github/actions/base-ingest-cache
with:
python-version: ${{ matrix.python-version }}
- name: Setup docker-compose
uses: KengoTODA/actions-setup-docker-compose@v1
with:
version: '2.22.0'
- name: Test (end-to-end)
env:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
S3_INGEST_TEST_ACCESS_KEY: ${{ secrets.S3_INGEST_TEST_ACCESS_KEY }}
S3_INGEST_TEST_SECRET_KEY: ${{ secrets.S3_INGEST_TEST_SECRET_KEY }}
AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }}
DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
MONGODB_URI: ${{ secrets.MONGODB_URI }}
MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }}
AZURE_DEST_CONNECTION_STR: ${{ secrets.AZURE_DEST_CONNECTION_STR }}
PINECONE_API_KEY: ${{secrets.PINECONE_API_KEY}}
VECTARA_OAUTH_CLIENT_ID: ${{secrets.VECTARA_OAUTH_CLIENT_ID}}
VECTARA_OAUTH_SECRET: ${{secrets.VECTARA_OAUTH_SECRET}}
VECTARA_CUSTOMER_ID: ${{secrets.VECTARA_CUSTOMER_ID}}
ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}}
ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}}
CLARIFAI_API_KEY: ${{secrets.CLARIFAI_API_KEY}}
DATABRICKS_HOST: ${{secrets.DATABRICKS_HOST}}
DATABRICKS_USERNAME: ${{secrets.DATABRICKS_USERNAME}}
DATABRICKS_PASSWORD: ${{secrets.DATABRICKS_PASSWORD}}
DATABRICKS_CATALOG: ${{secrets.DATABRICKS_CATALOG}}
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
CI: "true"
run: |
source .venv/bin/activate
make install-nltk-models
sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
make install-pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get update
sudo apt-get install -y tesseract-ocr
sudo apt-get install -y tesseract-ocr-kor
sudo apt-get install diffstat
tesseract --version
./test_unstructured_ingest/test-ingest-dest.sh
test_ingest_help:
environment: ci
strategy:
matrix:
python-version: ["3.9","3.10","3.11", "3.12"]
runs-on: ubuntu-latest
needs: [setup_ingest, lint]
steps:
- uses: 'actions/checkout@v4'
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Setup virtual environment
uses: ./.github/actions/base-ingest-cache
with:
python-version: ${{ matrix.python-version }}
- name: Validate --help
run: |
source .venv/bin/activate
./test_unstructured_ingest/test-help.sh
test_unstructured_api_unit:
strategy:
matrix:
# NOTE(yuming): Unstructured API only use Python 3.10
python-version: ["3.10"]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup, lint]
steps:
- uses: actions/checkout@v4
Expand Down Expand Up @@ -552,3 +398,4 @@ jobs:
image: "unstructured:dev"
severity-cutoff: critical
only-fixed: true
output-format: table
5 changes: 4 additions & 1 deletion .github/workflows/ingest-test-fixtures-update-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
# actions/checkout MUST come before auth
- uses: 'actions/checkout@v4'
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Get full Python version
Expand Down Expand Up @@ -94,6 +94,8 @@ jobs:
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
OCTOAI_API_KEY: ${{ secrets.OCTOAI_API_KEY }}
ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}}
ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}}
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
OVERWRITE_FIXTURES: "true"
CI: "true"
Expand All @@ -105,6 +107,7 @@ jobs:
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get install -y tesseract-ocr
sudo apt-get install -y tesseract-ocr-kor
sudo apt-get install diffstat
tesseract --version
./test_unstructured_ingest/test-ingest-src.sh
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ tags
# Persistent undo
[._]*.un~

.DS_Store
*.DS_Store

# Ruff cache
.ruff_cache/
Expand Down
2 changes: 2 additions & 0 deletions .grype.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ignore:
- vulnerability: CVE-2024-11053
Loading

0 comments on commit 5c01047

Please sign in to comment.