[DO NOT MERGE] Feat: add support for start_index
in html links
extraction
#11781
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: CI | |
on: | |
push: | |
branches: [ main ] | |
pull_request: | |
branches: [ main ] | |
merge_group: | |
branches: [ main ] | |
permissions: | |
id-token: write | |
contents: read | |
jobs: | |
setup: | |
strategy: | |
matrix: | |
python-version: ["3.9","3.10","3.11"] | |
runs-on: ubuntu-latest | |
env: | |
NLTK_DATA: ${{ github.workspace }}/nltk_data | |
steps: | |
- uses: actions/checkout@v3 | |
- uses: ./.github/actions/base-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
check-only: 'true' | |
check-deps: | |
strategy: | |
matrix: | |
python-version: ["3.9","3.10","3.11"] | |
runs-on: ubuntu-latest | |
steps: | |
- uses: actions/checkout@v3 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v4 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Check for dependency conflicts | |
run: make check-deps | |
check-extras: | |
strategy: | |
matrix: | |
python-version: [ "3.9","3.10","3.11" ] | |
runs-on: ubuntu-latest | |
steps: | |
- uses: actions/checkout@v3 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v4 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Install all extras | |
run: make check-extras | |
lint: | |
strategy: | |
matrix: | |
python-version: ["3.9","3.10","3.11"] | |
env: | |
NLTK_DATA: ${{ github.workspace }}/nltk_data | |
runs-on: ubuntu-latest | |
needs: [setup, changelog] | |
steps: | |
- uses: actions/checkout@v3 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v4 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Setup virtual environment | |
uses: ./.github/actions/base-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Lint | |
run: | | |
source .venv/bin/activate | |
make check | |
shellcheck: | |
runs-on: ubuntu-latest | |
steps: | |
- uses: actions/checkout@v3 | |
- name: ShellCheck | |
uses: ludeeus/action-shellcheck@master | |
shfmt: | |
runs-on: ubuntu-latest | |
steps: | |
- uses: actions/checkout@v3 | |
- name: setup shfmt | |
uses: mfinelli/setup-shfmt@v3 | |
- name: Run shfmt | |
run: shfmt -i 2 -d . | |
test_unit: | |
strategy: | |
matrix: | |
python-version: ["3.9","3.10","3.11"] | |
runs-on: ubuntu-latest | |
env: | |
NLTK_DATA: ${{ github.workspace }}/nltk_data | |
needs: [setup, lint] | |
steps: | |
- uses: actions/checkout@v3 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v4 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Setup virtual environment | |
uses: ./.github/actions/base-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Test | |
env: | |
UNS_API_KEY: ${{ secrets.UNS_API_KEY }} | |
TESSERACT_VERSION : "5.3.4" | |
run: | | |
source .venv/bin/activate | |
sudo apt-get update | |
sudo apt-get install -y libmagic-dev poppler-utils libreoffice | |
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 | |
sudo apt-get update | |
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor | |
tesseract --version | |
installed_tesseract_version=$(tesseract --version | grep -oP '(?<=tesseract )\d+\.\d+\.\d+') | |
if [ "$installed_tesseract_version" != "${{env.TESSERACT_VERSION}}" ]; then | |
echo "Tesseract version ${{env.TESSERACT_VERSION}} is required but found version $installed_tesseract_version" | |
exit 1 | |
fi | |
# FIXME (yao): sometimes there is cache but we still miss argilla in the env; so we add make install-ci again | |
make install-ci | |
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true | |
make check-coverage | |
test_chipper: | |
strategy: | |
matrix: | |
python-version: ["3.10"] | |
runs-on: ubuntu-latest | |
env: | |
NLTK_DATA: ${{ github.workspace }}/nltk_data | |
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
needs: [setup, lint] | |
steps: | |
- uses: actions/checkout@v3 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v4 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Setup virtual environment | |
uses: ./.github/actions/base-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Test | |
env: | |
UNS_API_KEY: ${{ secrets.UNS_API_KEY }} | |
run: | | |
source .venv/bin/activate | |
sudo apt-get update | |
sudo apt-get install -y poppler-utils | |
make install-pandoc | |
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 | |
sudo apt-get update | |
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor | |
tesseract --version | |
make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true | |
test_unit_no_extras: | |
strategy: | |
matrix: | |
python-version: ["3.10"] | |
runs-on: ubuntu-latest | |
env: | |
NLTK_DATA: ${{ github.workspace }}/nltk_data | |
needs: [setup, lint] | |
steps: | |
- uses: actions/checkout@v3 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v4 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Setup virtual environment | |
uses: ./.github/actions/base-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Test | |
env: | |
UNS_API_KEY: ${{ secrets.UNS_API_KEY }} | |
run: | | |
source .venv/bin/activate | |
make test-no-extras CI=true | |
test_unit_dependency_extras: | |
# NOTE(newelh) - Split extras into separate steps in the same pipeline (avoid using matrix) | |
strategy: | |
matrix: | |
python-version: ["3.10"] | |
extra: ["csv", "docx", "odt", "markdown", "pypandoc", "msg", "pdf-image", "pptx", "xlsx"] | |
runs-on: ubuntu-latest | |
env: | |
NLTK_DATA: ${{ github.workspace }}/nltk_data | |
needs: [setup, lint, test_unit_no_extras] | |
steps: | |
- uses: actions/checkout@v3 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v4 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- uses: actions/cache/restore@v3 | |
id: virtualenv-cache | |
with: | |
path: | | |
nltk_data | |
key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }} | |
- name: Setup virtual environment | |
run: | | |
python${{ matrix.python-version}} -m venv .venv-${{ matrix.extra }} | |
source .venv-${{ matrix.extra }}/bin/activate | |
make install-base-ci | |
make install-${{ matrix.extra }} | |
- name: Test | |
env: | |
UNS_API_KEY: ${{ secrets.UNS_API_KEY }} | |
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
run: | | |
source .venv-${{ matrix.extra }}/bin/activate | |
# NOTE(newelh) - determine what needs to be installed here | |
sudo apt-get update | |
sudo apt-get install -y libmagic-dev poppler-utils libreoffice | |
make install-pandoc | |
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 | |
sudo apt-get update | |
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor | |
tesseract --version | |
make test-extra-${{ matrix.extra }} CI=true | |
setup_ingest: | |
strategy: | |
matrix: | |
python-version: [ "3.9","3.10" ] | |
runs-on: ubuntu-latest | |
env: | |
NLTK_DATA: ${{ github.workspace }}/nltk_data | |
needs: [setup] | |
steps: | |
- uses: actions/checkout@v3 | |
- uses: ./.github/actions/base-ingest-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
test_ingest_unit: | |
strategy: | |
matrix: | |
python-version: [ "3.9","3.10" ] | |
runs-on: ubuntu-latest | |
needs: [ setup_ingest, lint ] | |
steps: | |
# actions/checkout MUST come before auth | |
- uses: 'actions/checkout@v4' | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v4 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Get full Python version | |
id: full-python-version | |
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT | |
- name: Setup virtual environment | |
uses: ./.github/actions/base-ingest-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Test Ingest (unit) | |
run: | | |
source .venv/bin/activate | |
PYTHONPATH=. pytest test_unstructured_ingest/unit | |
test_ingest_src: | |
strategy: | |
matrix: | |
python-version: ["3.9","3.10"] | |
runs-on: ubuntu-latest-m | |
env: | |
NLTK_DATA: ${{ github.workspace }}/nltk_data | |
needs: [setup_ingest, lint] | |
steps: | |
# actions/checkout MUST come before auth | |
- uses: 'actions/checkout@v4' | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v4 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Get full Python version | |
id: full-python-version | |
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT | |
- name: Setup virtual environment | |
uses: ./.github/actions/base-ingest-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Setup docker-compose | |
uses: KengoTODA/actions-setup-docker-compose@v1 | |
with: | |
version: '2.22.0' | |
- name: Test (end-to-end) | |
env: | |
AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }} | |
BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }} | |
CONFLUENCE_API_TOKEN: ${{ secrets.CONFLUENCE_API_TOKEN }} | |
CONFLUENCE_USER_EMAIL: ${{ secrets.CONFLUENCE_USER_EMAIL }} | |
DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }} | |
DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }} | |
DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }} | |
DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }} | |
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} | |
GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }} | |
HUBSPOT_API_TOKEN: ${{ secrets.HUBSPOT_API_TOKEN }} | |
JIRA_INGEST_API_TOKEN: ${{ secrets.JIRA_INGEST_API_TOKEN }} | |
JIRA_INGEST_USER_EMAIL: ${{ secrets.JIRA_INGEST_USER_EMAIL }} | |
MONGODB_URI: ${{ secrets.MONGODB_URI }} | |
MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }} | |
MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }} | |
MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }} | |
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }} | |
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }} | |
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }} | |
SALESFORCE_USERNAME: ${{secrets.SALESFORCE_USERNAME}} | |
SALESFORCE_CONSUMER_KEY: ${{secrets.SALESFORCE_CONSUMER_KEY}} | |
SALESFORCE_PRIVATE_KEY: ${{secrets.SALESFORCE_PRIVATE_KEY}} | |
SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}} | |
SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}} | |
SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}} | |
SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}} | |
SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}} | |
SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}} | |
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} | |
UNS_API_KEY: ${{ secrets.UNS_API_KEY }} | |
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }} | |
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} | |
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} | |
AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }} | |
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }} | |
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} | |
PINECONE_API_KEY: ${{secrets.PINECONE_API_KEY}} | |
TABLE_OCR: "tesseract" | |
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" | |
CI: "true" | |
run: | | |
source .venv/bin/activate | |
sudo apt-get update | |
sudo apt-get install -y libmagic-dev poppler-utils libreoffice | |
make install-pandoc | |
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 | |
sudo apt-get update | |
sudo apt-get install -y tesseract-ocr | |
sudo apt-get install -y tesseract-ocr-kor | |
sudo apt-get install diffstat | |
tesseract --version | |
./test_unstructured_ingest/test-ingest-src.sh | |
test_ingest_dest: | |
environment: ci | |
strategy: | |
matrix: | |
python-version: ["3.9","3.10"] | |
runs-on: ubuntu-latest-m | |
env: | |
NLTK_DATA: ${{ github.workspace }}/nltk_data | |
needs: [setup_ingest, lint] | |
steps: | |
# actions/checkout MUST come before auth | |
- uses: 'actions/checkout@v4' | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v4 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Get full Python version | |
id: full-python-version | |
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT | |
- name: Setup virtual environment | |
uses: ./.github/actions/base-ingest-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Setup docker-compose | |
uses: KengoTODA/actions-setup-docker-compose@v1 | |
with: | |
version: '2.22.0' | |
- name: Test (end-to-end) | |
env: | |
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} | |
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} | |
AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }} | |
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }} | |
BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }} | |
DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }} | |
DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }} | |
DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }} | |
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} | |
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} | |
MONGODB_URI: ${{ secrets.MONGODB_URI }} | |
MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }} | |
AZURE_DEST_CONNECTION_STR: ${{ secrets.AZURE_DEST_CONNECTION_STR }} | |
PINECONE_API_KEY: ${{secrets.PINECONE_API_KEY}} | |
VECTARA_OAUTH_CLIENT_ID: ${{secrets.VECTARA_OAUTH_CLIENT_ID}} | |
VECTARA_OAUTH_SECRET: ${{secrets.VECTARA_OAUTH_SECRET}} | |
VECTARA_CUSTOMER_ID: ${{secrets.VECTARA_CUSTOMER_ID}} | |
ASTRA_DB_TOKEN: ${{secrets.ASTRA_DB_TOKEN}} | |
ASTRA_DB_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}} | |
CLARIFAI_API_KEY: ${{secrets.CLARIFAI_API_KEY}} | |
TABLE_OCR: "tesseract" | |
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" | |
CI: "true" | |
run: | | |
source .venv/bin/activate | |
sudo apt-get update | |
sudo apt-get install -y libmagic-dev poppler-utils libreoffice | |
make install-pandoc | |
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 | |
sudo apt-get update | |
sudo apt-get install -y tesseract-ocr | |
sudo apt-get install -y tesseract-ocr-kor | |
sudo apt-get install diffstat | |
tesseract --version | |
./test_unstructured_ingest/test-ingest-dest.sh | |
test_unstructured_api_unit: | |
strategy: | |
matrix: | |
# NOTE(yuming): Unstructured API only use Python 3.10 | |
python-version: ["3.10"] | |
runs-on: ubuntu-latest | |
env: | |
NLTK_DATA: ${{ github.workspace }}/nltk_data | |
needs: [setup, lint] | |
steps: | |
- uses: actions/checkout@v3 | |
- name: Setup virtual environment | |
uses: ./.github/actions/base-cache | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Set up flag for running Unstructured API unit tests | |
run: | | |
# NOTE: Change env `SKIP_API_UNIT_FOR_BREAKING_CHANGE` to true if there is a breaking change in Unstructured repo that will break unstructured api unit tests | |
# TODO: Change env back to false once API unit tests is in sync with unstructured repo | |
echo "SKIP_API_UNIT_FOR_BREAKING_CHANGE=true" >> $GITHUB_ENV | |
- name: Set up Python ${{ matrix.python-version }} | |
if: env.SKIP_API_UNIT_FOR_BREAKING_CHANGE == 'false' | |
uses: actions/setup-python@v4 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- name: Setup virtual environment (no cache hit) | |
if: steps.virtualenv-cache.outputs.cache-hit != 'true' && env.SKIP_API_UNIT_FOR_BREAKING_CHANGE == 'false' | |
run: | | |
python${{ matrix.python-version}} -m venv .venv | |
source .venv/bin/activate | |
mkdir "$NLTK_DATA" | |
make install-ci | |
- name: Test Unstructured API Unit | |
if: env.SKIP_API_UNIT_FOR_BREAKING_CHANGE == 'false' | |
run: | | |
source .venv/bin/activate | |
# FIXME (yao): sometimes there is cache but we still miss argilla in the env; so we add make install-ci again | |
make install-ci | |
sudo apt-get update && sudo apt-get install --yes poppler-utils libreoffice | |
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 | |
sudo apt-get update | |
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor | |
tesseract --version | |
make install-nltk-models | |
make test-unstructured-api-unit | |
changelog: | |
runs-on: ubuntu-latest | |
steps: | |
# need to checkout otherwise paths-filter will fail on merge-queue trigger | |
- uses: actions/checkout@v3 | |
- if: github.ref != 'refs/heads/main' | |
uses: dorny/paths-filter@v2 | |
id: changes | |
with: | |
filters: | | |
src: | |
- 'unstructured/**' | |
- if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main' | |
uses: dangoslen/changelog-enforcer@v3 | |
# TODO - figure out best practice for caching docker images | |
# (Using the virtualenv to get pytest) | |
test_dockerfile: | |
runs-on: ubuntu-latest-m | |
needs: [ setup, lint ] | |
steps: | |
- uses: actions/checkout@v3 | |
- name: Test Dockerfile | |
run: | | |
echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file | |
make docker-build | |
make docker-test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true |