Skip to content

Removes changes on cli interfaces Dict() type #7228

Removes changes on cli interfaces Dict() type

Removes changes on cli interfaces Dict() type #7228

name: Ingest Test Fixtures Update PR
on:
push:
workflow_dispatch:
env:
GHA_CACHE_KEY_VERSION: "v1"
PYTHON_VERSION: "3.10"
permissions:
id-token: write
contents: read
jobs:
test_logins:
runs-on: ubuntu-latest
environment: ci
steps:
- uses: 'actions/checkout@v4'
- name: 'Google Cloud Auth'
uses: 'google-github-actions/auth@v1'
id: gauth
with:
workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }}
- name: 'Set up Cloud SDK'
uses: 'google-github-actions/setup-gcloud@v1'
- name: 'run gcloud command'
run: |-
gcloud projects list
- name: 'Az CLI login'
uses: azure/login@v1
with:
client-id: ${{ secrets.AZURE_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
- name: 'azure test command'
run: |-
az account show
setup:
runs-on: ubuntu-latest-m
if: |
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'push' && contains(github.event.head_commit.message, 'ingest-test-fixtures-update'))
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v4
with:
python-version: ${{ env.PYTHON_VERSION }}
- uses: actions/cache@v3
id: virtualenv-cache
with:
path: |
.venv
nltk_data
key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
- name: Setup virtual environment (no cache hit)
run: |
python${{ env.PYTHON_VERSION }} -m venv .venv
source .venv/bin/activate
[ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
make install-ci
setup_ingest:
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup]
steps:
- uses: actions/checkout@v3
- uses: actions/cache/restore@v3
id: base-virtualenv-cache
with:
path: |
.venv
nltk_data
key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
# Due to the dependency on setup, the cache should exist before this is ran. Set failed if it wasn't found.
- name: Setup base virtual environment (no cache hit)
if: steps.base-virtualenv-cache.outputs.cache-hit != 'true'
uses: actions/github-script@v3
with:
script: |
core.setFailed("base cached environment couldn't be found")
- uses: actions/cache@v3
id: virtualenv-cache
with:
path: |
.venv
nltk_data
key: unstructured-ingest-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/ingest/*.txt') }}
- name: Set up Python ${{ env.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ env.python-version }}
- name: Setup virtual environment (no cache hit)
run: |
python${{ env.python-version }} -m venv .venv
source .venv/bin/activate
make install-all-ingest
update-fixtures-and-pr:
environment: ci
runs-on: ubuntu-latest-m
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup_ingest]
steps:
# actions/checkout MUST come before auth
- uses: 'actions/checkout@v4'
- name: 'Google Cloud Auth'
uses: 'google-github-actions/auth@v1'
with:
workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }}
create_credentials_file: true
- name: 'Set up Cloud SDK'
uses: 'google-github-actions/setup-gcloud@v1'
- name: 'Az CLI login'
uses: azure/login@v1
with:
client-id: ${{ secrets.AZURE_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Get full Python version
id: full-python-version
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT
- uses: actions/cache/restore@v3
id: virtualenv-cache
with:
path: |
.venv
nltk_data
key: unstructured-ingest-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/ingest/*.txt') }}
# Due to the dependency on setup_ingest, the cache should exist before this is ran. Set failed if it wasn't found.
- name: Setup base virtual environment (no cache hit)
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
uses: actions/github-script@v3
with:
script: |
core.setFailed("cached environment couldn't be found")
- name: Setup docker-compose
uses: KengoTODA/actions-setup-docker-compose@v1
with:
version: '2.22.0'
- name: Update test fixtures
env:
AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }}
BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }}
CONFLUENCE_API_TOKEN: ${{ secrets.CONFLUENCE_API_TOKEN }}
CONFLUENCE_USER_EMAIL: ${{ secrets.CONFLUENCE_USER_EMAIL }}
DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
JIRA_INGEST_API_TOKEN: ${{ secrets.JIRA_INGEST_API_TOKEN }}
JIRA_INGEST_USER_EMAIL: ${{ secrets.JIRA_INGEST_USER_EMAIL }}
MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
SALESFORCE_USERNAME: ${{secrets.SALESFORCE_USERNAME}}
SALESFORCE_CONSUMER_KEY: ${{secrets.SALESFORCE_CONSUMER_KEY}}
SALESFORCE_PRIVATE_KEY: ${{secrets.SALESFORCE_PRIVATE_KEY}}
SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}}
SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}}
SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}}
SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}}
SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}}
SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}}
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
TABLE_OCR: "tesseract"
OCR_AGENT: "tesseract"
OVERWRITE_FIXTURES: "true"
CI: "true"
run: |
source .venv/bin/activate
sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get install -y tesseract-ocr
sudo apt-get install -y tesseract-ocr-kor
tesseract --version
./test_unstructured_ingest/test-ingest-src.sh
- name: Save branch name to environment file
id: branch
run: |
original_branch=$(git rev-parse --abbrev-ref HEAD)
suffix="|ingest-test-fixtures-update-$(git rev-parse --short HEAD)"
branch_name="$original_branch$suffix"
echo "BRANCH_NAME=$branch_name" >> $GITHUB_ENV
- name: Save PR name to environment file
id: pr
run: |
commit_sha=$(git rev-parse HEAD)
prs=$(curl -s -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
"https://api.github.com/repos/${{ github.repository }}/commits/${commit_sha}/pulls")
pr_name=$(echo "$prs" | jq -r '.[0].title')
echo "PR_NAME=$pr_name" >> $GITHUB_ENV
- name: Create Pull Request
uses: peter-evans/create-pull-request@v5
with:
token: ${{ secrets.GH_CREATE_PR_TOKEN }}
add-paths: |
test_unstructured_ingest/expected-structured-output
test_unstructured_ingest/metrics
commit-message: "Update ingest test fixtures"
branch: ${{ env.BRANCH_NAME }}
title: "${{ env.PR_NAME }} <- Ingest test fixtures update"
assignees: ${{ github.actor }}
reviewers: ${{ github.actor }}
delete-branch: true
body: |
This pull request includes updated ingest test fixtures.
Please review and merge if appropriate.
base: ${{ github.head_ref }}