feat: support table conversion for tabular destination connectors (#1… #7249
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow-level settings: display name, triggers, shared environment, and
# token permissions. The job definitions follow under `jobs:`.
name: Ingest Test Fixtures Update PR

# Triggered by any push and by manual dispatch. The `setup` job further
# narrows push runs by inspecting the head commit message.
on:
  push:
  workflow_dispatch:

# Values available to every job through the `env` context.
env:
  GHA_CACHE_KEY_VERSION: 'v1'
  # Quoted on purpose: an unquoted 3.10 would be read as the float 3.1.
  PYTHON_VERSION: '3.10'

permissions:
  contents: read
  # Presumably needed for the OIDC-based Google Cloud / Azure login steps,
  # which are configured with identities rather than static keys -- confirm.
  id-token: write

jobs:
# Job: test_logins
  # Logs in to Google Cloud and Azure, then runs one read-only CLI command
  # after each login. It has no `needs:` or `if:`, so it runs on every
  # trigger of this workflow.
  test_logins:
    environment: ci
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Google Cloud Auth
        id: gauth
        uses: google-github-actions/auth@v1
        with:
          service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }}
          workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v1

      - name: run gcloud command
        run: gcloud projects list

      - name: Az CLI login
        uses: azure/login@v1
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}

      - name: azure test command
        run: az account show
# Job: setup
  # Builds the base virtual environment and stores `.venv` and `nltk_data`
  # in the Actions cache under the `unstructured-...` key.
  setup:
    # Run on manual dispatch, or on a push whose head commit message contains
    # the marker string 'ingest-test-fixtures-update'.
    if: >-
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'push' &&
      contains(github.event.head_commit.message, 'ingest-test-fixtures-update'))
    runs-on: ubuntu-latest-m
    env:
      NLTK_DATA: ${{ github.workspace }}/nltk_data
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      # The key combines runner OS, Python version, a manual cache-version
      # knob, and a hash of the base requirement files.
      - id: virtualenv-cache
        uses: actions/cache@v3
        with:
          key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
          path: |
            .venv
            nltk_data

      # NOTE(review): despite its name this step has no `if:` guard on the
      # cache step's `cache-hit` output, so it runs on every execution --
      # confirm that is intended.
      - name: Setup virtual environment (no cache hit)
        run: |
          python${{ env.PYTHON_VERSION }} -m venv .venv
          source .venv/bin/activate
          [ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
          make install-ci
# Job: setup_ingest
  # Restores the base environment cached by `setup`, installs the ingest
  # dependencies on top of it, and caches the result under the
  # `unstructured-ingest-...` key.
  setup_ingest:
    needs: [setup]
    runs-on: ubuntu-latest
    env:
      NLTK_DATA: ${{ github.workspace }}/nltk_data
    steps:
      - uses: actions/checkout@v3

      # Restore-only: same key that the `setup` job saves under.
      - uses: actions/cache/restore@v3
        id: base-virtualenv-cache
        with:
          path: |
            .venv
            nltk_data
          key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}

      # Due to the dependency on setup, the cache should exist before this
      # runs. Fail the job if it wasn't found.
      - name: Setup base virtual environment (no cache hit)
        if: steps.base-virtualenv-cache.outputs.cache-hit != 'true'
        uses: actions/github-script@v3
        with:
          script: |
            core.setFailed("base cached environment couldn't be found")

      # Ingest-specific cache, keyed on the ingest requirement files.
      - uses: actions/cache@v3
        id: virtualenv-cache
        with:
          path: |
            .venv
            nltk_data
          key: unstructured-ingest-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/ingest/*.txt') }}

      # The interpreter is pinned via the workflow-level PYTHON_VERSION, the
      # same value embedded in the cache keys above, so the cached venv and
      # its key always describe the same Python.
      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      # NOTE(review): despite its name this step has no `if:` guard on
      # `steps.virtualenv-cache.outputs.cache-hit`, so it runs on every
      # execution -- confirm that is intended.
      - name: Setup virtual environment (no cache hit)
        run: |
          python${{ env.PYTHON_VERSION }} -m venv .venv
          source .venv/bin/activate
          make install-all-ingest
# Job: update-fixtures-and-pr
  # Restores the ingest environment cached by `setup_ingest`, runs the ingest
  # source tests with OVERWRITE_FIXTURES enabled, and opens a pull request
  # containing the regenerated fixture and metrics directories.
  update-fixtures-and-pr:
    environment: ci
    runs-on: ubuntu-latest-m
    env:
      NLTK_DATA: ${{ github.workspace }}/nltk_data
    needs: [setup_ingest]
    steps:
      # actions/checkout MUST come before auth
      - uses: actions/checkout@v4

      - name: Google Cloud Auth
        uses: google-github-actions/auth@v1
        with:
          workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }}
          create_credentials_file: true

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v1

      - name: Az CLI login
        uses: azure/login@v1
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

      # This job defines no matrix, so the step name uses the workflow-level
      # PYTHON_VERSION, the same value passed to the action below.
      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      # Publishes the dash-joined sys.version_info as the step output
      # `version`. No later step in this job reads it.
      - name: Get full Python version
        id: full-python-version
        run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT

      # Restore-only: same key that the `setup_ingest` job saves under.
      - uses: actions/cache/restore@v3
        id: virtualenv-cache
        with:
          path: |
            .venv
            nltk_data
          key: unstructured-ingest-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/ingest/*.txt') }}

      # Due to the dependency on setup_ingest, the cache should exist before
      # this runs. Fail the job if it wasn't found.
      - name: Setup base virtual environment (no cache hit)
        if: steps.virtualenv-cache.outputs.cache-hit != 'true'
        uses: actions/github-script@v3
        with:
          script: |
            core.setFailed("cached environment couldn't be found")

      - name: Setup docker-compose
        uses: KengoTODA/actions-setup-docker-compose@v1
        with:
          version: '2.22.0'

      # Installs system packages and Tesseract, then runs the ingest source
      # test script with the credentials and flags below in its environment.
      - name: Update test fixtures
        env:
          AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }}
          BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }}
          CONFLUENCE_API_TOKEN: ${{ secrets.CONFLUENCE_API_TOKEN }}
          CONFLUENCE_USER_EMAIL: ${{ secrets.CONFLUENCE_USER_EMAIL }}
          DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
          DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
          DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
          DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
          GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
          GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
          JIRA_INGEST_API_TOKEN: ${{ secrets.JIRA_INGEST_API_TOKEN }}
          JIRA_INGEST_USER_EMAIL: ${{ secrets.JIRA_INGEST_USER_EMAIL }}
          MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
          MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
          MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
          MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
          MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
          SALESFORCE_USERNAME: ${{ secrets.SALESFORCE_USERNAME }}
          SALESFORCE_CONSUMER_KEY: ${{ secrets.SALESFORCE_CONSUMER_KEY }}
          SALESFORCE_PRIVATE_KEY: ${{ secrets.SALESFORCE_PRIVATE_KEY }}
          SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
          SHAREPOINT_CRED: ${{ secrets.SHAREPOINT_CRED }}
          SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
          SHAREPOINT_PERMISSIONS_APP_ID: ${{ secrets.SHAREPOINT_PERMISSIONS_APP_ID }}
          SHAREPOINT_PERMISSIONS_APP_CRED: ${{ secrets.SHAREPOINT_PERMISSIONS_APP_CRED }}
          SHAREPOINT_PERMISSIONS_TENANT: ${{ secrets.SHAREPOINT_PERMISSIONS_TENANT }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
          UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
          NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
          AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          TABLE_OCR: "tesseract"
          OCR_AGENT: "tesseract"
          # Quoted so these reach the script as the strings "true".
          OVERWRITE_FIXTURES: "true"
          CI: "true"
        run: |
          source .venv/bin/activate
          sudo apt-get update
          sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
          sudo apt-get install -y tesseract-ocr
          sudo apt-get install -y tesseract-ocr-kor
          tesseract --version
          ./test_unstructured_ingest/test-ingest-src.sh

      # Exports BRANCH_NAME as
      # "<current branch>|ingest-test-fixtures-update-<short sha>".
      - name: Save branch name to environment file
        id: branch
        run: |
          original_branch=$(git rev-parse --abbrev-ref HEAD)
          suffix="|ingest-test-fixtures-update-$(git rev-parse --short HEAD)"
          branch_name="$original_branch$suffix"
          echo "BRANCH_NAME=$branch_name" >> $GITHUB_ENV

      # Exports PR_NAME as the title of the first pull request that the
      # GitHub API associates with the checked-out commit.
      # NOTE(review): if the API returns an empty list, `jq -r '.[0].title'`
      # prints the literal "null" -- confirm that fallback is acceptable.
      - name: Save PR name to environment file
        id: pr
        run: |
          commit_sha=$(git rev-parse HEAD)
          prs=$(curl -s -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
            "https://api.github.com/repos/${{ github.repository }}/commits/${commit_sha}/pulls")
          pr_name=$(echo "$prs" | jq -r '.[0].title')
          echo "PR_NAME=$pr_name" >> $GITHUB_ENV

      # Only the two directories under add-paths are committed to the PR.
      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v5
        with:
          token: ${{ secrets.GH_CREATE_PR_TOKEN }}
          add-paths: |
            test_unstructured_ingest/expected-structured-output
            test_unstructured_ingest/metrics
          commit-message: "Update ingest test fixtures"
          branch: ${{ env.BRANCH_NAME }}
          title: "${{ env.PR_NAME }} <- Ingest test fixtures update"
          assignees: ${{ github.actor }}
          reviewers: ${{ github.actor }}
          delete-branch: true
          body: |
            This pull request includes updated ingest test fixtures.
            Please review and merge if appropriate.
          # NOTE(review): github.head_ref is documented as set only for
          # pull_request-type events, and this workflow triggers on push and
          # workflow_dispatch, so this is presumably empty and the action's
          # default base applies -- confirm.
          base: ${{ github.head_ref }}