# Workflow file for this run.
# NOTE (from the source viewer): this file may contain bidirectional Unicode text that could be interpreted or compiled differently than it appears. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
# Regenerates the ingest test fixtures and opens a pull request containing
# the refreshed expected output and metrics.
name: Ingest Test Fixtures Update PR

# Triggered on every push and on manual dispatch. The `setup` job's `if:`
# narrows push runs to commits whose message contains
# "ingest-test-fixtures-update".
on:
  push:
  workflow_dispatch:

env:
  # Component of the dependency cache key; changing the value produces a
  # different key, so previously saved caches are no longer matched.
  GHA_CACHE_KEY_VERSION: "v1"
jobs:
  # Builds the Python virtualenv and the NLTK data directory and stores both
  # in the Actions cache (via the post-step of actions/cache) so the
  # downstream job can restore them.
  setup:
    runs-on: ubuntu-latest-m
    # Run on manual dispatch, or on a push whose head commit message carries
    # the "ingest-test-fixtures-update" marker.
    if: |
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'push' && contains(github.event.head_commit.message, 'ingest-test-fixtures-update'))
    env:
      NLTK_DATA: ${{ github.workspace }}/nltk_data
      # Quoted so YAML does not read 3.10 as the float 3.1.
      PYTHON_VERSION: "3.10"
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - uses: actions/cache@v3
        id: virtualenv-cache
        with:
          path: |
            .venv
            nltk_data
          key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
      - name: Setup virtual environment (no cache hit)
        # Only rebuild when the exact cache key was not found; on a hit the
        # restored .venv and nltk_data are used as-is.
        if: steps.virtualenv-cache.outputs.cache-hit != 'true'
        run: |
          python${{ env.PYTHON_VERSION }} -m venv .venv
          source .venv/bin/activate
          # -p: succeed whether or not the directory already exists.
          mkdir -p "$NLTK_DATA"
          make install-ci
# ---------------------------------------------------------------------------
  # Restores the cached environment, re-runs the ingest tests with
  # OVERWRITE_FIXTURES enabled, and opens a pull request with the regenerated
  # fixtures. Skipped automatically whenever `setup` is skipped.
  update-fixtures-and-pr:
    runs-on: ubuntu-latest-m
    env:
      NLTK_DATA: ${{ github.workspace }}/nltk_data
      # Job-level env is not shared between jobs, so this must be declared
      # here as well. Without it the cache key below expands with an empty
      # Python version and can never match the key saved by `setup`.
      PYTHON_VERSION: "3.10"
    needs: [setup]
    steps:
      - uses: actions/checkout@v3
      # Provide the interpreter version the virtualenv is built for, so
      # `python3.10` below (and the restored .venv) resolve to it.
      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - uses: actions/cache/restore@v3
        id: virtualenv-cache
        with:
          path: |
            .venv
            nltk_data
          # Must stay identical to the key used by the `setup` job.
          key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}
      - name: Setup virtual environment (no cache hit)
        # Fallback only: rebuild when the cache could not be restored.
        if: steps.virtualenv-cache.outputs.cache-hit != 'true'
        run: |
          python${{ env.PYTHON_VERSION }} -m venv .venv
          source .venv/bin/activate
          # -p: do not fail if the directory already exists.
          mkdir -p "$NLTK_DATA"
          make install-ci
      - name: Setup docker-compose
        uses: KengoTODA/actions-setup-docker-compose@v1
        with:
          version: '2.22.0'
      - name: Update test fixtures
        env:
          # Credentials for the source/destination connectors exercised by
          # the ingest test script.
          AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }}
          BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }}
          CONFLUENCE_API_TOKEN: ${{ secrets.CONFLUENCE_API_TOKEN }}
          CONFLUENCE_USER_EMAIL: ${{ secrets.CONFLUENCE_USER_EMAIL }}
          DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
          DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
          DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
          DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
          GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
          GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
          JIRA_INGEST_API_TOKEN: ${{ secrets.JIRA_INGEST_API_TOKEN }}
          JIRA_INGEST_USER_EMAIL: ${{ secrets.JIRA_INGEST_USER_EMAIL }}
          MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
          MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
          MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
          MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
          MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
          SALESFORCE_USERNAME: ${{ secrets.SALESFORCE_USERNAME }}
          SALESFORCE_CONSUMER_KEY: ${{ secrets.SALESFORCE_CONSUMER_KEY }}
          SALESFORCE_PRIVATE_KEY: ${{ secrets.SALESFORCE_PRIVATE_KEY }}
          SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
          SHAREPOINT_CRED: ${{ secrets.SHAREPOINT_CRED }}
          SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
          SHAREPOINT_PERMISSIONS_APP_ID: ${{ secrets.SHAREPOINT_PERMISSIONS_APP_ID }}
          SHAREPOINT_PERMISSIONS_APP_CRED: ${{ secrets.SHAREPOINT_PERMISSIONS_APP_CRED }}
          SHAREPOINT_PERMISSIONS_TENANT: ${{ secrets.SHAREPOINT_PERMISSIONS_TENANT }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
          UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
          NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
          AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Non-secret switches, passed as strings.
          TABLE_OCR: "tesseract"
          OCR_AGENT: "tesseract"
          OVERWRITE_FIXTURES: "true"
          CI: "true"
        run: |
          source .venv/bin/activate
          # System packages, then Tesseract from the alex-p PPA.
          sudo apt-get update
          sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
          sudo apt-get install -y tesseract-ocr
          sudo apt-get install -y tesseract-ocr-kor
          tesseract --version
          # Per-connector extras.
          make install-ingest-s3
          make install-ingest-azure
          make install-ingest-airtable
          make install-ingest-box
          make install-ingest-confluence
          make install-ingest-discord
          make install-ingest-elasticsearch
          make install-ingest-dropbox
          make install-ingest-gcs
          make install-ingest-google-drive
          make install-ingest-github
          make install-ingest-gitlab
          make install-ingest-jira
          make install-ingest-onedrive
          make install-ingest-outlook
          make install-ingest-salesforce
          make install-ingest-slack
          make install-ingest-wikipedia
          make install-ingest-notion
          make install-ingest-delta-table
          pip install "unstructured[openai]"
          ./test_unstructured_ingest/test-ingest.sh
      - name: Save branch name to environment file
        id: branch
        run: |
          # PR branch = <current branch>|ingest-test-fixtures-update-<short sha>
          original_branch=$(git rev-parse --abbrev-ref HEAD)
          suffix="|ingest-test-fixtures-update-$(git rev-parse --short HEAD)"
          branch_name="$original_branch$suffix"
          echo "BRANCH_NAME=$branch_name" >> "$GITHUB_ENV"
      - name: Save PR name to environment file
        id: pr
        run: |
          # Look up the pull requests associated with the current commit and
          # take the title of the first one.
          commit_sha=$(git rev-parse HEAD)
          prs=$(curl -s -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
            "https://api.github.com/repos/${{ github.repository }}/commits/${commit_sha}/pulls")
          pr_name=$(echo "$prs" | jq -r '.[0].title // empty')
          # No PR is associated with the commit (e.g. a manual run on a branch
          # without a PR): use the branch name instead of the literal "null".
          if [ -z "$pr_name" ]; then
            pr_name=$(git rev-parse --abbrev-ref HEAD)
          fi
          echo "PR_NAME=$pr_name" >> "$GITHUB_ENV"
      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v5
        with:
          token: ${{ secrets.GH_CREATE_PR_TOKEN }}
          # Only the regenerated fixtures and metrics are committed.
          add-paths: |
            test_unstructured_ingest/expected-structured-output
            test_unstructured_ingest/metrics
          commit-message: "Update ingest test fixtures"
          branch: ${{ env.BRANCH_NAME }}
          title: "${{ env.PR_NAME }} <- Ingest test fixtures update"
          assignees: ${{ github.actor }}
          reviewers: ${{ github.actor }}
          delete-branch: true
          body: |
            This pull request includes updated ingest test fixtures.
            Please review and merge if appropriate.
          # github.head_ref is only populated for pull_request events, which
          # this workflow does not use; fall back to the branch that
          # triggered the run so the base is explicit.
          base: ${{ github.head_ref || github.ref_name }}