diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5678813004..effb786452 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,35 @@ on: env: GHA_CACHE_KEY_VERSION: "v1" +permissions: + id-token: write + contents: read + jobs: + test_logins: + runs-on: ubuntu-latest + steps: + - uses: 'actions/checkout@v4' + - name: 'Google Cloud Auth' + uses: 'google-github-actions/auth@v1' + id: gauth + with: + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v1' + - name: 'run gcloud command' + run: |- + gcloud projects list + - name: 'Az CLI login' + uses: azure/login@v1 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + - name: 'azure test command' + run: |- + az account show setup: strategy: matrix: @@ -268,6 +296,7 @@ jobs: test_ingest: + environment: ci strategy: matrix: python-version: ["3.8","3.9","3.10","3.11"] @@ -276,7 +305,23 @@ jobs: NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup_ingest, lint] steps: - - uses: actions/checkout@v3 + # actions/checkout MUST come before auth + - uses: 'actions/checkout@v4' + - name: 'Google Cloud Auth' + uses: 'google-github-actions/auth@v1' + with: + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} + create_credentials_file: true + activate_credentials_file: true + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v1' + - name: 'Az CLI login' + uses: azure/login@v1 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 913ee0db18..96ba805b36 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -8,7 +8,36 @@ env: GHA_CACHE_KEY_VERSION: "v1" PYTHON_VERSION: "3.10" +permissions: + id-token: write + contents: read + jobs: + test_logins: + runs-on: ubuntu-latest + environment: ci + steps: + - uses: 'actions/checkout@v4' + - name: 'Google Cloud Auth' + uses: 'google-github-actions/auth@v1' + id: gauth + with: + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v1' + - name: 'run gcloud command' + run: |- + gcloud projects list + - name: 'Az CLI login' + uses: azure/login@v1 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + - name: 'azure test command' + run: |- + az account show setup: runs-on: ubuntu-latest if: | @@ -75,12 +104,36 @@ jobs: make install-all-ingest update-fixtures-and-pr: + environment: ci runs-on: ubuntu-latest-m env: NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup_ingest] steps: - - uses: actions/checkout@v3 + # actions/checkout MUST come before auth + - uses: 'actions/checkout@v4' + - name: 'Google Cloud Auth' + uses: 'google-github-actions/auth@v1' + with: + workload_identity_provider: ${{ 
secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
+          service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }}
+          create_credentials_file: true
+          activate_credentials_file: true
+      - name: 'Set up Cloud SDK'
+        uses: 'google-github-actions/setup-gcloud@v1'
+      - name: 'Az CLI login'
+        uses: azure/login@v1
+        with:
+          client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+      - name: Set up Python ${{ env.PYTHON_VERSION }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+      - name: Get full Python version
+        id: full-python-version
+        run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT
       - uses: actions/cache/restore@v3
         id: virtualenv-cache
         with:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 30cce43b31..450276e9e2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.10.28-dev4
+## 0.10.28-dev5

 ### Enhancements

@@ -6,6 +6,11 @@
 * **Add table structure evaluation helpers** Adds functions to evaluate the similarity between predicted table structure and actual table structure.
 * **Use `yolox` by default for table extraction when partitioning pdf/image** The `yolox` model provides higher recall of table regions than the quantized version and is now the default element detection model when `infer_table_structure=True` for partitioning pdf/image files.
 * **Remove pdfminer elements from inside tables** Previously, when using `hi_res`, some elements were extracted using pdfminer too, so we removed pdfminer from the tables pipeline to avoid duplicated elements.
+* **Fsspec downstream connectors** New destination connectors added to the ingest CLI; users may now use `unstructured-ingest` to write to any of the following:
+  * Azure
+  * Box
+  * Dropbox
+  * Google Cloud Storage

 ### Features

@@ -1609,4 +1614,4 @@ of an email.
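For context on the changelog entry above: the diff further down registers one writer function per destination in unstructured/ingest/runner/writers/, each of which builds an FsspecDestinationConnector subclass. A minimal sketch, assuming the APIs added in this diff, of driving one of those writers from Python (the bucket URL and key-file path are hypothetical placeholders):

    from unstructured.ingest.runner.writers import writer_map

    # Look up the writer registered for GCS and build a destination connector.
    gcs_writer = writer_map["gcs"]
    connector = gcs_writer(
        remote_url="gs://my-bucket/structured-output/",  # hypothetical bucket
        token="/path/to/service-account-key.json",  # hypothetical key file
    )
    connector.initialize()  # instantiates the fsspec filesystem for the gcs protocol
    # connector.write(docs) then uploads each doc's structured .json output

The same lookup works for the "azure", "box", "dropbox", and "s3" keys in writer_map, which is how the new destination subcommands are wired into the runner.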
## 0.2.0 -* Initial release of unstructured +* Initial release of unstructured \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/test-input/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/dropbox/test-input/handbook-1p.docx.json rename to test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/test-input/nested-1/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/dropbox/test-input/nested-1/ideas-page.html.json rename to test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/test-input/nested-2/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/dropbox/test-input/nested-2/ideas-page.html.json rename to test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/test-input/science-exploration-1p.pptx.json b/test_unstructured_ingest/expected-structured-output/dropbox/science-exploration-1p.pptx.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/dropbox/test-input/science-exploration-1p.pptx.json rename to test_unstructured_ingest/expected-structured-output/dropbox/science-exploration-1p.pptx.json diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/example-docs/fake-html-cp1252.html.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json rename to test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/example-docs/fake-html-cp1252.html.json diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json b/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json rename to test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json diff --git a/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv b/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv index 2553a0f90e..0054e9a7e4 100644 --- a/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv +++ b/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv @@ -1,3 +1,3 @@ strategy average sample_sd population_sd count -cct-accuracy 0.777 0.088 0.072 3 -cct-%missing 0.087 0.045 0.037 3 +cct-accuracy 0.798 0.083 0.072 4 +cct-%missing 0.087 0.037 0.032 4 diff --git a/test_unstructured_ingest/metrics/all-docs-cct.tsv b/test_unstructured_ingest/metrics/all-docs-cct.tsv index f9be87f7ab..15a1d1d045 100644 --- 
a/test_unstructured_ingest/metrics/all-docs-cct.tsv
+++ b/test_unstructured_ingest/metrics/all-docs-cct.tsv
@@ -1,4 +1,4 @@
 filename connector cct-accuracy cct-%missing
 science-exploration-1p.pptx box 0.861 0.09
 example-10k.html local 0.686 0.04
-IRS-form-1987.pdf azure 0.783 0.13
+IRS-form-1987.pdf azure 0.783 0.13
\ No newline at end of file
diff --git a/test_unstructured_ingest/test-ingest-azure-dest.sh b/test_unstructured_ingest/test-ingest-azure-dest.sh
new file mode 100755
index 0000000000..f01eda5bcc
--- /dev/null
+++ b/test_unstructured_ingest/test-ingest-azure-dest.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+
+set -e
+
+SCRIPT_DIR=$(dirname "$(realpath "$0")")
+cd "$SCRIPT_DIR"/.. || exit 1
+OUTPUT_FOLDER_NAME=azure-dest
+OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
+WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+
+if [ -z "$AZURE_DEST_CONNECTION_STR" ]; then
+  echo "Skipping Azure destination ingest test because the AZURE_DEST_CONNECTION_STR env var is not set."
+  exit 0
+fi
+
+CONTAINER=utic-ingest-test-fixtures-output
+DIRECTORY=$(date +%s)
+REMOTE_URL="abfs://$CONTAINER/$DIRECTORY/"
+
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR"/cleanup.sh
+function cleanup() {
+  cleanup_dir "$OUTPUT_DIR"
+  cleanup_dir "$WORK_DIR"
+
+  echo "deleting azure storage blob directory $CONTAINER/$DIRECTORY"
+  az storage fs directory delete -f "$CONTAINER" -n "$DIRECTORY" --connection-string "$AZURE_DEST_CONNECTION_STR" --yes
+
+}
+trap cleanup EXIT
+
+# Create directory to use for testing
+az storage fs directory create -f "$CONTAINER" -n "$DIRECTORY" --connection-string "$AZURE_DEST_CONNECTION_STR"
+
+PYTHONPATH=. ./unstructured/ingest/main.py \
+  local \
+  --num-processes "$max_processes" \
+  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
+  --output-dir "$OUTPUT_DIR" \
+  --strategy fast \
+  --verbose \
+  --reprocess \
+  --input-path example-docs/fake-memo.pdf \
+  --work-dir "$WORK_DIR" \
+  azure \
+  --overwrite \
+  --remote-url "$REMOTE_URL" \
+  --connection-string "$AZURE_DEST_CONNECTION_STR"
+
+# Simply check the number of files uploaded
+expected_num_files=1
+num_files_in_azure=$(az storage blob list -c "$CONTAINER" --prefix "$DIRECTORY"/example-docs/ --connection-string "$AZURE_DEST_CONNECTION_STR" | jq 'length')
+if [ "$num_files_in_azure" -ne "$expected_num_files" ]; then
+  echo "Expected $expected_num_files files to be uploaded to azure, but found $num_files_in_azure files."
+  exit 1
+fi
diff --git a/test_unstructured_ingest/test-ingest-box-dest.sh b/test_unstructured_ingest/test-ingest-box-dest.sh
new file mode 100755
index 0000000000..248c9b75eb
--- /dev/null
+++ b/test_unstructured_ingest/test-ingest-box-dest.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+#TODO: the Box API/SDK currently cannot create folders and verify content the way the other fsspec ingest tests do
+
+#
+#set -e
+#
+#SCRIPT_DIR=$(dirname "$(realpath "$0")")
+#cd "$SCRIPT_DIR"/..
|| exit 1 +#OUTPUT_FOLDER_NAME=box-dest +#OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +#WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME +#max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +#DESTINATION_BOX="box://utic-dev-tech-fixtures/utic-ingest-test-fixtures-output/$(date +%s)/" +# +#CI=${CI:-"false"} +# +#if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then +# echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set." +# exit 0 +#fi +# +#if [ -z "$BOX_APP_CONFIG_PATH" ]; then +# # Create temporary service key file +# BOX_APP_CONFIG_PATH=$(mktemp) +# echo "$BOX_APP_CONFIG" >"$BOX_APP_CONFIG_PATH" +#fi +# +## shellcheck disable=SC1091 +#source "$SCRIPT_DIR"/cleanup.sh +#function cleanup() { +# cleanup_dir "$OUTPUT_DIR" +# cleanup_dir "$WORK_DIR" +# if [ "$CI" == "true" ]; then +# cleanup_dir "$DOWNLOAD_DIR" +# fi +#} +#trap cleanup EXIT +# +#PYTHONPATH=. ./unstructured/ingest/main.py \ +# local \ +# --num-processes "$max_processes" \ +# --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ +# --output-dir "$OUTPUT_DIR" \ +# --strategy fast \ +# --verbose \ +# --reprocess \ +# --input-path example-docs/fake-memo.pdf \ +# --work-dir "$WORK_DIR" \ +# box \ +# --box-app-config "$BOX_APP_CONFIG_PATH" \ +# --remote-url "$DESTINATION_BOX" \ +# +## Simply check the number of files uploaded +#expected_num_files=1 diff --git a/test_unstructured_ingest/test-ingest-dropbox-dest.sh b/test_unstructured_ingest/test-ingest-dropbox-dest.sh new file mode 100755 index 0000000000..cb423aa591 --- /dev/null +++ b/test_unstructured_ingest/test-ingest-dropbox-dest.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash + +set -e + +SCRIPT_DIR=$(dirname "$(realpath "$0")") +cd "$SCRIPT_DIR"/.. 
|| exit 1 +OUTPUT_FOLDER_NAME=dropbox-dest +OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +DESTINATION_DROPBOX="/test-output/$(date +%s)" +CI=${CI:-"false"} + +if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then + echo "Skipping Dropbox ingest test because one or more of these env vars is not set:" + echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN" + exit 0 +fi + +# Get a new access token from Dropbox +DROPBOX_RESPONSE=$(curl -s https://api.dropbox.com/oauth2/token -d refresh_token="$DROPBOX_REFRESH_TOKEN" -d grant_type=refresh_token -d client_id="$DROPBOX_APP_KEY" -d client_secret="$DROPBOX_APP_SECRET") +DROPBOX_ACCESS_TOKEN=$(jq -r '.access_token' <<< "$DROPBOX_RESPONSE") + +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi + + echo "deleting test folder $DESTINATION_DROPBOX" + curl -X POST https://api.dropboxapi.com/2/files/delete_v2 \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer $DROPBOX_ACCESS_TOKEN" \ + --data "{\"path\":\"$DESTINATION_DROPBOX\"}" | jq +} +trap cleanup EXIT + +# Create new folder for test +echo "creating temp directory in dropbox for testing: $DESTINATION_DROPBOX" +response=$(curl -X POST -s -w "\n%{http_code}" https://api.dropboxapi.com/2/files/create_folder_v2 \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer $DROPBOX_ACCESS_TOKEN" \ + --data "{\"autorename\":false,\"path\":\"$DESTINATION_DROPBOX\"}"); +http_code=$(tail -n1 <<< "$response") # get the last line +content=$(sed '$ d' <<< "$response") # get all but the last line which contains the status code + +if [ "$http_code" -ge 300 ]; then + echo "Failed to create temp dir in dropbox: [$http_code] $content" + exit 1 +else + echo "$http_code:" + jq <<< "$content" +fi + +PYTHONPATH=. ./unstructured/ingest/main.py \ + local \ + --num-processes "$max_processes" \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --output-dir "$OUTPUT_DIR" \ + --strategy fast \ + --verbose \ + --reprocess \ + --input-path example-docs/fake-memo.pdf \ + --work-dir "$WORK_DIR" \ + dropbox \ + --token "$DROPBOX_ACCESS_TOKEN" \ + --remote-url "dropbox://$DESTINATION_DROPBOX" \ + +# Simply check the number of files uploaded +expected_num_files=1 +num_files_in_dropbox=$(curl -X POST https://api.dropboxapi.com/2/files/list_folder \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer $DROPBOX_ACCESS_TOKEN" \ + --data "{\"path\":\"$DESTINATION_DROPBOX/example-docs/\"}" | jq '.entries | length') +if [ "$num_files_in_dropbox" -ne "$expected_num_files" ]; then + echo "Expected $expected_num_files files to be uploaded to dropbox, but found $num_files_in_dropbox files." + exit 1 +fi diff --git a/test_unstructured_ingest/test-ingest-dropbox.sh b/test_unstructured_ingest/test-ingest-dropbox.sh index fb9d09eb7f..697baebfcb 100755 --- a/test_unstructured_ingest/test-ingest-dropbox.sh +++ b/test_unstructured_ingest/test-ingest-dropbox.sh @@ -43,7 +43,7 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \
   --verbose \
   --token "$DROPBOX_ACCESS_TOKEN" \
   --recursive \
-  --remote-url "dropbox:// /" \
+  --remote-url "dropbox://test-input/" \
   --work-dir "$WORK_DIR"
diff --git a/test_unstructured_ingest/test-ingest-gcs-dest.sh b/test_unstructured_ingest/test-ingest-gcs-dest.sh
new file mode 100755
index 0000000000..2437b68051
--- /dev/null
+++ b/test_unstructured_ingest/test-ingest-gcs-dest.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+set -e
+
+SCRIPT_DIR=$(dirname "$(realpath "$0")")
+cd "$SCRIPT_DIR"/.. || exit 1
+OUTPUT_FOLDER_NAME=gcs-dest
+OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
+WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+DESTINATION_GCS="gs://utic-test-ingest-fixtures-output/$(date +%s)"
+CI=${CI:-"false"}
+
+if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
+  echo "Skipping Google Cloud Storage destination ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
+  exit 0
+fi
+
+# Create temporary service key file
+GCP_INGEST_SERVICE_KEY_FILE=$(mktemp)
+echo "$GCP_INGEST_SERVICE_KEY" > "$GCP_INGEST_SERVICE_KEY_FILE"
+
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR"/cleanup.sh
+function cleanup() {
+  cleanup_dir "$OUTPUT_DIR"
+  cleanup_dir "$WORK_DIR"
+  if [ "$CI" == "true" ]; then
+    cleanup_dir "$DOWNLOAD_DIR"
+  fi
+
+  if gcloud storage ls "$DESTINATION_GCS"; then
+    echo "deleting $DESTINATION_GCS"
+    gcloud storage rm --recursive "$DESTINATION_GCS"
+  fi
+
+}
+
+trap cleanup EXIT
+
+PYTHONPATH=. ./unstructured/ingest/main.py \
+  local \
+  --num-processes "$max_processes" \
+  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
+  --output-dir "$OUTPUT_DIR" \
+  --strategy fast \
+  --verbose \
+  --reprocess \
+  --input-path example-docs/fake-memo.pdf \
+  --work-dir "$WORK_DIR" \
+  gcs \
+  --token "$GCP_INGEST_SERVICE_KEY_FILE" \
+  --remote-url "$DESTINATION_GCS"
+
+# Simply check the number of files uploaded
+expected_num_files=1
+num_files_in_gcs=$(gcloud storage ls "$DESTINATION_GCS"/example-docs/ | wc -l)
+if [ "$num_files_in_gcs" -ne "$expected_num_files" ]; then
+  echo "Expected $expected_num_files files to be uploaded to gcs, but found $num_files_in_gcs files."
+  exit 1
+fi
diff --git a/test_unstructured_ingest/test-ingest-gcs.sh b/test_unstructured_ingest/test-ingest-gcs.sh
index 4ce6cf227d..2efe7cc257 100755
--- a/test_unstructured_ingest/test-ingest-gcs.sh
+++ b/test_unstructured_ingest/test-ingest-gcs.sh
@@ -29,7 +29,7 @@ fi

 # Create temporary service key file
 GCP_INGEST_SERVICE_KEY_FILE=$(mktemp)
-echo "$GCP_INGEST_SERVICE_KEY" >"$GCP_INGEST_SERVICE_KEY_FILE"
+echo "$GCP_INGEST_SERVICE_KEY" > "$GCP_INGEST_SERVICE_KEY_FILE"

 PYTHONPATH=. ./unstructured/ingest/main.py \
   gcs \
diff --git a/test_unstructured_ingest/test-ingest-s3-dest.sh b/test_unstructured_ingest/test-ingest-s3-dest.sh
new file mode 100755
index 0000000000..3160a0d917
--- /dev/null
+++ b/test_unstructured_ingest/test-ingest-s3-dest.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+
+set -e
+
+SCRIPT_DIR=$(dirname "$(realpath "$0")")
+cd "$SCRIPT_DIR"/..
|| exit 1 +OUTPUT_FOLDER_NAME=s3-dest +OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} +DESTINATION_S3="s3://utic-dev-tech-fixtures/utic-ingest-test-fixtures-output/$(date +%s)/" +CI=${CI:-"false"} + +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" + if [ "$CI" == "true" ]; then + cleanup_dir "$DOWNLOAD_DIR" + fi + + if aws s3 ls "$DESTINATION_S3" --region us-east-2; then + echo "deleting destination s3 location: $DESTINATION_S3" + aws s3 rm "$DESTINATION_S3" --recursive --region us-east-2 + fi + +} +trap cleanup EXIT + +PYTHONPATH=. ./unstructured/ingest/main.py \ + local \ + --num-processes "$max_processes" \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --output-dir "$OUTPUT_DIR" \ + --strategy fast \ + --verbose \ + --reprocess \ + --input-path example-docs/fake-memo.pdf \ + --work-dir "$WORK_DIR" \ + s3 \ + --anonymous \ + --remote-url "$DESTINATION_S3" + +# Simply check the number of files uploaded +expected_num_files=1 +num_files_in_s3=$(aws s3 ls "${DESTINATION_S3}example-docs/" --region us-east-2 | grep -c "\.json$") +if [ "$num_files_in_s3" -ne "$expected_num_files" ]; then + echo "Expected $expected_num_files files to be uploaded to s3, but found $num_files_in_s3 files." + exit 1 +else + echo "Expected number of files found: $num_files_in_s3/$expected_num_files" +fi diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh index 1a9379f173..423eff8e04 100755 --- a/test_unstructured_ingest/test-ingest.sh +++ b/test_unstructured_ingest/test-ingest.sh @@ -9,6 +9,11 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 export OMP_THREAD_LIMIT=1 all_tests=( +'test-ingest-azure-dest.sh' +'test-ingest-box-dest.sh' +'test-ingest-dropbox-dest.sh' +'test-ingest-gcs-dest.sh' +'test-ingest-s3-dest.sh' 'test-ingest-s3.sh' 'test-ingest-s3-minio.sh' 'test-ingest-azure.sh' diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 606e14436c..e24bf1b0e8 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.28-dev4" # pragma: no cover +__version__ = "0.10.28-dev5" # pragma: no cover diff --git a/unstructured/ingest/cli/cmds/__init__.py b/unstructured/ingest/cli/cmds/__init__.py index ee494e4858..7a75a5666b 100644 --- a/unstructured/ingest/cli/cmds/__init__.py +++ b/unstructured/ingest/cli/cmds/__init__.py @@ -6,17 +6,22 @@ from unstructured.ingest.cli.base.src import BaseSrcCmd from .airtable import get_base_src_cmd as airtable_base_src_cmd +from .azure import get_base_dest_cmd as azure_base_dest_cmd from .azure import get_base_src_cmd as azure_base_src_cmd from .azure_cognitive_search import get_base_dest_cmd as azure_cognitive_search_base_dest_cmd from .biomed import get_base_src_cmd as biomed_base_src_cmd +from .box import get_base_dest_cmd as box_base_dest_cmd from .box import get_base_src_cmd as box_base_src_cmd from .confluence import get_base_src_cmd as confluence_base_src_cmd from .delta_table import get_base_dest_cmd as delta_table_dest_cmd from .delta_table import get_base_src_cmd as delta_table_base_src_cmd from .discord import get_base_src_cmd as discord_base_src_cmd +from .dropbox import get_base_dest_cmd as dropbox_base_dest_cmd from .dropbox import get_base_src_cmd as dropbox_base_src_cmd from .elasticsearch import get_base_src_cmd as elasticsearch_base_src_cmd +from .fsspec import get_base_dest_cmd as fsspec_base_dest_cmd from .fsspec import get_base_src_cmd as fsspec_base_src_cmd +from .gcs import get_base_dest_cmd as gcs_base_dest_cmd from .gcs import get_base_src_cmd as gcs_base_src_cmd from .github import get_base_src_cmd as github_base_src_cmd from .gitlab import get_base_src_cmd as gitlab_base_src_cmd @@ -76,6 +81,11 @@ ) base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [ + azure_base_dest_cmd, + box_base_dest_cmd, + dropbox_base_dest_cmd, + fsspec_base_dest_cmd, + gcs_base_dest_cmd, s3_base_dest_cmd, azure_cognitive_search_base_dest_cmd, delta_table_dest_cmd, diff --git a/unstructured/ingest/cli/cmds/azure.py b/unstructured/ingest/cli/cmds/azure.py index 12537fe52a..133f840d57 100644 --- a/unstructured/ingest/cli/cmds/azure.py +++ b/unstructured/ingest/cli/cmds/azure.py @@ -9,6 +9,8 @@ ) from unstructured.ingest.interfaces import BaseConfig +CMD_NAME = "azure" + @dataclass class AzureCliConfig(BaseConfig, CliMixin): @@ -40,5 +42,12 @@ def get_cli_options() -> t.List[click.Option]: def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd(cmd_name="azure", cli_config=AzureCliConfig, is_fsspec=True) + cmd_cls = BaseSrcCmd(cmd_name=CMD_NAME, cli_config=AzureCliConfig, is_fsspec=True) + return cmd_cls + + +def get_base_dest_cmd(): + from unstructured.ingest.cli.base.dest import BaseDestCmd + + cmd_cls = BaseDestCmd(cmd_name=CMD_NAME, cli_config=AzureCliConfig, is_fsspec=True) return cmd_cls diff --git a/unstructured/ingest/cli/cmds/box.py b/unstructured/ingest/cli/cmds/box.py index 35e3c58d9b..8a361152d3 100644 --- a/unstructured/ingest/cli/cmds/box.py +++ b/unstructured/ingest/cli/cmds/box.py @@ -9,6 +9,8 @@ ) from unstructured.ingest.interfaces import BaseConfig +CMD_NAME = "box" + @dataclass class 
BoxCliConfig(BaseConfig, CliMixin): @@ -27,5 +29,12 @@ def get_cli_options() -> t.List[click.Option]: def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd(cmd_name="box", cli_config=BoxCliConfig, is_fsspec=True) + cmd_cls = BaseSrcCmd(cmd_name=CMD_NAME, cli_config=BoxCliConfig, is_fsspec=True) + return cmd_cls + + +def get_base_dest_cmd(): + from unstructured.ingest.cli.base.dest import BaseDestCmd + + cmd_cls = BaseDestCmd(cmd_name=CMD_NAME, cli_config=BoxCliConfig, is_fsspec=True) return cmd_cls diff --git a/unstructured/ingest/cli/cmds/dropbox.py b/unstructured/ingest/cli/cmds/dropbox.py index 13f21ca998..0df8b55bb3 100644 --- a/unstructured/ingest/cli/cmds/dropbox.py +++ b/unstructured/ingest/cli/cmds/dropbox.py @@ -9,6 +9,8 @@ ) from unstructured.ingest.interfaces import BaseConfig +CMD_NAME = "dropbox" + @dataclass class DropboxCliConfig(BaseConfig, CliMixin): @@ -27,5 +29,12 @@ def get_cli_options() -> t.List[click.Option]: def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd(cmd_name="dropbox", cli_config=DropboxCliConfig, is_fsspec=True) + cmd_cls = BaseSrcCmd(cmd_name=CMD_NAME, cli_config=DropboxCliConfig, is_fsspec=True) + return cmd_cls + + +def get_base_dest_cmd(): + from unstructured.ingest.cli.base.dest import BaseDestCmd + + cmd_cls = BaseDestCmd(cmd_name=CMD_NAME, cli_config=DropboxCliConfig, is_fsspec=True) return cmd_cls diff --git a/unstructured/ingest/cli/cmds/fsspec.py b/unstructured/ingest/cli/cmds/fsspec.py index d081c45b01..e2d50a278d 100644 --- a/unstructured/ingest/cli/cmds/fsspec.py +++ b/unstructured/ingest/cli/cmds/fsspec.py @@ -1,6 +1,15 @@ from unstructured.ingest.cli.base.src import BaseSrcCmd +CMD_NAME = "fsspec" + def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd(cmd_name="fsspec", is_fsspec=True) + cmd_cls = BaseSrcCmd(cmd_name=CMD_NAME, is_fsspec=True) + return cmd_cls + + +def get_base_dest_cmd(): + from unstructured.ingest.cli.base.dest import BaseDestCmd + + cmd_cls = BaseDestCmd(cmd_name=CMD_NAME, is_fsspec=True) return cmd_cls diff --git a/unstructured/ingest/cli/cmds/gcs.py b/unstructured/ingest/cli/cmds/gcs.py index ccec32491b..cb6ea80436 100644 --- a/unstructured/ingest/cli/cmds/gcs.py +++ b/unstructured/ingest/cli/cmds/gcs.py @@ -9,6 +9,8 @@ ) from unstructured.ingest.interfaces import BaseConfig +CMD_NAME = "gcs" + @dataclass class GcsCliConfig(BaseConfig, CliMixin): @@ -29,5 +31,12 @@ def get_cli_options() -> t.List[click.Option]: def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd(cmd_name="gcs", cli_config=GcsCliConfig, is_fsspec=True) + cmd_cls = BaseSrcCmd(cmd_name=CMD_NAME, cli_config=GcsCliConfig, is_fsspec=True) + return cmd_cls + + +def get_base_dest_cmd(): + from unstructured.ingest.cli.base.dest import BaseDestCmd + + cmd_cls = BaseDestCmd(cmd_name=CMD_NAME, cli_config=GcsCliConfig, is_fsspec=True) return cmd_cls diff --git a/unstructured/ingest/cli/interfaces.py b/unstructured/ingest/cli/interfaces.py index 0fd688e3e0..20156e46ef 100644 --- a/unstructured/ingest/cli/interfaces.py +++ b/unstructured/ingest/cli/interfaces.py @@ -473,32 +473,23 @@ def from_dict( doesn't require that as part of the field names in this class. It also checks if the CLI params are provided as intended. 
""" - - if ( - isinstance(kvs, dict) - and any( - [ - kvs["permissions_application_id"] - or kvs["permissions_client_cred"] - or kvs["permissions_tenant"], - ], - ) - and not all( - [ - kvs["permissions_application_id"] - and kvs["permissions_client_cred"] - and kvs["permissions_tenant"], - ], - ) - ): - raise ValueError( - "Please provide either none or all of the following optional values:\n" - "--permissions-application-id\n" - "--permissions-client-cred\n" - "--permissions-tenant", - ) - if isinstance(kvs, dict): + permissions_application_id = kvs.get("permissions_application_id") + permissions_client_cred = kvs.get("permissions_client_cred") + permissions_tenant = kvs.get("permissions_tenant") + permission_values = [ + permissions_application_id, + permissions_client_cred, + permissions_tenant, + ] + if any(permission_values) and not all(permission_values): + raise ValueError( + "Please provide either none or all of the following optional values:\n" + "--permissions-application-id\n" + "--permissions-client-cred\n" + "--permissions-tenant", + ) + new_kvs = { k[len("permissions_") :]: v # noqa: E203 for k, v in kvs.items() diff --git a/unstructured/ingest/connector/azure.py b/unstructured/ingest/connector/azure.py index 004ca782b4..4f80af2ce1 100644 --- a/unstructured/ingest/connector/azure.py +++ b/unstructured/ingest/connector/azure.py @@ -35,6 +35,7 @@ def __post_init__(self): self.ingest_doc_cls: t.Type[AzureBlobStorageIngestDoc] = AzureBlobStorageIngestDoc +@requires_dependencies(["adlfs", "fsspec"], extras="azure") @dataclass class AzureBlobStorageDestinationConnector(FsspecDestinationConnector): connector_config: SimpleAzureBlobStorageConfig diff --git a/unstructured/ingest/connector/box.py b/unstructured/ingest/connector/box.py index 5c63ecd30d..39922f9192 100644 --- a/unstructured/ingest/connector/box.py +++ b/unstructured/ingest/connector/box.py @@ -61,6 +61,7 @@ def __post_init__(self): self.ingest_doc_cls: t.Type[BoxIngestDoc] = BoxIngestDoc +@requires_dependencies(["boxfs", "fsspec"], extras="box") @dataclass class BoxDestinationConnector(FsspecDestinationConnector): connector_config: SimpleBoxConfig diff --git a/unstructured/ingest/connector/dropbox.py b/unstructured/ingest/connector/dropbox.py index 000e0709d0..edbb1d0b9b 100644 --- a/unstructured/ingest/connector/dropbox.py +++ b/unstructured/ingest/connector/dropbox.py @@ -135,6 +135,7 @@ def _list_files(self): ] +@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox") @dataclass class DropboxDestinationConnector(FsspecDestinationConnector): connector_config: SimpleFsspecConfig diff --git a/unstructured/ingest/connector/fsspec.py b/unstructured/ingest/connector/fsspec.py index 1aaf2aeac5..1b3cf5bfbc 100644 --- a/unstructured/ingest/connector/fsspec.py +++ b/unstructured/ingest/connector/fsspec.py @@ -2,7 +2,7 @@ import os import typing as t from contextlib import suppress -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path, PurePath from unstructured.ingest.compression_support import ( @@ -20,6 +20,7 @@ IngestDocCleanupMixin, SourceConnectorCleanupMixin, SourceMetadata, + WriteConfig, ) from unstructured.ingest.logger import logger from unstructured.utils import ( @@ -221,9 +222,15 @@ def get_ingest_docs(self): return docs +@dataclass +class FsspecWriteConfig(WriteConfig): + write_text_kwargs: t.Dict[str, t.Any] = field(default_factory=dict) + + @dataclass class FsspecDestinationConnector(BaseDestinationConnector): connector_config: SimpleFsspecConfig + 
write_config: FsspecWriteConfig

     def initialize(self):
         from fsspec import AbstractFileSystem, get_filesystem_class
@@ -255,13 +262,18 @@ def write_dict(
             filename.strip(os.sep) if filename else filename
         )  # Make sure filename doesn't begin with file separator
         output_path = str(PurePath(output_folder, filename)) if filename else output_folder
-        full_output_path = f"s3://{output_path}"
-        logger.debug(f"uploading content to {full_output_path}")
-        fs.write_text(full_output_path, json.dumps(json_list, indent=indent), encoding=encoding)
+        full_dest_path = f"{self.connector_config.protocol}://{output_path}"
+        logger.debug(f"uploading content to {full_dest_path}")
+        fs.write_text(
+            full_dest_path,
+            json.dumps(json_list, indent=indent),
+            encoding=encoding,
+            **self.write_config.write_text_kwargs,
+        )

     def write(self, docs: t.List[BaseIngestDoc]) -> None:
         for doc in docs:
-            file_path = doc.base_filename
+            file_path = doc.base_output_filename
             filename = file_path if file_path else None
             with open(doc._output_filename) as json_file:
                 logger.debug(f"uploading content from {doc._output_filename}")
diff --git a/unstructured/ingest/connector/gcs.py b/unstructured/ingest/connector/gcs.py
index 1a75fef2ea..992bb42c5a 100644
--- a/unstructured/ingest/connector/gcs.py
+++ b/unstructured/ingest/connector/gcs.py
@@ -35,6 +35,7 @@ def __post_init__(self):
         self.ingest_doc_cls: Type[GcsIngestDoc] = GcsIngestDoc


+@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
 @dataclass
 class GcsDestinationConnector(FsspecDestinationConnector):
     connector_config: SimpleGcsConfig
diff --git a/unstructured/ingest/connector/local.py b/unstructured/ingest/connector/local.py
index b23a1021de..7c753acb5d 100644
--- a/unstructured/ingest/connector/local.py
+++ b/unstructured/ingest/connector/local.py
@@ -39,6 +39,13 @@ class LocalIngestDoc(BaseIngestDoc):
     path: str
     registry_name: str = "local"

+    @property
+    def base_filename(self) -> t.Optional[str]:
+        download_path = str(Path(self.connector_config.input_path).resolve())
+        full_path = str(self.filename)
+        base_path = full_path.replace(download_path, "")
+        return base_path
+
     @property
     def filename(self):
         """The filename of the local file to be processed"""
@@ -71,7 +78,7 @@ def _output_filename(self) -> Path:
         """
         input_path = Path(self.connector_config.input_path)
         basename = (
-            f"{Path(self.path).name}.json"
+            f"{self.base_filename}.json"
             if input_path.is_file()
             else f"{Path(self.path).relative_to(input_path)}.json"
         )
diff --git a/unstructured/ingest/connector/s3.py b/unstructured/ingest/connector/s3.py
index b3699025f0..8b4ec7a350 100644
--- a/unstructured/ingest/connector/s3.py
+++ b/unstructured/ingest/connector/s3.py
@@ -34,6 +34,7 @@ def __post_init__(self):
         self.ingest_doc_cls: Type[S3IngestDoc] = S3IngestDoc


+@requires_dependencies(["s3fs", "fsspec"], extras="s3")
 @dataclass
 class S3DestinationConnector(FsspecDestinationConnector):
     connector_config: SimpleS3Config
diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py
index e15312e03b..382962ef3c 100644
--- a/unstructured/ingest/interfaces.py
+++ b/unstructured/ingest/interfaces.py
@@ -132,6 +132,13 @@ def __post_init__(self):
             self.file_path = ""
             return

+        # dropbox paths can start with slash
+        match = re.match(rf"{self.protocol}:///([^/\s]+?)/([^\s]*)", self.remote_url)
+        if match and self.protocol == "dropbox":
+            self.dir_path = match.group(1)
+            self.file_path = match.group(2) or ""
+            return
+
         # just a path with no trailing prefix
         match = re.match(rf"{self.protocol}://([^/\s]+?)(/*)$",
self.remote_url) if match: @@ -334,6 +341,15 @@ def base_filename(self) -> t.Optional[str]: return base_path return None + @property + def base_output_filename(self) -> t.Optional[str]: + if self.processor_config.output_dir and self._output_filename: + output_path = str(Path(self.processor_config.output_dir).resolve()) + full_path = str(self._output_filename) + base_path = full_path.replace(output_path, "") + return base_path + return None + @property @abstractmethod def _output_filename(self): diff --git a/unstructured/ingest/runner/writers.py b/unstructured/ingest/runner/writers.py deleted file mode 100644 index 791bbfeefc..0000000000 --- a/unstructured/ingest/runner/writers.py +++ /dev/null @@ -1,83 +0,0 @@ -import typing as t -from pathlib import Path - -from unstructured.ingest.interfaces import WriteConfig -from unstructured.utils import requires_dependencies - - -@requires_dependencies(["s3fs", "fsspec"], extras="s3") -def s3_writer( - remote_url: str, - anonymous: bool, - endpoint_url: t.Optional[str] = None, - verbose: bool = False, - **kwargs, -): - from unstructured.ingest.connector.s3 import ( - S3DestinationConnector, - SimpleS3Config, - ) - - access_kwargs: t.Dict[str, t.Any] = {"anon": anonymous} - if endpoint_url: - access_kwargs["endpoint_url"] = endpoint_url - - return S3DestinationConnector( - write_config=WriteConfig(), - connector_config=SimpleS3Config( - remote_url=remote_url, - access_kwargs=access_kwargs, - ), - ) - - -@requires_dependencies(["azure"], extras="azure-cognitive-search") -def azure_cognitive_search_writer( - endpoint: str, - key: str, - index: str, - **kwargs, -): - from unstructured.ingest.connector.azure_cognitive_search import ( - AzureCognitiveSearchDestinationConnector, - AzureCognitiveSearchWriteConfig, - SimpleAzureCognitiveSearchStorageConfig, - ) - - return AzureCognitiveSearchDestinationConnector( - write_config=AzureCognitiveSearchWriteConfig( - index=index, - ), - connector_config=SimpleAzureCognitiveSearchStorageConfig( - endpoint=endpoint, - key=key, - ), - ) - - -@requires_dependencies(["deltalake"], extras="delta-table") -def delta_table_writer( - table_uri: t.Union[str, Path], - write_column: str, - mode: t.Literal["error", "append", "overwrite", "ignore"] = "error", - **kwargs, -): - from unstructured.ingest.connector.delta_table import ( - DeltaTableDestinationConnector, - DeltaTableWriteConfig, - SimpleDeltaTableConfig, - ) - - return DeltaTableDestinationConnector( - write_config=DeltaTableWriteConfig(write_column=write_column, mode=mode), - connector_config=SimpleDeltaTableConfig( - table_uri=table_uri, - ), - ) - - -writer_map: t.Dict[str, t.Callable] = { - "s3": s3_writer, - "delta_table": delta_table_writer, - "azure_cognitive_search": azure_cognitive_search_writer, -} diff --git a/unstructured/ingest/runner/writers/__init__.py b/unstructured/ingest/runner/writers/__init__.py new file mode 100644 index 0000000000..701d77dbe2 --- /dev/null +++ b/unstructured/ingest/runner/writers/__init__.py @@ -0,0 +1,21 @@ +import typing as t + +from .azure import azure_writer +from .azure_cognitive_search import azure_cognitive_search_writer +from .box import box_writer +from .delta_table import delta_table_writer +from .dropbox import dropbox_writer +from .gcs import gcs_writer +from .s3 import s3_writer + +writer_map: t.Dict[str, t.Callable] = { + "azure": azure_writer, + "azure_cognitive_search": azure_cognitive_search_writer, + "box": box_writer, + "delta_table": delta_table_writer, + "dropbox": dropbox_writer, + "gcs": gcs_writer, + 
"s3": s3_writer, +} + +__all__ = ["writer_map"] diff --git a/unstructured/ingest/runner/writers/azure.py b/unstructured/ingest/runner/writers/azure.py new file mode 100644 index 0000000000..306825eb2f --- /dev/null +++ b/unstructured/ingest/runner/writers/azure.py @@ -0,0 +1,37 @@ +import typing as t + +from unstructured.ingest.interfaces import BaseDestinationConnector + + +def azure_writer( + remote_url: str, + account_name: t.Optional[str] = None, + account_key: t.Optional[str] = None, + connection_string: t.Optional[str] = None, + overwrite: bool = False, + verbose: bool = False, + **kwargs, +) -> BaseDestinationConnector: + from unstructured.ingest.connector.azure import ( + AzureBlobStorageDestinationConnector, + SimpleAzureBlobStorageConfig, + ) + from unstructured.ingest.connector.fsspec import FsspecWriteConfig + + if account_name: + access_kwargs = { + "account_name": account_name, + "account_key": account_key, + } + elif connection_string: + access_kwargs = {"connection_string": connection_string} + else: + access_kwargs = {} + + return AzureBlobStorageDestinationConnector( + write_config=FsspecWriteConfig(write_text_kwargs={"overwrite": overwrite}), + connector_config=SimpleAzureBlobStorageConfig( + remote_url=remote_url, + access_kwargs=access_kwargs, + ), + ) diff --git a/unstructured/ingest/runner/writers/azure_cognitive_search.py b/unstructured/ingest/runner/writers/azure_cognitive_search.py new file mode 100644 index 0000000000..9d69a16d04 --- /dev/null +++ b/unstructured/ingest/runner/writers/azure_cognitive_search.py @@ -0,0 +1,24 @@ +from unstructured.ingest.interfaces import BaseDestinationConnector + + +def azure_cognitive_search_writer( + endpoint: str, + key: str, + index: str, + **kwargs, +) -> BaseDestinationConnector: + from unstructured.ingest.connector.azure_cognitive_search import ( + AzureCognitiveSearchDestinationConnector, + AzureCognitiveSearchWriteConfig, + SimpleAzureCognitiveSearchStorageConfig, + ) + + return AzureCognitiveSearchDestinationConnector( + write_config=AzureCognitiveSearchWriteConfig( + index=index, + ), + connector_config=SimpleAzureCognitiveSearchStorageConfig( + endpoint=endpoint, + key=key, + ), + ) diff --git a/unstructured/ingest/runner/writers/box.py b/unstructured/ingest/runner/writers/box.py new file mode 100644 index 0000000000..8dfb0bf901 --- /dev/null +++ b/unstructured/ingest/runner/writers/box.py @@ -0,0 +1,31 @@ +import typing as t + +from unstructured.ingest.interfaces import BaseDestinationConnector +from unstructured.utils import requires_dependencies + + +@requires_dependencies(["boxfs", "fsspec"], extras="box") +def box_writer( + remote_url: str, + box_app_config: t.Optional[str], + verbose: bool = False, + **kwargs, +) -> BaseDestinationConnector: + import boxsdk + + from unstructured.ingest.connector.box import ( + BoxDestinationConnector, + SimpleBoxConfig, + ) + from unstructured.ingest.connector.fsspec import FsspecWriteConfig + + access_kwargs: t.Dict[str, t.Any] = {"box_app_config": box_app_config} + if verbose: + access_kwargs["client_type"] = boxsdk.LoggingClient + return BoxDestinationConnector( + write_config=FsspecWriteConfig(), + connector_config=SimpleBoxConfig( + remote_url=remote_url, + access_kwargs=access_kwargs, + ), + ) diff --git a/unstructured/ingest/runner/writers/delta_table.py b/unstructured/ingest/runner/writers/delta_table.py new file mode 100644 index 0000000000..f0771aa1e7 --- /dev/null +++ b/unstructured/ingest/runner/writers/delta_table.py @@ -0,0 +1,24 @@ +import typing as t +from pathlib 
import Path + +from unstructured.ingest.interfaces import BaseDestinationConnector + + +def delta_table_writer( + table_uri: t.Union[str, Path], + write_column: str, + mode: t.Literal["error", "append", "overwrite", "ignore"] = "error", + **kwargs, +) -> BaseDestinationConnector: + from unstructured.ingest.connector.delta_table import ( + DeltaTableDestinationConnector, + DeltaTableWriteConfig, + SimpleDeltaTableConfig, + ) + + return DeltaTableDestinationConnector( + write_config=DeltaTableWriteConfig(write_column=write_column, mode=mode), + connector_config=SimpleDeltaTableConfig( + table_uri=table_uri, + ), + ) diff --git a/unstructured/ingest/runner/writers/dropbox.py b/unstructured/ingest/runner/writers/dropbox.py new file mode 100644 index 0000000000..2828d33c72 --- /dev/null +++ b/unstructured/ingest/runner/writers/dropbox.py @@ -0,0 +1,24 @@ +import typing as t + +from unstructured.ingest.interfaces import BaseDestinationConnector + + +def dropbox_writer( + remote_url: str, + token: t.Optional[str], + verbose: bool = False, + **kwargs, +) -> BaseDestinationConnector: + from unstructured.ingest.connector.dropbox import ( + DropboxDestinationConnector, + SimpleDropboxConfig, + ) + from unstructured.ingest.connector.fsspec import FsspecWriteConfig + + return DropboxDestinationConnector( + write_config=FsspecWriteConfig(), + connector_config=SimpleDropboxConfig( + remote_url=remote_url, + access_kwargs={"token": token}, + ), + ) diff --git a/unstructured/ingest/runner/writers/gcs.py b/unstructured/ingest/runner/writers/gcs.py new file mode 100644 index 0000000000..3f0000d26a --- /dev/null +++ b/unstructured/ingest/runner/writers/gcs.py @@ -0,0 +1,24 @@ +import typing as t + +from unstructured.ingest.interfaces import BaseDestinationConnector + + +def gcs_writer( + remote_url: str, + token: t.Optional[str], + verbose: bool = False, + **kwargs, +) -> BaseDestinationConnector: + from unstructured.ingest.connector.fsspec import FsspecWriteConfig + from unstructured.ingest.connector.gcs import ( + GcsDestinationConnector, + SimpleGcsConfig, + ) + + return GcsDestinationConnector( + write_config=FsspecWriteConfig(), + connector_config=SimpleGcsConfig( + remote_url=remote_url, + access_kwargs={"token": token}, + ), + ) diff --git a/unstructured/ingest/runner/writers/s3.py b/unstructured/ingest/runner/writers/s3.py new file mode 100644 index 0000000000..27bc1dd863 --- /dev/null +++ b/unstructured/ingest/runner/writers/s3.py @@ -0,0 +1,29 @@ +import typing as t + +from unstructured.ingest.interfaces import BaseDestinationConnector + + +def s3_writer( + remote_url: str, + anonymous: bool, + endpoint_url: t.Optional[str] = None, + verbose: bool = False, + **kwargs, +) -> BaseDestinationConnector: + from unstructured.ingest.connector.fsspec import FsspecWriteConfig + from unstructured.ingest.connector.s3 import ( + S3DestinationConnector, + SimpleS3Config, + ) + + access_kwargs: t.Dict[str, t.Any] = {"anon": anonymous} + if endpoint_url: + access_kwargs["endpoint_url"] = endpoint_url + + return S3DestinationConnector( + write_config=FsspecWriteConfig(), + connector_config=SimpleS3Config( + remote_url=remote_url, + access_kwargs=access_kwargs, + ), + )
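To make the new FsspecWriteConfig concrete: its write_text_kwargs are forwarded verbatim to fsspec's write_text call in FsspecDestinationConnector.write_dict, which is how the Azure writer's overwrite flag reaches the underlying adlfs filesystem. A short sketch under those assumptions (the connection string and container path are placeholders, not values from this PR):

    from unstructured.ingest.connector.azure import (
        AzureBlobStorageDestinationConnector,
        SimpleAzureBlobStorageConfig,
    )
    from unstructured.ingest.connector.fsspec import FsspecWriteConfig

    connector = AzureBlobStorageDestinationConnector(
        # overwrite=True is passed through to fs.write_text(..., **write_text_kwargs)
        write_config=FsspecWriteConfig(write_text_kwargs={"overwrite": True}),
        connector_config=SimpleAzureBlobStorageConfig(
            remote_url="abfs://my-container/output/",  # placeholder container/path
            access_kwargs={"connection_string": "<AZURE_STORAGE_CONNECTION_STRING>"},  # placeholder
        ),
    )
    connector.initialize()
    # connector.write(docs) uploads each doc's JSON output, overwriting existing blobs

Keeping the pass-through generic in write_dict, rather than special-casing each backend, is what lets every fsspec destination share one write path while still exposing backend-specific options like overwrite.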