From 668bd2967eecd70484ba67f4116caa5b9f1f4f6d Mon Sep 17 00:00:00 2001
From: cragwolfe
Date: Thu, 2 Nov 2023 16:31:04 -0700
Subject: [PATCH 1/2] chore: update CHANGELOG.md (#1997)

Remove bullets not related to end-user consumption of the unstructured library.

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
---
 CHANGELOG.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 69e79b13f2..be30d57ab1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,9 +2,6 @@
 
 ### Enhancements
 
-* **Add doctype field to CI metric functions** Adds a doctype column to the ingest metric sheets for use in subsequent aggregations.
-* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
-* **Separate chipper tests** Chipper tests are long-running and require special access, so the tests have been separated into their own file under their own marker, and now have a separate `make` target.
 * **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning.
 * **Add retry logic for all source connectors** All http calls being made by the ingest source connectors have been isolated and wrapped by the `SourceConnectionNetworkError` custom error, which triggers the retry logic, if enabled, in the ingest pipeline.
 * **Google Drive source connector supports credentials from memory** Originally, the connector expected a filepath to pull the credentials from when creating the client. This was expanded to support passing that information from memory as a dict if access to the file system might not be available.
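As a quick illustration of the `include_header` bullet retained above, here is a minimal usage sketch; the CSV path and the printed metadata field are illustrative assumptions and not part of this patch:

```python
# Hypothetical example of the include_header option described in the changelog
# bullet above; "example.csv" is a placeholder file path.
from unstructured.partition.csv import partition_csv

# With include_header=True the header row is retained in the partitioned output.
elements = partition_csv(filename="example.csv", include_header=True)
print(elements[0].metadata.text_as_html)  # table rendering, including the header row
```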
From d09c8c0cab59ee730e63db564eaa84773e6a9148 Mon Sep 17 00:00:00 2001
From: Roman Isecke <136338424+rbiseck3@users.noreply.github.com>
Date: Fri, 3 Nov 2023 08:46:56 -0400
Subject: [PATCH 2/2] test: update ingest dest tests to follow set pattern
 (#1991)

### Description
Update all destination tests to follow the established pattern:
* Don't omit any metadata, so the full schema is checked
* Move the azure cognitive search dest test from src to dest
* Split the delta table test into separate src and dest tests
* Fix azure cognitive search and add it to the dest tests being run (it wasn't being run originally)
---
 CHANGELOG.md                                  |  2 +-
 .../azure_cognitive_sample_index_schema.json  |  8 ++++
 .../{src => dest}/azure-cognitive-search.sh   | 23 +++++-----
 test_unstructured_ingest/dest/azure.sh        |  1 -
 test_unstructured_ingest/dest/box.sh          |  1 -
 test_unstructured_ingest/dest/delta-table.sh  | 43 +++++++++++++++++++
 test_unstructured_ingest/dest/dropbox.sh      |  1 -
 test_unstructured_ingest/dest/gcs.sh          |  1 -
 test_unstructured_ingest/dest/s3.sh           |  1 -
 .../files/azure_cognitive_index_schema.json   |  8 ++++
 .../python/test-ingest-delta-table-output.py  | 10 ++++-
 test_unstructured_ingest/src/delta-table.sh   |  9 +---
 test_unstructured_ingest/test-ingest-dest.sh  |  2 +
 unstructured/__version__.py                   |  2 +-
 .../connector/azure_cognitive_search.py       |  6 +++
 15 files changed, 90 insertions(+), 28 deletions(-)
 rename test_unstructured_ingest/{src => dest}/azure-cognitive-search.sh (87%)
 create mode 100755 test_unstructured_ingest/dest/delta-table.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index be30d57ab1..ebe7f95f84 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.10.29-dev10
+## 0.10.29-dev11
 
 ### Enhancements
 
diff --git a/docs/source/ingest/destination_connectors/azure_cognitive_sample_index_schema.json b/docs/source/ingest/destination_connectors/azure_cognitive_sample_index_schema.json
index 3b6e55e568..593d356690 100644
--- a/docs/source/ingest/destination_connectors/azure_cognitive_sample_index_schema.json
+++ b/docs/source/ingest/destination_connectors/azure_cognitive_sample_index_schema.json
@@ -82,6 +82,10 @@
         "name": "date_processed",
         "type": "Edm.DateTimeOffset"
       },
+      {
+        "name": "permissions_data",
+        "type": "Edm.String"
+      },
       {
         "name": "record_locator",
         "type": "Edm.String"
@@ -114,6 +118,10 @@
         "name": "page_number",
         "type": "Edm.String"
       },
+      {
+        "name": "links",
+        "type": "Collection(Edm.String)"
+      },
       {
         "name": "url",
         "type": "Edm.String"
diff --git a/test_unstructured_ingest/src/azure-cognitive-search.sh b/test_unstructured_ingest/dest/azure-cognitive-search.sh
similarity index 87%
rename from test_unstructured_ingest/src/azure-cognitive-search.sh
rename to test_unstructured_ingest/dest/azure-cognitive-search.sh
index 8e6224bd9b..debef215bc 100755
--- a/test_unstructured_ingest/src/azure-cognitive-search.sh
+++ b/test_unstructured_ingest/dest/azure-cognitive-search.sh
@@ -5,10 +5,12 @@ set -e
 SRC_PATH=$(dirname "$(realpath "$0")")
 SCRIPT_DIR=$(dirname "$SRC_PATH")
 cd "$SCRIPT_DIR"/.. || exit 1
-OUTPUT_FOLDER_NAME=s3-azure-cog-search-dest
 OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
 OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
 WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
+OUTPUT_FOLDER_NAME=azure-cog-search-dest
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+
 DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
 DESTINATION_INDEX="utic-test-ingest-fixtures-output-$(date +%s)"
 # The vector configs on the schema currently only exist on versions:
@@ -65,17 +67,14 @@ fi
 
 RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
 PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
-  s3 \
-  --download-dir "$DOWNLOAD_DIR" \
-  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
-  --strategy fast \
-  --preserve-downloads \
-  --reprocess \
-  --output-dir "$OUTPUT_DIR" \
-  --verbose \
-  --remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
-  --anonymous \
-  --work-dir "$WORK_DIR" \
+  local \
+  --num-processes "$max_processes" \
+  --output-dir "$OUTPUT_DIR" \
+  --strategy fast \
+  --verbose \
+  --reprocess \
+  --input-path example-docs/fake-memo.pdf \
+  --work-dir "$WORK_DIR" \
   azure-cognitive-search \
   --key "$AZURE_SEARCH_API_KEY" \
   --endpoint "$AZURE_SEARCH_ENDPOINT" \
diff --git a/test_unstructured_ingest/dest/azure.sh b/test_unstructured_ingest/dest/azure.sh
index c5bb749000..12b2c5c45c 100755
--- a/test_unstructured_ingest/dest/azure.sh
+++ b/test_unstructured_ingest/dest/azure.sh
@@ -39,7 +39,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
 PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
   local \
   --num-processes "$max_processes" \
-  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
   --output-dir "$OUTPUT_DIR" \
   --strategy fast \
   --verbose \
diff --git a/test_unstructured_ingest/dest/box.sh b/test_unstructured_ingest/dest/box.sh
index cdbd97c94c..6e44c106d7 100755
--- a/test_unstructured_ingest/dest/box.sh
+++ b/test_unstructured_ingest/dest/box.sh
@@ -41,7 +41,6 @@
 #PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
 #  local \
 #  --num-processes "$max_processes" \
-#  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
 #  --output-dir "$OUTPUT_DIR" \
 #  --strategy fast \
 #  --verbose \
diff --git a/test_unstructured_ingest/dest/delta-table.sh b/test_unstructured_ingest/dest/delta-table.sh
new file mode 100755
index 0000000000..e33a7f1108
--- /dev/null
+++ b/test_unstructured_ingest/dest/delta-table.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+set -e
+
+SRC_PATH=$(dirname "$(realpath "$0")")
+SCRIPT_DIR=$(dirname "$SRC_PATH")
+cd "$SCRIPT_DIR"/.. || exit 1
+OUTPUT_FOLDER_NAME=delta-table-dest
+OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
+WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
+DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
+DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+CI=${CI:-"false"}
+
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR"/cleanup.sh
+
+function cleanup() {
+  cleanup_dir "$DESTINATION_TABLE"
+  cleanup_dir "$OUTPUT_DIR"
+  cleanup_dir "$WORK_DIR"
+  if [ "$CI" == "true" ]; then
+    cleanup_dir "$DOWNLOAD_DIR"
+  fi
+}
+
+trap cleanup EXIT
+
+PYTHONPATH=. ./unstructured/ingest/main.py \
+  local \
+  --num-processes "$max_processes" \
+  --output-dir "$OUTPUT_DIR" \
+  --strategy fast \
+  --verbose \
+  --reprocess \
+  --input-path example-docs/fake-memo.pdf \
+  --work-dir "$WORK_DIR" \
+  delta-table \
+  --write-column json_data \
+  --table-uri "$DESTINATION_TABLE"
+
+python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"
diff --git a/test_unstructured_ingest/dest/dropbox.sh b/test_unstructured_ingest/dest/dropbox.sh
index 60a011c3e7..32184b7261 100755
--- a/test_unstructured_ingest/dest/dropbox.sh
+++ b/test_unstructured_ingest/dest/dropbox.sh
@@ -61,7 +61,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
 PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
   local \
   --num-processes "$max_processes" \
-  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
   --output-dir "$OUTPUT_DIR" \
   --strategy fast \
   --verbose \
diff --git a/test_unstructured_ingest/dest/gcs.sh b/test_unstructured_ingest/dest/gcs.sh
index c5bf144cbf..c65254e4fb 100755
--- a/test_unstructured_ingest/dest/gcs.sh
+++ b/test_unstructured_ingest/dest/gcs.sh
@@ -44,7 +44,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
 PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
   local \
   --num-processes "$max_processes" \
-  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
   --output-dir "$OUTPUT_DIR" \
   --strategy fast \
   --verbose \
diff --git a/test_unstructured_ingest/dest/s3.sh b/test_unstructured_ingest/dest/s3.sh
index 89540070c8..4b2ed33028 100755
--- a/test_unstructured_ingest/dest/s3.sh
+++ b/test_unstructured_ingest/dest/s3.sh
@@ -34,7 +34,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
 PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
   local \
   --num-processes "$max_processes" \
-  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
   --output-dir "$OUTPUT_DIR" \
   --strategy fast \
   --verbose \
diff --git a/test_unstructured_ingest/files/azure_cognitive_index_schema.json b/test_unstructured_ingest/files/azure_cognitive_index_schema.json
index d77fd8da32..522250f98a 100644
--- a/test_unstructured_ingest/files/azure_cognitive_index_schema.json
+++ b/test_unstructured_ingest/files/azure_cognitive_index_schema.json
@@ -81,6 +81,10 @@
         "name": "date_processed",
         "type": "Edm.DateTimeOffset"
       },
+      {
+        "name": "permissions_data",
+        "type": "Edm.String"
+      },
       {
         "name": "record_locator",
         "type": "Edm.String"
@@ -117,6 +121,10 @@
         "name": "page_number",
         "type": "Edm.String"
       },
+      {
+        "name": "links",
+        "type": "Collection(Edm.String)"
+      },
"Collection(Edm.String)" + }, { "name": "page_name", "type": "Edm.String" diff --git a/test_unstructured_ingest/python/test-ingest-delta-table-output.py b/test_unstructured_ingest/python/test-ingest-delta-table-output.py index 26c873ecd7..ae08c18c39 100755 --- a/test_unstructured_ingest/python/test-ingest-delta-table-output.py +++ b/test_unstructured_ingest/python/test-ingest-delta-table-output.py @@ -10,7 +10,15 @@ def run_check(table_uri): table_uri=table_uri, ) - assert len(delta_table.to_pandas()) == 10 + expected_rows = 5 + found_rows = len(delta_table.to_pandas()) + print( + f"Checking if expected number of rows ({expected_rows}) " + f"matches how many were found: {found_rows}" + ) + assert ( + expected_rows == found_rows + ), f"expected number of rows doesn't match how many were found: {expected_rows}/{found_rows}" print("table check complete") diff --git a/test_unstructured_ingest/src/delta-table.sh b/test_unstructured_ingest/src/delta-table.sh index e926959665..13b9e58135 100755 --- a/test_unstructured_ingest/src/delta-table.sh +++ b/test_unstructured_ingest/src/delta-table.sh @@ -10,7 +10,6 @@ OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -23,7 +22,6 @@ fi source "$SCRIPT_DIR"/cleanup.sh function cleanup() { - cleanup_dir "$DESTINATION_TABLE" cleanup_dir "$OUTPUT_DIR" cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then @@ -44,13 +42,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --storage_options "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" \ --preserve-downloads \ --verbose \ - --work-dir "$WORK_DIR" \ - delta-table \ - --write-column json_data \ - --table-uri "$DESTINATION_TABLE" + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME -python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE" - "$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-dest.sh b/test_unstructured_ingest/test-ingest-dest.sh index 707dc5a192..421702e2b9 100755 --- a/test_unstructured_ingest/test-ingest-dest.sh +++ b/test_unstructured_ingest/test-ingest-dest.sh @@ -10,7 +10,9 @@ export OMP_THREAD_LIMIT=1 all_tests=( 'azure.sh' + 'azure-cognitive-search.sh' 'box.sh' + 'delta-table.sh' 'dropbox.sh' 'gcs.sh' 's3.sh' diff --git a/unstructured/__version__.py b/unstructured/__version__.py index e95996b1c6..efd541c3cf 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.29-dev10" # pragma: no cover +__version__ = "0.10.29-dev11" # pragma: no cover diff --git a/unstructured/ingest/connector/azure_cognitive_search.py b/unstructured/ingest/connector/azure_cognitive_search.py index 57b9f23237..5fa6f5e09e 100644 --- a/unstructured/ingest/connector/azure_cognitive_search.py +++ b/unstructured/ingest/connector/azure_cognitive_search.py @@ -61,6 +61,12 @@ def conform_dict(self, data: dict) -> None: data["metadata"]["data_source"]["version"] = str(version) if record_locator := data.get("metadata", {}).get("data_source", {}).get("record_locator"): data["metadata"]["data_source"]["record_locator"] = json.dumps(record_locator) + if permissions_data := ( + 
+            data.get("metadata", {}).get("data_source", {}).get("permissions_data")
+        ):
+            data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data)
+        if links := data.get("metadata", {}).get("links"):
+            data["metadata"]["links"] = [json.dumps(link) for link in links]
         if last_modified := data.get("metadata", {}).get("last_modified"):
             data["metadata"]["last_modified"] = parser.parse(last_modified).strftime(
                 "%Y-%m-%dT%H:%M:%S.%fZ",
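For reference, a simplified standalone sketch of the conforming step added in the hunk above: nested `permissions_data` and `links` metadata are JSON-encoded so they fit the string-typed fields added to the index schemas earlier in this patch. The sample dict below is illustrative only; real element dicts carry many more fields.

```python
import json


def conform_metadata(data: dict) -> None:
    # Mirrors the added hunk: serialize nested metadata into strings so it
    # matches the Edm.String / Collection(Edm.String) fields in the index schema.
    if permissions_data := (
        data.get("metadata", {}).get("data_source", {}).get("permissions_data")
    ):
        data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data)
    if links := data.get("metadata", {}).get("links"):
        data["metadata"]["links"] = [json.dumps(link) for link in links]


# Illustrative input only.
doc = {
    "metadata": {
        "data_source": {"permissions_data": [{"role": "reader"}]},
        "links": [{"text": "home", "url": "https://example.com"}],
    }
}
conform_metadata(doc)
print(doc["metadata"]["data_source"]["permissions_data"])  # -> JSON string
print(doc["metadata"]["links"])                            # -> list of JSON strings
```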