From 668bd2967eecd70484ba67f4116caa5b9f1f4f6d Mon Sep 17 00:00:00 2001
From: cragwolfe
Date: Thu, 2 Nov 2023 16:31:04 -0700
Subject: [PATCH 1/2] chore: update CHANGELOG.md (#1997)

Remove bullets not related to end-user consumption of the unstructured library.

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
---
 CHANGELOG.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 69e79b13f2..be30d57ab1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,9 +2,6 @@
 
 ### Enhancements
 
-* **Add doctype field to CI metric functions** Adds a doctype column to the ingest metric sheets for use in subsequent aggregations.
-* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
-* **Separate chipper tests** Chipper tests are long-running and require special access, so the tests have been separated into their own file under their own marker, and now have a separate `make` target.
 * **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning.
 * **Add retry logic for all source connectors** All http calls being made by the ingest source connectors have been isolated and wrapped by the `SourceConnectionNetworkError` custom error, which triggers the retry logic, if enabled, in the ingest pipeline.
 * **Google Drive source connector supports credentials from memory** Originally, the connector expected a filepath to pull the credentials from when creating the client. This was expanded to support passing that information from memory as a dict if access to the file system might not be available.
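As a quick illustration of the `include_header` bullet retained above, here is a minimal usage sketch; the CSV path and the printed metadata field are illustrative assumptions and not part of this patch:

```python
# Hypothetical example of the include_header option described in the changelog
# bullet above; "example.csv" is a placeholder file path.
from unstructured.partition.csv import partition_csv

# With include_header=True the header row is retained in the partitioned output.
elements = partition_csv(filename="example.csv", include_header=True)
print(elements[0].metadata.text_as_html)  # table rendering, including the header row
```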
From d09c8c0cab59ee730e63db564eaa84773e6a9148 Mon Sep 17 00:00:00 2001
From: Roman Isecke <136338424+rbiseck3@users.noreply.github.com>
Date: Fri, 3 Nov 2023 08:46:56 -0400
Subject: [PATCH 2/2] test: update ingest dest tests to follow set pattern
 (#1991)

### Description
Update all destination tests to follow the established pattern:
* Don't omit any metadata, so the full schema is checked
* Move the azure cognitive search dest test from src to dest
* Split the delta table test into separate src and dest tests
* Fix azure cognitive search and add it to the dest tests being run (it wasn't being run originally)
---
 CHANGELOG.md                                  |  2 +-
 .../azure_cognitive_sample_index_schema.json  |  8 ++++
 .../{src => dest}/azure-cognitive-search.sh   | 23 +++++-----
 test_unstructured_ingest/dest/azure.sh        |  1 -
 test_unstructured_ingest/dest/box.sh          |  1 -
 test_unstructured_ingest/dest/delta-table.sh  | 43 +++++++++++++++++++
 test_unstructured_ingest/dest/dropbox.sh      |  1 -
 test_unstructured_ingest/dest/gcs.sh          |  1 -
 test_unstructured_ingest/dest/s3.sh           |  1 -
 .../files/azure_cognitive_index_schema.json   |  8 ++++
 .../python/test-ingest-delta-table-output.py  | 10 ++++-
 test_unstructured_ingest/src/delta-table.sh   |  9 +---
 test_unstructured_ingest/test-ingest-dest.sh  |  2 +
 unstructured/__version__.py                   |  2 +-
 .../connector/azure_cognitive_search.py       |  6 +++
 15 files changed, 90 insertions(+), 28 deletions(-)
 rename test_unstructured_ingest/{src => dest}/azure-cognitive-search.sh (87%)
 create mode 100755 test_unstructured_ingest/dest/delta-table.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
index be30d57ab1..ebe7f95f84 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.10.29-dev10
+## 0.10.29-dev11
 
 ### Enhancements
 
diff --git a/docs/source/ingest/destination_connectors/azure_cognitive_sample_index_schema.json b/docs/source/ingest/destination_connectors/azure_cognitive_sample_index_schema.json
index 3b6e55e568..593d356690 100644
--- a/docs/source/ingest/destination_connectors/azure_cognitive_sample_index_schema.json
+++ b/docs/source/ingest/destination_connectors/azure_cognitive_sample_index_schema.json
@@ -82,6 +82,10 @@
         "name": "date_processed",
         "type": "Edm.DateTimeOffset"
       },
+      {
+        "name": "permissions_data",
+        "type": "Edm.String"
+      },
       {
         "name": "record_locator",
         "type": "Edm.String"
@@ -114,6 +118,10 @@
         "name": "page_number",
         "type": "Edm.String"
       },
+      {
+        "name": "links",
+        "type": "Collection(Edm.String)"
+      },
       {
         "name": "url",
         "type": "Edm.String"
diff --git a/test_unstructured_ingest/src/azure-cognitive-search.sh b/test_unstructured_ingest/dest/azure-cognitive-search.sh
similarity index 87%
rename from test_unstructured_ingest/src/azure-cognitive-search.sh
rename to test_unstructured_ingest/dest/azure-cognitive-search.sh
index 8e6224bd9b..debef215bc 100755
--- a/test_unstructured_ingest/src/azure-cognitive-search.sh
+++ b/test_unstructured_ingest/dest/azure-cognitive-search.sh
@@ -5,10 +5,12 @@ set -e
 SRC_PATH=$(dirname "$(realpath "$0")")
 SCRIPT_DIR=$(dirname "$SRC_PATH")
 cd "$SCRIPT_DIR"/.. || exit 1
-OUTPUT_FOLDER_NAME=s3-azure-cog-search-dest
 OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
 OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
 WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
+OUTPUT_FOLDER_NAME=azure-cog-search-dest
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+
 DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
 DESTINATION_INDEX="utic-test-ingest-fixtures-output-$(date +%s)"
 # The vector configs on the schema currently only exist on versions:
@@ -65,17 +67,14 @@ fi
 
 RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
 PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
-  s3 \
-  --download-dir "$DOWNLOAD_DIR" \
-  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
-  --strategy fast \
-  --preserve-downloads \
-  --reprocess \
-  --output-dir "$OUTPUT_DIR" \
-  --verbose \
-  --remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
-  --anonymous \
-  --work-dir "$WORK_DIR" \
+  local \
+  --num-processes "$max_processes" \
+  --output-dir "$OUTPUT_DIR" \
+  --strategy fast \
+  --verbose \
+  --reprocess \
+  --input-path example-docs/fake-memo.pdf \
+  --work-dir "$WORK_DIR" \
   azure-cognitive-search \
   --key "$AZURE_SEARCH_API_KEY" \
   --endpoint "$AZURE_SEARCH_ENDPOINT" \
diff --git a/test_unstructured_ingest/dest/azure.sh b/test_unstructured_ingest/dest/azure.sh
index c5bb749000..12b2c5c45c 100755
--- a/test_unstructured_ingest/dest/azure.sh
+++ b/test_unstructured_ingest/dest/azure.sh
@@ -39,7 +39,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
 PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
   local \
   --num-processes "$max_processes" \
-  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
   --output-dir "$OUTPUT_DIR" \
   --strategy fast \
   --verbose \
diff --git a/test_unstructured_ingest/dest/box.sh b/test_unstructured_ingest/dest/box.sh
index cdbd97c94c..6e44c106d7 100755
--- a/test_unstructured_ingest/dest/box.sh
+++ b/test_unstructured_ingest/dest/box.sh
@@ -41,7 +41,6 @@
 #PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
 #  local \
 #  --num-processes "$max_processes" \
-#  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
 #  --output-dir "$OUTPUT_DIR" \
 #  --strategy fast \
 #  --verbose \
diff --git a/test_unstructured_ingest/dest/delta-table.sh b/test_unstructured_ingest/dest/delta-table.sh
new file mode 100755
index 0000000000..e33a7f1108
--- /dev/null
+++ b/test_unstructured_ingest/dest/delta-table.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+set -e
+
+SRC_PATH=$(dirname "$(realpath "$0")")
+SCRIPT_DIR=$(dirname "$SRC_PATH")
+cd "$SCRIPT_DIR"/.. || exit 1
+OUTPUT_FOLDER_NAME=delta-table-dest
+OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
+WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
+DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
+DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+CI=${CI:-"false"}
+
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR"/cleanup.sh
+
+function cleanup() {
+  cleanup_dir "$DESTINATION_TABLE"
+  cleanup_dir "$OUTPUT_DIR"
+  cleanup_dir "$WORK_DIR"
+  if [ "$CI" == "true" ]; then
+    cleanup_dir "$DOWNLOAD_DIR"
+  fi
+}
+
+trap cleanup EXIT
+
+PYTHONPATH=. ./unstructured/ingest/main.py \
+  local \
+  --num-processes "$max_processes" \
+  --output-dir "$OUTPUT_DIR" \
+  --strategy fast \
+  --verbose \
+  --reprocess \
+  --input-path example-docs/fake-memo.pdf \
+  --work-dir "$WORK_DIR" \
+  delta-table \
+  --write-column json_data \
+  --table-uri "$DESTINATION_TABLE"
+
+python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"
diff --git a/test_unstructured_ingest/dest/dropbox.sh b/test_unstructured_ingest/dest/dropbox.sh
index 60a011c3e7..32184b7261 100755
--- a/test_unstructured_ingest/dest/dropbox.sh
+++ b/test_unstructured_ingest/dest/dropbox.sh
@@ -61,7 +61,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
 PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
   local \
   --num-processes "$max_processes" \
-  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
   --output-dir "$OUTPUT_DIR" \
   --strategy fast \
   --verbose \
diff --git a/test_unstructured_ingest/dest/gcs.sh b/test_unstructured_ingest/dest/gcs.sh
index c5bf144cbf..c65254e4fb 100755
--- a/test_unstructured_ingest/dest/gcs.sh
+++ b/test_unstructured_ingest/dest/gcs.sh
@@ -44,7 +44,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
 PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
   local \
   --num-processes "$max_processes" \
-  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
   --output-dir "$OUTPUT_DIR" \
   --strategy fast \
   --verbose \
diff --git a/test_unstructured_ingest/dest/s3.sh b/test_unstructured_ingest/dest/s3.sh
index 89540070c8..4b2ed33028 100755
--- a/test_unstructured_ingest/dest/s3.sh
+++ b/test_unstructured_ingest/dest/s3.sh
@@ -34,7 +34,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
 PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
   local \
   --num-processes "$max_processes" \
-  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
   --output-dir "$OUTPUT_DIR" \
   --strategy fast \
   --verbose \
diff --git a/test_unstructured_ingest/files/azure_cognitive_index_schema.json b/test_unstructured_ingest/files/azure_cognitive_index_schema.json
index d77fd8da32..522250f98a 100644
--- a/test_unstructured_ingest/files/azure_cognitive_index_schema.json
+++ b/test_unstructured_ingest/files/azure_cognitive_index_schema.json
@@ -81,6 +81,10 @@
         "name": "date_processed",
         "type": "Edm.DateTimeOffset"
       },
+      {
+        "name": "permissions_data",
+        "type": "Edm.String"
+      },
       {
         "name": "record_locator",
         "type": "Edm.String"
@@ -117,6 +121,10 @@
         "name": "page_number",
         "type": "Edm.String"
       },
+      {
+        "name": "links",
+        "type": "Collection(Edm.String)"
+      },
"Collection(Edm.String)" + }, { "name": "page_name", "type": "Edm.String" diff --git a/test_unstructured_ingest/python/test-ingest-delta-table-output.py b/test_unstructured_ingest/python/test-ingest-delta-table-output.py index 26c873ecd7..ae08c18c39 100755 --- a/test_unstructured_ingest/python/test-ingest-delta-table-output.py +++ b/test_unstructured_ingest/python/test-ingest-delta-table-output.py @@ -10,7 +10,15 @@ def run_check(table_uri): table_uri=table_uri, ) - assert len(delta_table.to_pandas()) == 10 + expected_rows = 5 + found_rows = len(delta_table.to_pandas()) + print( + f"Checking if expected number of rows ({expected_rows}) " + f"matches how many were found: {found_rows}" + ) + assert ( + expected_rows == found_rows + ), f"expected number of rows doesn't match how many were found: {expected_rows}/{found_rows}" print("table check complete") diff --git a/test_unstructured_ingest/src/delta-table.sh b/test_unstructured_ingest/src/delta-table.sh index e926959665..13b9e58135 100755 --- a/test_unstructured_ingest/src/delta-table.sh +++ b/test_unstructured_ingest/src/delta-table.sh @@ -10,7 +10,6 @@ OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} CI=${CI:-"false"} @@ -23,7 +22,6 @@ fi source "$SCRIPT_DIR"/cleanup.sh function cleanup() { - cleanup_dir "$DESTINATION_TABLE" cleanup_dir "$OUTPUT_DIR" cleanup_dir "$WORK_DIR" if [ "$CI" == "true" ]; then @@ -44,13 +42,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --storage_options "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" \ --preserve-downloads \ --verbose \ - --work-dir "$WORK_DIR" \ - delta-table \ - --write-column json_data \ - --table-uri "$DESTINATION_TABLE" + --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME -python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE" - "$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-dest.sh b/test_unstructured_ingest/test-ingest-dest.sh index 707dc5a192..421702e2b9 100755 --- a/test_unstructured_ingest/test-ingest-dest.sh +++ b/test_unstructured_ingest/test-ingest-dest.sh @@ -10,7 +10,9 @@ export OMP_THREAD_LIMIT=1 all_tests=( 'azure.sh' + 'azure-cognitive-search.sh' 'box.sh' + 'delta-table.sh' 'dropbox.sh' 'gcs.sh' 's3.sh' diff --git a/unstructured/__version__.py b/unstructured/__version__.py index e95996b1c6..efd541c3cf 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.29-dev10" # pragma: no cover +__version__ = "0.10.29-dev11" # pragma: no cover diff --git a/unstructured/ingest/connector/azure_cognitive_search.py b/unstructured/ingest/connector/azure_cognitive_search.py index 57b9f23237..5fa6f5e09e 100644 --- a/unstructured/ingest/connector/azure_cognitive_search.py +++ b/unstructured/ingest/connector/azure_cognitive_search.py @@ -61,6 +61,12 @@ def conform_dict(self, data: dict) -> None: data["metadata"]["data_source"]["version"] = str(version) if record_locator := data.get("metadata", {}).get("data_source", {}).get("record_locator"): data["metadata"]["data_source"]["record_locator"] = json.dumps(record_locator) + if permissions_data := ( + 
+            data.get("metadata", {}).get("data_source", {}).get("permissions_data")
+        ):
+            data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data)
+        if links := data.get("metadata", {}).get("links"):
+            data["metadata"]["links"] = [json.dumps(link) for link in links]
         if last_modified := data.get("metadata", {}).get("last_modified"):
             data["metadata"]["last_modified"] = parser.parse(last_modified).strftime(
                 "%Y-%m-%dT%H:%M:%S.%fZ",
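For reference, a simplified standalone sketch of the conforming step added in the hunk above: nested `permissions_data` and `links` metadata are JSON-encoded so they fit the string-typed fields added to the index schemas earlier in this patch. The sample dict below is illustrative only; real element dicts carry many more fields.

```python
import json


def conform_metadata(data: dict) -> None:
    # Mirrors the added hunk: serialize nested metadata into strings so it
    # matches the Edm.String / Collection(Edm.String) fields in the index schema.
    if permissions_data := (
        data.get("metadata", {}).get("data_source", {}).get("permissions_data")
    ):
        data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data)
    if links := data.get("metadata", {}).get("links"):
        data["metadata"]["links"] = [json.dumps(link) for link in links]


# Illustrative input only.
doc = {
    "metadata": {
        "data_source": {"permissions_data": [{"role": "reader"}]},
        "links": [{"text": "home", "url": "https://example.com"}],
    }
}
conform_metadata(doc)
print(doc["metadata"]["data_source"]["permissions_data"])  # -> JSON string
print(doc["metadata"]["links"])                            # -> list of JSON strings
```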