Skip to content

Commit

Permalink
Merge branch 'main' into yuming/dont_pass_empty_str_to_tesseract
Browse files Browse the repository at this point in the history
  • Loading branch information
yuming-long authored Nov 3, 2023
2 parents d3b739e + d09c8c0 commit 1d524c8
Show file tree
Hide file tree
Showing 15 changed files with 90 additions and 31 deletions.
5 changes: 1 addition & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
## 0.10.29-dev10
## 0.10.29-dev11

### Enhancements

* **Add doctype field to CI metric functions** Adds a doctype column to the ingest metric sheets for use in subsequent aggregations.
* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
* **Separate chipper tests** Chipper tests are long-running and require special access, so the tests have been separated into their own file under their own marker, and now have a separate `make` target.
* **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows when partitioning CSV and TSV documents into elements.
* **Add retry logic for all source connectors** All http calls being made by the ingest source connectors have been isolated and wrapped by the `SourceConnectionNetworkError` custom error, which triggers the retry logic, if enabled, in the ingest pipeline.
* **Google Drive source connector supports credentials from memory** Originally, the connector expected a filepath to pull the credentials from when creating the client. This was expanded to support passing that information from memory as a dict if access to the file system might not be available.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@
"name": "date_processed",
"type": "Edm.DateTimeOffset"
},
{
"name": "permissions_data",
"type": "Edm.String"
},
{
"name": "record_locator",
"type": "Edm.String"
Expand Down Expand Up @@ -114,6 +118,10 @@
"name": "page_number",
"type": "Edm.String"
},
{
"name": "links",
"type": "Collection(Edm.String)"
},
{
"name": "url",
"type": "Edm.String"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ set -e
SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=s3-azure-cog-search-dest
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
OUTPUT_FOLDER_NAME=azure-cog-search-dest
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}

DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
DESTINATION_INDEX="utic-test-ingest-fixtures-output-$(date +%s)"
# The vector configs on the schema currently only exist on versions:
Expand Down Expand Up @@ -65,17 +67,14 @@ fi

RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
s3 \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--strategy fast \
--preserve-downloads \
--reprocess \
--output-dir "$OUTPUT_DIR" \
--verbose \
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
--anonymous \
--work-dir "$WORK_DIR" \
local \
--num-processes "$max_processes" \
--output-dir "$OUTPUT_DIR" \
--strategy fast \
--verbose \
--reprocess \
--input-path example-docs/fake-memo.pdf \
--work-dir "$WORK_DIR" \
azure-cognitive-search \
--key "$AZURE_SEARCH_API_KEY" \
--endpoint "$AZURE_SEARCH_ENDPOINT" \
Expand Down
1 change: 0 additions & 1 deletion test_unstructured_ingest/dest/azure.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
--strategy fast \
--verbose \
Expand Down
1 change: 0 additions & 1 deletion test_unstructured_ingest/dest/box.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
#PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
# local \
# --num-processes "$max_processes" \
# --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
# --output-dir "$OUTPUT_DIR" \
# --strategy fast \
# --verbose \
Expand Down
43 changes: 43 additions & 0 deletions test_unstructured_ingest/dest/delta-table.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env bash

# Integration test for the delta-table DESTINATION connector:
# partitions a single example PDF with the `local` source connector,
# writes the resulting elements into a local Delta Lake table, then
# verifies the table contents with a Python checker script.

# Abort on the first failing command so CI surfaces errors immediately.
set -e

# Resolve the test_unstructured_ingest/ directory regardless of CWD,
# then run from the repository root so relative paths below resolve.
SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=delta-table-dest
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# Local filesystem URI the Delta table is written to (and checked from).
DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
# Default worker count to the machine's CPU count unless MAX_PROCESSES is set.
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
CI=${CI:-"false"}

# Provides cleanup_dir(); SC1091 silenced because the sourced file is
# outside shellcheck's view.
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh

# Remove all artifacts on exit; downloads are only purged in CI so local
# reruns can reuse them.
function cleanup() {
cleanup_dir "$DESTINATION_TABLE"
cleanup_dir "$OUTPUT_DIR"
cleanup_dir "$WORK_DIR"
if [ "$CI" == "true" ]; then
cleanup_dir "$DOWNLOAD_DIR"
fi
}

trap cleanup EXIT

# Run the ingest pipeline: local source (example-docs/fake-memo.pdf)
# -> delta-table destination, serializing each element into the
# `json_data` column of the table at $DESTINATION_TABLE.
PYTHONPATH=. ./unstructured/ingest/main.py \
local \
--num-processes "$max_processes" \
--output-dir "$OUTPUT_DIR" \
--strategy fast \
--verbose \
--reprocess \
--input-path example-docs/fake-memo.pdf \
--work-dir "$WORK_DIR" \
delta-table \
--write-column json_data \
--table-uri "$DESTINATION_TABLE"

# Validate the written table (row count etc.) against expectations.
python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"
1 change: 0 additions & 1 deletion test_unstructured_ingest/dest/dropbox.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
--strategy fast \
--verbose \
Expand Down
1 change: 0 additions & 1 deletion test_unstructured_ingest/dest/gcs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
--strategy fast \
--verbose \
Expand Down
1 change: 0 additions & 1 deletion test_unstructured_ingest/dest/s3.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
--strategy fast \
--verbose \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@
"name": "date_processed",
"type": "Edm.DateTimeOffset"
},
{
"name": "permissions_data",
"type": "Edm.String"
},
{
"name": "record_locator",
"type": "Edm.String"
Expand Down Expand Up @@ -117,6 +121,10 @@
"name": "page_number",
"type": "Edm.String"
},
{
"name": "links",
"type": "Collection(Edm.String)"
},
{
"name": "page_name",
"type": "Edm.String"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,15 @@ def run_check(table_uri):
table_uri=table_uri,
)

assert len(delta_table.to_pandas()) == 10
expected_rows = 5
found_rows = len(delta_table.to_pandas())
print(
f"Checking if expected number of rows ({expected_rows}) "
f"matches how many were found: {found_rows}"
)
assert (
expected_rows == found_rows
), f"expected number of rows doesn't match how many were found: {expected_rows}/{found_rows}"
print("table check complete")


Expand Down
9 changes: 1 addition & 8 deletions test_unstructured_ingest/src/delta-table.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
CI=${CI:-"false"}

Expand All @@ -23,7 +22,6 @@ fi
source "$SCRIPT_DIR"/cleanup.sh

function cleanup() {
cleanup_dir "$DESTINATION_TABLE"
cleanup_dir "$OUTPUT_DIR"
cleanup_dir "$WORK_DIR"
if [ "$CI" == "true" ]; then
Expand All @@ -44,13 +42,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--storage_options "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" \
--preserve-downloads \
--verbose \
--work-dir "$WORK_DIR" \
delta-table \
--write-column json_data \
--table-uri "$DESTINATION_TABLE"
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
2 changes: 2 additions & 0 deletions test_unstructured_ingest/test-ingest-dest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ export OMP_THREAD_LIMIT=1

all_tests=(
'azure.sh'
'azure-cognitive-search.sh'
'box.sh'
'delta-table.sh'
'dropbox.sh'
'gcs.sh'
's3.sh'
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.29-dev10" # pragma: no cover
__version__ = "0.10.29-dev11" # pragma: no cover
6 changes: 6 additions & 0 deletions unstructured/ingest/connector/azure_cognitive_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ def conform_dict(self, data: dict) -> None:
data["metadata"]["data_source"]["version"] = str(version)
if record_locator := data.get("metadata", {}).get("data_source", {}).get("record_locator"):
data["metadata"]["data_source"]["record_locator"] = json.dumps(record_locator)
if permissions_data := (
data.get("metadata", {}).get("data_source", {}).get("permissions_data")
):
data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data)
if links := data.get("metadata", {}).get("links"):
data["metadata"]["links"] = [json.dumps(link) for link in links]
if last_modified := data.get("metadata", {}).get("last_modified"):
data["metadata"]["last_modified"] = parser.parse(last_modified).strftime(
"%Y-%m-%dT%H:%M:%S.%fZ",
Expand Down

0 comments on commit 1d524c8

Please sign in to comment.