From c48a670711674946c946cf60095ddc26a260819d Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Tue, 17 Oct 2023 14:25:40 -0400
Subject: [PATCH 01/38] refactor writers into their own directory

---
 unstructured/ingest/connector/fsspec.py | 28 ++++---
 unstructured/ingest/runner/writers.py | 83 -------------------
 .../ingest/runner/writers/__init__.py | 13 +++
 .../runner/writers/azure_cognitive_search.py | 21 +++++
 .../ingest/runner/writers/delta_table.py | 22 +++++
 unstructured/ingest/runner/writers/s3.py | 27 ++++++
 6 files changed, 101 insertions(+), 93 deletions(-)
 delete mode 100644 unstructured/ingest/runner/writers.py
 create mode 100644 unstructured/ingest/runner/writers/__init__.py
 create mode 100644 unstructured/ingest/runner/writers/azure_cognitive_search.py
 create mode 100644 unstructured/ingest/runner/writers/delta_table.py
 create mode 100644 unstructured/ingest/runner/writers/s3.py

diff --git a/unstructured/ingest/connector/fsspec.py b/unstructured/ingest/connector/fsspec.py
index 1aaf2aeac5..2ffbebca55 100644
--- a/unstructured/ingest/connector/fsspec.py
+++ b/unstructured/ingest/connector/fsspec.py
@@ -2,7 +2,7 @@
 import os
 import typing as t
 from contextlib import suppress
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path, PurePath
 
 from unstructured.ingest.compression_support import (
@@ -20,6 +20,7 @@
     IngestDocCleanupMixin,
     SourceConnectorCleanupMixin,
     SourceMetadata,
+    WriteConfig,
 )
 from unstructured.ingest.logger import logger
 from unstructured.utils import (
@@ -221,9 +222,15 @@ def get_ingest_docs(self):
         return docs
 
 
+@dataclass
+class FsspecWriteConfig(WriteConfig):
+    write_text_kwargs: t.Dict[str, t.Any] = field(default_factory=dict)
+
+
 @dataclass
 class FsspecDestinationConnector(BaseDestinationConnector):
     connector_config: SimpleFsspecConfig
+    write_config: FsspecWriteConfig
 
     def initialize(self):
         from fsspec import AbstractFileSystem, get_filesystem_class
@@ -249,15 +256,16 @@ def write_dict(
 
         logger.info(f"Writing content using filesystem: {type(fs).__name__}")
 
-        output_folder = self.connector_config.path_without_protocol
-        output_folder = os.path.join(output_folder)  # Make sure folder ends with file seperator
-        filename = (
-            filename.strip(os.sep) if filename else filename
-        )  # Make sure filename doesn't begin with file seperator
-        output_path = str(PurePath(output_folder, filename)) if filename else output_folder
-        full_output_path = f"s3://{output_path}"
-        logger.debug(f"uploading content to {full_output_path}")
-        fs.write_text(full_output_path, json.dumps(json_list, indent=indent), encoding=encoding)
+        dest_folder = self.connector_config.path_without_protocol
+        dest_output_path = str(PurePath(dest_folder, filename)) if filename else dest_folder
+        full_dest_path = f"{self.connector_config.protocol}://{dest_output_path}"
+        logger.debug(f"uploading content to {full_dest_path}")
+        fs.write_text(
+            full_dest_path,
+            json.dumps(json_list, indent=indent),
+            encoding=encoding,
+            **self.write_config.write_text_kwargs,
+        )
 
     def write(self, docs: t.List[BaseIngestDoc]) -> None:
         for doc in docs:
diff --git a/unstructured/ingest/runner/writers.py b/unstructured/ingest/runner/writers.py
deleted file mode 100644
index 791bbfeefc..0000000000
--- a/unstructured/ingest/runner/writers.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import typing as t
-from pathlib import Path
-
-from unstructured.ingest.interfaces import WriteConfig
-from unstructured.utils import requires_dependencies
-
-
-@requires_dependencies(["s3fs", "fsspec"], extras="s3")
-def s3_writer(
-    remote_url: str,
-    anonymous: bool,
-    endpoint_url: t.Optional[str] = None,
-    verbose: bool = False,
-    **kwargs,
-):
-    from unstructured.ingest.connector.s3 import (
-        S3DestinationConnector,
-        SimpleS3Config,
-    )
-
-    access_kwargs: t.Dict[str, t.Any] = {"anon": anonymous}
-    if endpoint_url:
-        access_kwargs["endpoint_url"] = endpoint_url
-
-    return S3DestinationConnector(
-        write_config=WriteConfig(),
-        connector_config=SimpleS3Config(
-            remote_url=remote_url,
-            access_kwargs=access_kwargs,
-        ),
-    )
-
-
-@requires_dependencies(["azure"], extras="azure-cognitive-search")
-def azure_cognitive_search_writer(
-    endpoint: str,
-    key: str,
-    index: str,
-    **kwargs,
-):
-    from unstructured.ingest.connector.azure_cognitive_search import (
-        AzureCognitiveSearchDestinationConnector,
-        AzureCognitiveSearchWriteConfig,
-        SimpleAzureCognitiveSearchStorageConfig,
-    )
-
-    return AzureCognitiveSearchDestinationConnector(
-        write_config=AzureCognitiveSearchWriteConfig(
-            index=index,
-        ),
-        connector_config=SimpleAzureCognitiveSearchStorageConfig(
-            endpoint=endpoint,
-            key=key,
-        ),
-    )
-
-
-@requires_dependencies(["deltalake"], extras="delta-table")
-def delta_table_writer(
-    table_uri: t.Union[str, Path],
-    write_column: str,
-    mode: t.Literal["error", "append", "overwrite", "ignore"] = "error",
-    **kwargs,
-):
-    from unstructured.ingest.connector.delta_table import (
-        DeltaTableDestinationConnector,
-        DeltaTableWriteConfig,
-        SimpleDeltaTableConfig,
-    )
-
-    return DeltaTableDestinationConnector(
-        write_config=DeltaTableWriteConfig(write_column=write_column, mode=mode),
-        connector_config=SimpleDeltaTableConfig(
-            table_uri=table_uri,
-        ),
-    )
-
-
-writer_map: t.Dict[str, t.Callable] = {
-    "s3": s3_writer,
-    "delta_table": delta_table_writer,
-    "azure_cognitive_search": azure_cognitive_search_writer,
-}
diff --git a/unstructured/ingest/runner/writers/__init__.py b/unstructured/ingest/runner/writers/__init__.py
new file mode 100644
index 0000000000..2abe99e1e5
--- /dev/null
+++ b/unstructured/ingest/runner/writers/__init__.py
@@ -0,0 +1,13 @@
+from typing import t
+
+from .azure_cognitive_search import azure_cognitive_search_writer
+from .delta_table import delta_table_writer
+from .s3 import s3_writer
+
+writer_map: t.Dict[str, t.Callable] = {
+    "s3": s3_writer,
+    "delta_table": delta_table_writer,
+    "azure_cognitive_search": azure_cognitive_search_writer,
+}
+
+__all__ = ["writer_map"]
diff --git a/unstructured/ingest/runner/writers/azure_cognitive_search.py b/unstructured/ingest/runner/writers/azure_cognitive_search.py
new file mode 100644
index 0000000000..00bc7e94c7
--- /dev/null
+++ b/unstructured/ingest/runner/writers/azure_cognitive_search.py
@@ -0,0 +1,21 @@
+def azure_cognitive_search_writer(
+    endpoint: str,
+    key: str,
+    index: str,
+    **kwargs,
+):
+    from unstructured.ingest.connector.azure_cognitive_search import (
+        AzureCognitiveSearchDestinationConnector,
+        AzureCognitiveSearchWriteConfig,
+        SimpleAzureCognitiveSearchStorageConfig,
+    )
+
+    return AzureCognitiveSearchDestinationConnector(
+        write_config=AzureCognitiveSearchWriteConfig(
+            index=index,
+        ),
+        connector_config=SimpleAzureCognitiveSearchStorageConfig(
+            endpoint=endpoint,
+            key=key,
+        ),
+    )
diff --git a/unstructured/ingest/runner/writers/delta_table.py b/unstructured/ingest/runner/writers/delta_table.py
new file mode 100644
index 0000000000..82513e7879
--- /dev/null
+++ b/unstructured/ingest/runner/writers/delta_table.py
@@ -0,0 +1,22 @@
+import typing as t
+from pathlib import Path
+
+
+def delta_table_writer(
+    table_uri: t.Union[str, Path],
+    write_column: str,
+    mode: t.Literal["error", "append", "overwrite", "ignore"] = "error",
+    **kwargs,
+):
+    from unstructured.ingest.connector.delta_table import (
+        DeltaTableDestinationConnector,
+        DeltaTableWriteConfig,
+        SimpleDeltaTableConfig,
+    )
+
+    return DeltaTableDestinationConnector(
+        write_config=DeltaTableWriteConfig(write_column=write_column, mode=mode),
+        connector_config=SimpleDeltaTableConfig(
+            table_uri=table_uri,
+        ),
+    )
diff --git a/unstructured/ingest/runner/writers/s3.py b/unstructured/ingest/runner/writers/s3.py
new file mode 100644
index 0000000000..3c358687fe
--- /dev/null
+++ b/unstructured/ingest/runner/writers/s3.py
@@ -0,0 +1,27 @@
+import typing as t
+
+
+def s3_writer(
+    remote_url: str,
+    anonymous: bool,
+    endpoint_url: t.Optional[str] = None,
+    verbose: bool = False,
+    **kwargs,
+):
+    from unstructured.ingest.connector.fsspec import FsspecWriteConfig
+    from unstructured.ingest.connector.s3 import (
+        S3DestinationConnector,
+        SimpleS3Config,
+    )
+
+    access_kwargs: t.Dict[str, t.Any] = {"anon": anonymous}
+    if endpoint_url:
+        access_kwargs["endpoint_url"] = endpoint_url
+
+    return S3DestinationConnector(
+        write_config=FsspecWriteConfig(),
+        connector_config=SimpleS3Config(
+            remote_url=remote_url,
+            access_kwargs=access_kwargs,
+        ),
+    )

From 054081ce9e8ad4940130eab928a992d3b6f69c4b Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Tue, 17 Oct 2023 14:33:51 -0400
Subject: [PATCH 02/38] Add all other fsspec writers

---
 .../ingest/runner/writers/__init__.py | 14 ++++++--
 unstructured/ingest/runner/writers/azure.py | 36 +++++++++++++++++++
 .../runner/writers/azure_cognitive_search.py | 5 ++-
 unstructured/ingest/runner/writers/box.py | 28 +++++++++++++++
 .../ingest/runner/writers/delta_table.py | 4 ++-
 unstructured/ingest/runner/writers/dropbox.py | 24 +++++++++++++
 unstructured/ingest/runner/writers/gcs.py | 23 ++++++++++++
 unstructured/ingest/runner/writers/s3.py | 4 ++-
 8 files changed, 132 insertions(+), 6 deletions(-)
 create mode 100644 unstructured/ingest/runner/writers/azure.py
 create mode 100644 unstructured/ingest/runner/writers/box.py
 create mode 100644 unstructured/ingest/runner/writers/dropbox.py
 create mode 100644 unstructured/ingest/runner/writers/gcs.py

diff --git a/unstructured/ingest/runner/writers/__init__.py b/unstructured/ingest/runner/writers/__init__.py
index 2abe99e1e5..701d77dbe2 100644
--- a/unstructured/ingest/runner/writers/__init__.py
+++ b/unstructured/ingest/runner/writers/__init__.py
@@ -1,13 +1,21 @@
-from typing import t
+import typing as t
 
+from .azure import azure_writer
 from .azure_cognitive_search import azure_cognitive_search_writer
+from .box import box_writer
 from .delta_table import delta_table_writer
+from .dropbox import dropbox_writer
+from .gcs import gcs_writer
 from .s3 import s3_writer
 
 writer_map: t.Dict[str, t.Callable] = {
-    "s3": s3_writer,
-    "delta_table": delta_table_writer,
+    "azure": azure_writer,
     "azure_cognitive_search": azure_cognitive_search_writer,
+    "box": box_writer,
+    "delta_table": delta_table_writer,
+    "dropbox": dropbox_writer,
+    "gcs": gcs_writer,
+    "s3": s3_writer,
 }
 
 __all__ = ["writer_map"]
diff --git a/unstructured/ingest/runner/writers/azure.py b/unstructured/ingest/runner/writers/azure.py
new file mode 100644
index 0000000000..60bf39dbed
--- /dev/null
+++ b/unstructured/ingest/runner/writers/azure.py
@@ -0,0 +1,36 @@
+import typing as t
+
+from unstructured.ingest.interfaces import BaseDestinationConnector
+
+
+def azure_writer(
+    remote_url: str,
+    account_name: t.Optional[str],
+    account_key: t.Optional[str],
+    connection_string: t.Optional[str],
+    overwrite: bool = False,
+    verbose: bool = False,
+) -> BaseDestinationConnector:
+    from unstructured.ingest.connector.azure import (
+        AzureBlobStorageDestinationConnector,
+        SimpleAzureBlobStorageConfig,
+    )
+    from unstructured.ingest.connector.fsspec import FsspecWriteConfig
+
+    if account_name:
+        access_kwargs = {
+            "account_name": account_name,
+            "account_key": account_key,
+        }
+    elif connection_string:
+        access_kwargs = {"connection_string": connection_string}
+    else:
+        access_kwargs = {}
+
+    return AzureBlobStorageDestinationConnector(
+        write_config=FsspecWriteConfig(write_text_kwargs={"overwrite": overwrite}),
+        connector_config=SimpleAzureBlobStorageConfig(
+            remote_url=remote_url,
+            access_kwargs=access_kwargs,
+        ),
+    )
diff --git a/unstructured/ingest/runner/writers/azure_cognitive_search.py b/unstructured/ingest/runner/writers/azure_cognitive_search.py
index 00bc7e94c7..9d69a16d04 100644
--- a/unstructured/ingest/runner/writers/azure_cognitive_search.py
+++ b/unstructured/ingest/runner/writers/azure_cognitive_search.py
@@ -1,9 +1,12 @@
+from unstructured.ingest.interfaces import BaseDestinationConnector
+
+
 def azure_cognitive_search_writer(
     endpoint: str,
     key: str,
     index: str,
     **kwargs,
-):
+) -> BaseDestinationConnector:
     from unstructured.ingest.connector.azure_cognitive_search import (
         AzureCognitiveSearchDestinationConnector,
         AzureCognitiveSearchWriteConfig,
diff --git a/unstructured/ingest/runner/writers/box.py b/unstructured/ingest/runner/writers/box.py
new file mode 100644
index 0000000000..7a8231c8c9
--- /dev/null
+++ b/unstructured/ingest/runner/writers/box.py
@@ -0,0 +1,28 @@
+import typing as t
+
+from unstructured.ingest.interfaces import BaseDestinationConnector
+
+
+def box_writer(
+    remote_url: str,
+    box_app_config: t.Optional[str],
+    verbose: bool = False,
+) -> BaseDestinationConnector:
+    import boxsdk
+
+    from unstructured.ingest.connector.box import (
+        BoxDestinationConnector,
+        SimpleBoxConfig,
+    )
+    from unstructured.ingest.connector.fsspec import FsspecWriteConfig
+
+    access_kwargs: t.Dict[str, t.Any] = {"box_app_config": box_app_config}
+    if verbose:
+        access_kwargs["client_type"] = boxsdk.LoggingClient
+    return BoxDestinationConnector(
+        write_config=FsspecWriteConfig(),
+        connector_config=SimpleBoxConfig(
+            remote_url=remote_url,
+            access_kwargs=access_kwargs,
+        ),
+    )
diff --git a/unstructured/ingest/runner/writers/delta_table.py b/unstructured/ingest/runner/writers/delta_table.py
index 82513e7879..f0771aa1e7 100644
--- a/unstructured/ingest/runner/writers/delta_table.py
+++ b/unstructured/ingest/runner/writers/delta_table.py
@@ -1,13 +1,15 @@
 import typing as t
 from pathlib import Path
 
+from unstructured.ingest.interfaces import BaseDestinationConnector
+
 
 def delta_table_writer(
     table_uri: t.Union[str, Path],
     write_column: str,
     mode: t.Literal["error", "append", "overwrite", "ignore"] = "error",
     **kwargs,
-):
+) -> BaseDestinationConnector:
     from unstructured.ingest.connector.delta_table import (
         DeltaTableDestinationConnector,
         DeltaTableWriteConfig,
diff --git a/unstructured/ingest/runner/writers/dropbox.py b/unstructured/ingest/runner/writers/dropbox.py
new file mode 100644
index 0000000000..2828d33c72
--- /dev/null
+++ b/unstructured/ingest/runner/writers/dropbox.py
@@ -0,0 +1,24 @@
+import typing as t
+
+from unstructured.ingest.interfaces import BaseDestinationConnector
+
+
+def dropbox_writer(
+    remote_url: str,
+    token: t.Optional[str],
+    verbose: bool = False,
+    **kwargs,
+) -> BaseDestinationConnector:
+    from unstructured.ingest.connector.dropbox import (
+        DropboxDestinationConnector,
+        SimpleDropboxConfig,
+    )
+    from unstructured.ingest.connector.fsspec import FsspecWriteConfig
+
+    return DropboxDestinationConnector(
+        write_config=FsspecWriteConfig(),
+        connector_config=SimpleDropboxConfig(
+            remote_url=remote_url,
+            access_kwargs={"token": token},
+        ),
+    )
diff --git a/unstructured/ingest/runner/writers/gcs.py b/unstructured/ingest/runner/writers/gcs.py
new file mode 100644
index 0000000000..c221fe959f
--- /dev/null
+++ b/unstructured/ingest/runner/writers/gcs.py
@@ -0,0 +1,23 @@
+import typing as t
+
+from unstructured.ingest.interfaces import BaseDestinationConnector
+
+
+def gcs_writer(
+    remote_url: str,
+    token: t.Optional[str],
+    verbose: bool = False,
+) -> BaseDestinationConnector:
+    from unstructured.ingest.connector.fsspec import FsspecWriteConfig
+    from unstructured.ingest.connector.gcs import (
+        GcsDestinationConnector,
+        SimpleGcsConfig,
+    )
+
+    return GcsDestinationConnector(
+        write_config=FsspecWriteConfig(),
+        connector_config=SimpleGcsConfig(
+            remote_url=remote_url,
+            access_kwargs={"token": token},
+        ),
+    )
diff --git a/unstructured/ingest/runner/writers/s3.py b/unstructured/ingest/runner/writers/s3.py
index 3c358687fe..27bc1dd863 100644
--- a/unstructured/ingest/runner/writers/s3.py
+++ b/unstructured/ingest/runner/writers/s3.py
@@ -1,5 +1,7 @@
 import typing as t
 
+from unstructured.ingest.interfaces import BaseDestinationConnector
+
 
 def s3_writer(
     remote_url: str,
@@ -7,7 +9,7 @@ def s3_writer(
     endpoint_url: t.Optional[str] = None,
     verbose: bool = False,
     **kwargs,
-):
+) -> BaseDestinationConnector:
     from unstructured.ingest.connector.fsspec import FsspecWriteConfig
     from unstructured.ingest.connector.s3 import (
         S3DestinationConnector,

From eb199bd8263ce005c59c4c5ab662253addb97de8 Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Tue, 17 Oct 2023 15:54:35 -0400
Subject: [PATCH 03/38] Finish azure dest with e2e test

---
 .../test-ingest-azure-dest.sh | 56 +++++++++++++++++++
 unstructured/ingest/cli/cmds/__init__.py | 4 ++
 unstructured/ingest/cli/cmds/azure.py | 11 +++-
 unstructured/ingest/cli/cmds/fsspec.py | 11 +++-
 unstructured/ingest/connector/azure.py | 1 +
 unstructured/ingest/connector/local.py | 7 +++
 unstructured/ingest/runner/writers/azure.py | 6 +-
 7 files changed, 91 insertions(+), 5 deletions(-)
 create mode 100755 test_unstructured_ingest/test-ingest-azure-dest.sh

diff --git a/test_unstructured_ingest/test-ingest-azure-dest.sh b/test_unstructured_ingest/test-ingest-azure-dest.sh
new file mode 100755
index 0000000000..1c24bd5dfd
--- /dev/null
+++ b/test_unstructured_ingest/test-ingest-azure-dest.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+
+set -e
+
+SCRIPT_DIR=$(dirname "$(realpath "$0")")
+cd "$SCRIPT_DIR"/.. || exit 1
+OUTPUT_FOLDER_NAME=azure-dest
+OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
+WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+
+if [ -z "$AZURE_DEST_CONNECTION_STR" ]; then
+  echo "Skipping Azure destination ingest test because the AZURE_DEST_CONNECTION_STR env var is not set."
+  exit 0
+fi
+
+CONTAINER=utic-ingest-test-fixtures-output
+DIRECTORY=$(date +%s)
+REMOTE_URL="abfs://$CONTAINER/$DIRECTORY/"
+
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR"/cleanup.sh
+function cleanup() {
+  cleanup_dir "$OUTPUT_DIR"
+  cleanup_dir "$WORK_DIR"
+
+  echo "deleting azure storage blob directory $CONTAINER/$DIRECTORY"
+  az storage fs directory delete -f "$CONTAINER" -n "$DIRECTORY" --connection-string "$AZURE_DEST_CONNECTION_STR" --yes
+
+}
+trap cleanup EXIT
+
+# Create directory to use for testing
+az storage fs directory create -f "$CONTAINER" --n "$DIRECTORY" --connection-string "$AZURE_DEST_CONNECTION_STR"
+
+PYTHONPATH=. ./unstructured/ingest/main.py \
+  local \
+  --num-processes "$max_processes" \
+  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
+  --output-dir "$OUTPUT_DIR" \
+  --strategy fast \
+  --verbose \
+  --input-path example-docs/fake-memo.pdf \
+  --work-dir "$WORK_DIR" \
+  azure \
+  --overwrite \
+  --remote-url "$REMOTE_URL" \
+  --connection-string "$AZURE_DEST_CONNECTION_STR"
+
+# Simply check the number of files uploaded
+expected_num_files=1
+num_files_in_azure=$(az storage blob list -c "$CONTAINER" --prefix "$DIRECTORY"/example-docs/ --connection-string "$AZURE_DEST_CONNECTION_STR" | jq 'length')
+if [ "$num_files_in_azure" -ne "$expected_num_files" ]; then
+  echo "Expected $expected_num_files files to be uploaded to azure, but found $num_files_in_azure files."
+  exit 1
+fi
diff --git a/unstructured/ingest/cli/cmds/__init__.py b/unstructured/ingest/cli/cmds/__init__.py
index ee494e4858..bfc4019eab 100644
--- a/unstructured/ingest/cli/cmds/__init__.py
+++ b/unstructured/ingest/cli/cmds/__init__.py
@@ -6,6 +6,7 @@
 from unstructured.ingest.cli.base.src import BaseSrcCmd
 
 from .airtable import get_base_src_cmd as airtable_base_src_cmd
+from .azure import get_base_dest_cmd as azure_base_dest_cmd
 from .azure import get_base_src_cmd as azure_base_src_cmd
 from .azure_cognitive_search import get_base_dest_cmd as azure_cognitive_search_base_dest_cmd
 from .biomed import get_base_src_cmd as biomed_base_src_cmd
@@ -16,6 +17,7 @@
 from .discord import get_base_src_cmd as discord_base_src_cmd
 from .dropbox import get_base_src_cmd as dropbox_base_src_cmd
 from .elasticsearch import get_base_src_cmd as elasticsearch_base_src_cmd
+from .fsspec import get_base_dest_cmd as fsspec_base_dest_cmd
 from .fsspec import get_base_src_cmd as fsspec_base_src_cmd
 from .gcs import get_base_src_cmd as gcs_base_src_cmd
 from .github import get_base_src_cmd as github_base_src_cmd
@@ -76,6 +78,8 @@
 )
 
 base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
+    azure_base_dest_cmd,
+    fsspec_base_dest_cmd,
     s3_base_dest_cmd,
     azure_cognitive_search_base_dest_cmd,
     delta_table_dest_cmd,
diff --git a/unstructured/ingest/cli/cmds/azure.py b/unstructured/ingest/cli/cmds/azure.py
index 12537fe52a..133f840d57 100644
--- a/unstructured/ingest/cli/cmds/azure.py
+++ b/unstructured/ingest/cli/cmds/azure.py
@@ -9,6 +9,8 @@
 )
 from unstructured.ingest.interfaces import BaseConfig
 
+CMD_NAME = "azure"
+
 
 @dataclass
 class AzureCliConfig(BaseConfig, CliMixin):
@@ -40,5 +42,12 @@ def get_cli_options() -> t.List[click.Option]:
 
 
 def get_base_src_cmd() -> BaseSrcCmd:
-    cmd_cls = BaseSrcCmd(cmd_name="azure", cli_config=AzureCliConfig, is_fsspec=True)
+    cmd_cls = BaseSrcCmd(cmd_name=CMD_NAME, cli_config=AzureCliConfig, is_fsspec=True)
+    return cmd_cls
+
+
+def get_base_dest_cmd():
+    from unstructured.ingest.cli.base.dest import BaseDestCmd
+
+    cmd_cls = BaseDestCmd(cmd_name=CMD_NAME, cli_config=AzureCliConfig, is_fsspec=True)
     return cmd_cls
diff --git a/unstructured/ingest/cli/cmds/fsspec.py b/unstructured/ingest/cli/cmds/fsspec.py
index d081c45b01..e2d50a278d 100644
--- a/unstructured/ingest/cli/cmds/fsspec.py
+++ b/unstructured/ingest/cli/cmds/fsspec.py
@@ -1,6 +1,15 @@
 from unstructured.ingest.cli.base.src import BaseSrcCmd
 
+CMD_NAME = "fsspec"
+
 
 def get_base_src_cmd() -> BaseSrcCmd:
-    cmd_cls = BaseSrcCmd(cmd_name="fsspec", is_fsspec=True)
+    cmd_cls = BaseSrcCmd(cmd_name=CMD_NAME, is_fsspec=True)
+    return cmd_cls
+
+
+def get_base_dest_cmd():
+    from unstructured.ingest.cli.base.dest import BaseDestCmd
+
+    cmd_cls = BaseDestCmd(cmd_name=CMD_NAME, is_fsspec=True)
     return cmd_cls
diff --git a/unstructured/ingest/connector/azure.py b/unstructured/ingest/connector/azure.py
index 004ca782b4..4f80af2ce1 100644
--- a/unstructured/ingest/connector/azure.py
+++ b/unstructured/ingest/connector/azure.py
@@ -35,6 +35,7 @@ def __post_init__(self):
         self.ingest_doc_cls: t.Type[AzureBlobStorageIngestDoc] = AzureBlobStorageIngestDoc
 
 
+@requires_dependencies(["adlfs", "fsspec"], extras="azure")
 @dataclass
 class AzureBlobStorageDestinationConnector(FsspecDestinationConnector):
     connector_config: SimpleAzureBlobStorageConfig
diff --git a/unstructured/ingest/connector/local.py b/unstructured/ingest/connector/local.py
index b23a1021de..f58d379ae9 100644
--- a/unstructured/ingest/connector/local.py
+++ b/unstructured/ingest/connector/local.py
@@ -39,6 +39,13 @@ class LocalIngestDoc(BaseIngestDoc):
     path: str
     registry_name: str = "local"
 
+    @property
+    def base_filename(self) -> t.Optional[str]:
+        download_path = str(Path(self.connector_config.input_path).resolve())
+        full_path = str(self.filename)
+        base_path = full_path.replace(download_path, "")
+        return base_path
+
     @property
     def filename(self):
         """The filename of the local file to be processed"""
diff --git a/unstructured/ingest/runner/writers/azure.py b/unstructured/ingest/runner/writers/azure.py
index 60bf39dbed..a0ba68ce7f 100644
--- a/unstructured/ingest/runner/writers/azure.py
+++ b/unstructured/ingest/runner/writers/azure.py
@@ -5,9 +5,9 @@
 
 def azure_writer(
     remote_url: str,
-    account_name: t.Optional[str],
-    account_key: t.Optional[str],
-    connection_string: t.Optional[str],
+    account_name: t.Optional[str] = None,
+    account_key: t.Optional[str] = None,
+    connection_string: t.Optional[str] = None,
     overwrite: bool = False,
     verbose: bool = False,
 ) -> BaseDestinationConnector:

From 9b89d10e11425813dbdce2f66b020e50f3a949fd Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Tue, 17 Oct 2023 18:55:19 -0400
Subject: [PATCH 04/38] Add s3 e2e test

---
 .../test-ingest-azure-dest.sh | 1 +
 .../test-ingest-s3-dest.sh | 47 +++++++++++++++++++
 unstructured/ingest/cli/interfaces.py | 41 +++++++---------
 unstructured/ingest/connector/s3.py | 1 +
 4 files changed, 65 insertions(+), 25 deletions(-)
 create mode 100755 test_unstructured_ingest/test-ingest-s3-dest.sh

diff --git a/test_unstructured_ingest/test-ingest-azure-dest.sh b/test_unstructured_ingest/test-ingest-azure-dest.sh
index 1c24bd5dfd..f01eda5bcc 100755
--- a/test_unstructured_ingest/test-ingest-azure-dest.sh
+++ b/test_unstructured_ingest/test-ingest-azure-dest.sh
@@ -40,6 +40,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
   --output-dir "$OUTPUT_DIR" \
   --strategy fast \
   --verbose \
+  --reprocess \
   --input-path example-docs/fake-memo.pdf \
   --work-dir "$WORK_DIR" \
   azure \
diff --git a/test_unstructured_ingest/test-ingest-s3-dest.sh b/test_unstructured_ingest/test-ingest-s3-dest.sh
new file mode 100755
index 0000000000..d2031f1291
--- /dev/null
+++ b/test_unstructured_ingest/test-ingest-s3-dest.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+
+set -e
+
+SCRIPT_DIR=$(dirname "$(realpath "$0")")
+cd "$SCRIPT_DIR"/.. || exit 1
+OUTPUT_FOLDER_NAME=s3-dest
+OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
+WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+DESTINATION_S3="s3://utic-dev-tech-fixtures/utic-ingest-test-fixtures-output/$(date +%s)/"
+
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR"/cleanup.sh
+function cleanup() {
+  cleanup_dir "$OUTPUT_DIR"
+  cleanup_dir "$WORK_DIR"
+
+  if aws s3 ls "$DESTINATION_S3" --region us-east-2; then
+    echo "deleting destination s3 location: $DESTINATION_S3"
+    aws s3 rm "$DESTINATION_S3" --recursive --region us-east-2
+  fi
+
+}
+trap cleanup EXIT
+
+PYTHONPATH=. ./unstructured/ingest/main.py \
+  local \
+  --num-processes "$max_processes" \
+  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
+  --output-dir "$OUTPUT_DIR" \
+  --strategy fast \
+  --verbose \
+  --reprocess \
+  --input-path example-docs/fake-memo.pdf \
+  --work-dir "$WORK_DIR" \
+  s3 \
+  --anonymous \
+  --remote-url "$DESTINATION_S3"
+
+# Simply check the number of files uploaded
+expected_num_files=1
+num_files_in_s3=$(aws s3 ls "$DESTINATION_S3/example-docs/" --region us-east-2 | wc -l)
+if [ "$num_files_in_s3" -ne "$expected_num_files" ]; then
+  echo "Expected $expected_num_files files to be uploaded to s3, but found $num_files_in_s3 files."
+  exit 1
+fi
diff --git a/unstructured/ingest/cli/interfaces.py b/unstructured/ingest/cli/interfaces.py
index 0fd688e3e0..20156e46ef 100644
--- a/unstructured/ingest/cli/interfaces.py
+++ b/unstructured/ingest/cli/interfaces.py
@@ -473,32 +473,23 @@ def from_dict(
         doesn't require that as part of the field names in this class. It also checks
         if the CLI params are provided as intended.
         """
-
-        if (
-            isinstance(kvs, dict)
-            and any(
-                [
-                    kvs["permissions_application_id"]
-                    or kvs["permissions_client_cred"]
-                    or kvs["permissions_tenant"],
-                ],
-            )
-            and not all(
-                [
-                    kvs["permissions_application_id"]
-                    and kvs["permissions_client_cred"]
-                    and kvs["permissions_tenant"],
-                ],
-            )
-        ):
-            raise ValueError(
-                "Please provide either none or all of the following optional values:\n"
-                "--permissions-application-id\n"
-                "--permissions-client-cred\n"
-                "--permissions-tenant",
-            )
-
         if isinstance(kvs, dict):
+            permissions_application_id = kvs.get("permissions_application_id")
+            permissions_client_cred = kvs.get("permissions_client_cred")
+            permissions_tenant = kvs.get("permissions_tenant")
+            permission_values = [
+                permissions_application_id,
+                permissions_client_cred,
+                permissions_tenant,
+            ]
+            if any(permission_values) and not all(permission_values):
+                raise ValueError(
+                    "Please provide either none or all of the following optional values:\n"
+                    "--permissions-application-id\n"
+                    "--permissions-client-cred\n"
+                    "--permissions-tenant",
+                )
+
             new_kvs = {
                 k[len("permissions_") :]: v  # noqa: E203
                 for k, v in kvs.items()
diff --git a/unstructured/ingest/connector/s3.py b/unstructured/ingest/connector/s3.py
index b3699025f0..8b4ec7a350 100644
--- a/unstructured/ingest/connector/s3.py
+++ b/unstructured/ingest/connector/s3.py
@@ -34,6 +34,7 @@ def __post_init__(self):
         self.ingest_doc_cls: Type[S3IngestDoc] = S3IngestDoc
 
 
+@requires_dependencies(["s3fs", "fsspec"], extras="s3")
 @dataclass
 class S3DestinationConnector(FsspecDestinationConnector):
     connector_config: SimpleS3Config

From 50bc26334d209b2045cf771826c420eb02100945 Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Tue, 17 Oct 2023 19:51:50 -0400
Subject: [PATCH 05/38] Add box dest connector

---
 .../test-ingest-box-dest.sh | 53 +++++++++++++++++++
 unstructured/ingest/cli/cmds/__init__.py | 2 +
 unstructured/ingest/cli/cmds/box.py | 11 +++-
 unstructured/ingest/connector/box.py | 1 +
 unstructured/ingest/runner/writers/box.py | 2 +
 5 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100755 test_unstructured_ingest/test-ingest-box-dest.sh

diff --git a/test_unstructured_ingest/test-ingest-box-dest.sh b/test_unstructured_ingest/test-ingest-box-dest.sh
new file mode 100755
index 0000000000..cd8791e6e3
--- /dev/null
+++ b/test_unstructured_ingest/test-ingest-box-dest.sh
@@ -0,0 +1,53 @@
+#TODO currently box api/sdk does not work to create folders and check for content similar to other fsspec ingest tests
+##!/usr/bin/env bash
+#
+#set -e
+#
+#SCRIPT_DIR=$(dirname "$(realpath "$0")")
+#cd "$SCRIPT_DIR"/.. || exit 1
+#OUTPUT_FOLDER_NAME=box-dest
+#OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
+#WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
+#max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+#DESTINATION_BOX="box://utic-dev-tech-fixtures/utic-ingest-test-fixtures-output/$(date +%s)/"
+#
+#CI=${CI:-"false"}
+#
+#if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then
+#  echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set."
+#  exit 0
+#fi
+#
+#if [ -z "$BOX_APP_CONFIG_PATH" ]; then
+#  # Create temporary service key file
+#  BOX_APP_CONFIG_PATH=$(mktemp)
+#  echo "$BOX_APP_CONFIG" >"$BOX_APP_CONFIG_PATH"
+#fi
+#
+## shellcheck disable=SC1091
+#source "$SCRIPT_DIR"/cleanup.sh
+#function cleanup() {
+#  cleanup_dir "$OUTPUT_DIR"
+#  cleanup_dir "$WORK_DIR"
+#  if [ "$CI" == "true" ]; then
+#    cleanup_dir "$DOWNLOAD_DIR"
+#  fi
+#}
+#trap cleanup EXIT
+#
+#PYTHONPATH=. ./unstructured/ingest/main.py \
+#  local \
+#  --num-processes "$max_processes" \
+#  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
+#  --output-dir "$OUTPUT_DIR" \
+#  --strategy fast \
+#  --verbose \
+#  --reprocess \
+#  --input-path example-docs/fake-memo.pdf \
+#  --work-dir "$WORK_DIR" \
+#  box \
+#  --box-app-config "$BOX_APP_CONFIG_PATH" \
+#  --remote-url "$DESTINATION_BOX" \
+#
+## Simply check the number of files uploaded
+#expected_num_files=1
diff --git a/unstructured/ingest/cli/cmds/__init__.py b/unstructured/ingest/cli/cmds/__init__.py
index bfc4019eab..b2fa754b54 100644
--- a/unstructured/ingest/cli/cmds/__init__.py
+++ b/unstructured/ingest/cli/cmds/__init__.py
@@ -10,6 +10,7 @@
 from .azure import get_base_src_cmd as azure_base_src_cmd
 from .azure_cognitive_search import get_base_dest_cmd as azure_cognitive_search_base_dest_cmd
 from .biomed import get_base_src_cmd as biomed_base_src_cmd
+from .box import get_base_dest_cmd as box_base_dest_cmd
 from .box import get_base_src_cmd as box_base_src_cmd
 from .confluence import get_base_src_cmd as confluence_base_src_cmd
 from .delta_table import get_base_dest_cmd as delta_table_dest_cmd
@@ -79,6 +80,7 @@
 base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
     azure_base_dest_cmd,
+    box_base_dest_cmd,
     fsspec_base_dest_cmd,
     s3_base_dest_cmd,
     azure_cognitive_search_base_dest_cmd,
     delta_table_dest_cmd,
diff --git a/unstructured/ingest/cli/cmds/box.py b/unstructured/ingest/cli/cmds/box.py
index 35e3c58d9b..8a361152d3 100644
--- a/unstructured/ingest/cli/cmds/box.py
+++ b/unstructured/ingest/cli/cmds/box.py
@@ -9,6 +9,8 @@
 )
 from unstructured.ingest.interfaces import BaseConfig
 
+CMD_NAME = "box"
+
 
 @dataclass
 class BoxCliConfig(BaseConfig, CliMixin):
@@ -27,5 +29,12 @@ def get_cli_options() -> t.List[click.Option]:
 
 
 def get_base_src_cmd() -> BaseSrcCmd:
-    cmd_cls = BaseSrcCmd(cmd_name="box", cli_config=BoxCliConfig, is_fsspec=True)
+    cmd_cls = BaseSrcCmd(cmd_name=CMD_NAME, cli_config=BoxCliConfig, is_fsspec=True)
+    return cmd_cls
+
+
+def get_base_dest_cmd():
+    from unstructured.ingest.cli.base.dest import BaseDestCmd
+
+    cmd_cls = BaseDestCmd(cmd_name=CMD_NAME, cli_config=BoxCliConfig, is_fsspec=True)
     return cmd_cls
diff --git a/unstructured/ingest/connector/box.py b/unstructured/ingest/connector/box.py
index 5c63ecd30d..39922f9192 100644
--- a/unstructured/ingest/connector/box.py
+++ b/unstructured/ingest/connector/box.py
@@ -61,6 +61,7 @@ def __post_init__(self):
         self.ingest_doc_cls: t.Type[BoxIngestDoc] = BoxIngestDoc
 
 
+@requires_dependencies(["boxfs", "fsspec"], extras="box")
 @dataclass
 class BoxDestinationConnector(FsspecDestinationConnector):
     connector_config: SimpleBoxConfig
diff --git a/unstructured/ingest/runner/writers/box.py b/unstructured/ingest/runner/writers/box.py
index 7a8231c8c9..f23e273b9f 100644
--- a/unstructured/ingest/runner/writers/box.py
+++ b/unstructured/ingest/runner/writers/box.py
@@ -1,8 +1,10 @@
 import typing as t
 
 from unstructured.ingest.interfaces import BaseDestinationConnector
+from unstructured.utils import requires_dependencies
 
 
+@requires_dependencies(["boxfs", "fsspec"], extras="box")
 def box_writer(
     remote_url: str,
     box_app_config: t.Optional[str],
     verbose: bool = False,

From 88a6d38177c48b293f9f2b1e1b4a2f43343405e0 Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Wed, 18 Oct 2023 09:37:04 -0400
Subject: [PATCH 06/38] Add dropbox dest connector

---
 .../test-ingest-box-dest.sh | 3 +-
 .../test-ingest-dropbox-dest.sh | 77 +++++++++++++++++++
 .../test-ingest-dropbox.sh | 2 +-
 unstructured/ingest/cli/cmds/__init__.py | 2 +
 unstructured/ingest/cli/cmds/dropbox.py | 11 ++-
 unstructured/ingest/connector/dropbox.py | 1 +
 unstructured/ingest/interfaces.py | 7 ++
 7 files changed, 100 insertions(+), 3 deletions(-)
 create mode 100755 test_unstructured_ingest/test-ingest-dropbox-dest.sh

diff --git a/test_unstructured_ingest/test-ingest-box-dest.sh b/test_unstructured_ingest/test-ingest-box-dest.sh
index cd8791e6e3..248c9b75eb 100755
--- a/test_unstructured_ingest/test-ingest-box-dest.sh
+++ b/test_unstructured_ingest/test-ingest-box-dest.sh
@@ -1,5 +1,6 @@
+#!/usr/bin/env bash
 #TODO currently box api/sdk does not work to create folders and check for content similar to other fsspec ingest tests
-##!/usr/bin/env bash
+
 #
 #set -e
 #
diff --git a/test_unstructured_ingest/test-ingest-dropbox-dest.sh b/test_unstructured_ingest/test-ingest-dropbox-dest.sh
new file mode 100755
index 0000000000..d8dac8598c
--- /dev/null
+++ b/test_unstructured_ingest/test-ingest-dropbox-dest.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+
+set -e
+
+SCRIPT_DIR=$(dirname "$(realpath "$0")")
+cd "$SCRIPT_DIR"/.. || exit 1
+OUTPUT_FOLDER_NAME=dropbox-dest
+OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
+WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+DESTINATION_DROPBOX="/test-output/$(date +%s)"
+
+if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then
+  echo "Skipping Dropbox ingest test because one or more of these env vars is not set:"
+  echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN"
+  exit 0
+fi
+
+# Get a new access token from Dropbox
+DROPBOX_RESPONSE=$(curl -s https://api.dropbox.com/oauth2/token -d refresh_token="$DROPBOX_REFRESH_TOKEN" -d grant_type=refresh_token -d client_id="$DROPBOX_APP_KEY" -d client_secret="$DROPBOX_APP_SECRET")
+DROPBOX_ACCESS_TOKEN=$(jq -r '.access_token' <<< "$DROPBOX_RESPONSE")
+
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR"/cleanup.sh
+function cleanup() {
+  cleanup_dir "$OUTPUT_DIR"
+  cleanup_dir "$WORK_DIR"
+
+  echo "deleting test folder $DESTINATION_DROPBOX"
+  curl -X POST https://api.dropboxapi.com/2/files/delete_v2 \
+    --header "Content-Type: application/json" \
+    --header "Authorization: Bearer $DROPBOX_ACCESS_TOKEN" \
+    --data "{\"path\":\"$DESTINATION_DROPBOX\"}" | jq
+}
+trap cleanup EXIT
+
+# Create new folder for test
+echo "creating temp directory in dropbox for testing: $DESTINATION_DROPBOX"
+response=$(curl -X POST -s -w "\n%{http_code}" https://api.dropboxapi.com/2/files/create_folder_v2 \
+  --header "Content-Type: application/json" \
+  --header "Authorization: Bearer $DROPBOX_ACCESS_TOKEN" \
+  --data "{\"autorename\":false,\"path\":\"$DESTINATION_DROPBOX\"}");
+http_code=$(tail -n1 <<< "$response")  # get the last line
+content=$(sed '$ d' <<< "$response")   # get all but the last line which contains the status code
+
+if [ "$http_code" -ge 300 ]; then
+  echo "Failed to create temp dir in dropbox: [$http_code] $content"
+  exit 1
+else
+  echo "$http_code:"
+  jq <<< "$content"
+fi
+
+PYTHONPATH=. ./unstructured/ingest/main.py \
+  local \
+  --num-processes "$max_processes" \
+  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
+  --output-dir "$OUTPUT_DIR" \
+  --strategy fast \
+  --verbose \
+  --reprocess \
+  --input-path example-docs/fake-memo.pdf \
+  --work-dir "$WORK_DIR" \
+  dropbox \
+  --token "$DROPBOX_ACCESS_TOKEN" \
+  --remote-url "dropbox://$DESTINATION_DROPBOX" \
+
+# Simply check the number of files uploaded
+expected_num_files=1
+num_files_in_dropbox=$(curl -X POST https://api.dropboxapi.com/2/files/list_folder \
+  --header "Content-Type: application/json" \
+  --header "Authorization: Bearer $DROPBOX_ACCESS_TOKEN" \
+  --data "{\"path\":\"$DESTINATION_DROPBOX/example-docs/\"}" | jq '.entries | length')
+if [ "$num_files_in_dropbox" -ne "$expected_num_files" ]; then
+  echo "Expected $expected_num_files files to be uploaded to dropbox, but found $num_files_in_dropbox files."
+  exit 1
+fi
diff --git a/test_unstructured_ingest/test-ingest-dropbox.sh b/test_unstructured_ingest/test-ingest-dropbox.sh
index fb9d09eb7f..697baebfcb 100755
--- a/test_unstructured_ingest/test-ingest-dropbox.sh
+++ b/test_unstructured_ingest/test-ingest-dropbox.sh
@@ -43,7 +43,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
   --verbose \
   --token "$DROPBOX_ACCESS_TOKEN" \
   --recursive \
-  --remote-url "dropbox:// /" \
+  --remote-url "dropbox://test-input/" \
   --work-dir "$WORK_DIR"
 
 
diff --git a/unstructured/ingest/cli/cmds/__init__.py b/unstructured/ingest/cli/cmds/__init__.py
index b2fa754b54..befd85beea 100644
--- a/unstructured/ingest/cli/cmds/__init__.py
+++ b/unstructured/ingest/cli/cmds/__init__.py
@@ -16,6 +16,7 @@ from .delta_table import get_base_dest_cmd as delta_table_dest_cmd
 from .delta_table import get_base_src_cmd as delta_table_base_src_cmd
 from .discord import get_base_src_cmd as discord_base_src_cmd
+from .dropbox import get_base_dest_cmd as dropbox_base_dest_cmd
 from .dropbox import get_base_src_cmd as dropbox_base_src_cmd
 from .elasticsearch import get_base_src_cmd as elasticsearch_base_src_cmd
 from .fsspec import get_base_dest_cmd as fsspec_base_dest_cmd
 from .fsspec import get_base_src_cmd as fsspec_base_src_cmd
@@ -82,6 +83,7 @@
 base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
     azure_base_dest_cmd,
     box_base_dest_cmd,
+    dropbox_base_dest_cmd,
     fsspec_base_dest_cmd,
     s3_base_dest_cmd,
     azure_cognitive_search_base_dest_cmd,
diff --git a/unstructured/ingest/cli/cmds/dropbox.py b/unstructured/ingest/cli/cmds/dropbox.py
index 13f21ca998..0df8b55bb3 100644
--- a/unstructured/ingest/cli/cmds/dropbox.py
+++ b/unstructured/ingest/cli/cmds/dropbox.py
@@ -9,6 +9,8 @@
 )
 from unstructured.ingest.interfaces import BaseConfig
 
+CMD_NAME = "dropbox"
+
 
 @dataclass
 class DropboxCliConfig(BaseConfig, CliMixin):
@@ -27,5 +29,12 @@ def get_cli_options() -> t.List[click.Option]:
 
 
 def get_base_src_cmd() -> BaseSrcCmd:
-    cmd_cls = BaseSrcCmd(cmd_name="dropbox", cli_config=DropboxCliConfig, is_fsspec=True)
+    cmd_cls = BaseSrcCmd(cmd_name=CMD_NAME, cli_config=DropboxCliConfig, is_fsspec=True)
+    return cmd_cls
+
+
+def get_base_dest_cmd():
+    from unstructured.ingest.cli.base.dest import BaseDestCmd
+
+    cmd_cls = BaseDestCmd(cmd_name=CMD_NAME, cli_config=DropboxCliConfig, is_fsspec=True)
     return cmd_cls
diff --git a/unstructured/ingest/connector/dropbox.py b/unstructured/ingest/connector/dropbox.py
index 000e0709d0..edbb1d0b9b 100644
--- a/unstructured/ingest/connector/dropbox.py
+++ b/unstructured/ingest/connector/dropbox.py
@@ -135,6 +135,7 @@ def _list_files(self):
         ]
 
 
+@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
 @dataclass
 class DropboxDestinationConnector(FsspecDestinationConnector):
     connector_config: SimpleFsspecConfig
diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py
index e15312e03b..6c0d79d5ae 100644
--- a/unstructured/ingest/interfaces.py
+++ b/unstructured/ingest/interfaces.py
@@ -132,6 +132,13 @@ def __post_init__(self):
             self.file_path = ""
             return
 
+        # dropbox paths can start with slash
+        match = re.match(rf"{self.protocol}:///([^/\s]+?)/([^\s]*)", self.remote_url)
+        if match and self.protocol == "dropbox":
+            self.dir_path = match.group(1)
+            self.file_path = match.group(2) or ""
+            return
+
         # just a path with no trailing prefix
         match = re.match(rf"{self.protocol}://([^/\s]+?)(/*)$", self.remote_url)
         if match:

From a01edf83cdb1a25fbee78cd1cbb9c75c284f52b9 Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Wed, 18 Oct 2023 11:03:37 -0400
Subject: [PATCH 07/38] WIP: adding gcs dest connector

---
 .../test-ingest-dropbox-dest.sh | 4 ++
 .../test-ingest-gcs-dest.sh | 55 +++++++++++++++++++
 test_unstructured_ingest/test-ingest-gcs.sh | 2 +-
 .../test-ingest-s3-dest.sh | 4 ++
 unstructured/ingest/cli/cmds/__init__.py | 2 +
 unstructured/ingest/cli/cmds/gcs.py | 11 +++-
 unstructured/ingest/connector/gcs.py | 1 +
 7 files changed, 77 insertions(+), 2 deletions(-)
 create mode 100755 test_unstructured_ingest/test-ingest-gcs-dest.sh

diff --git a/test_unstructured_ingest/test-ingest-dropbox-dest.sh b/test_unstructured_ingest/test-ingest-dropbox-dest.sh
index d8dac8598c..cb423aa591 100755
--- a/test_unstructured_ingest/test-ingest-dropbox-dest.sh
+++ b/test_unstructured_ingest/test-ingest-dropbox-dest.sh
@@ -9,6 +9,7 @@ OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
 WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
 max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
 DESTINATION_DROPBOX="/test-output/$(date +%s)"
+CI=${CI:-"false"}
 
 if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then
   echo "Skipping Dropbox ingest test because one or more of these env vars is not set:"
@@ -26,6 +27,9 @@ source "$SCRIPT_DIR"/cleanup.sh
 function cleanup() {
   cleanup_dir "$OUTPUT_DIR"
   cleanup_dir "$WORK_DIR"
+  if [ "$CI" == "true" ]; then
+    cleanup_dir "$DOWNLOAD_DIR"
+  fi
 
   echo "deleting test folder $DESTINATION_DROPBOX"
   curl -X POST https://api.dropboxapi.com/2/files/delete_v2 \
diff --git a/test_unstructured_ingest/test-ingest-gcs-dest.sh b/test_unstructured_ingest/test-ingest-gcs-dest.sh
new file mode 100755
index 0000000000..247d813e30
--- /dev/null
+++ b/test_unstructured_ingest/test-ingest-gcs-dest.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+
+set -e
+
+SCRIPT_DIR=$(dirname "$(realpath "$0")")
+cd "$SCRIPT_DIR"/.. || exit 1
+OUTPUT_FOLDER_NAME=gcs-dest
+OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
+WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+DESTINATION_GCS="gs://utic-ingest-test-fixtures-output/$(date +%s)/"
+CI=${CI:-"false"}
+
+if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
+  echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
+  exit 0
+fi
+
+# Create temporary service key file
+GCP_INGEST_SERVICE_KEY_FILE=$(mktemp)
+echo "$GCP_INGEST_SERVICE_KEY" > "$GCP_INGEST_SERVICE_KEY_FILE"
+
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR"/cleanup.sh
+function cleanup() {
+  cleanup_dir "$OUTPUT_DIR"
+  cleanup_dir "$WORK_DIR"
+  if [ "$CI" == "true" ]; then
+    cleanup_dir "$DOWNLOAD_DIR"
+  fi
+}
+
+trap cleanup EXIT
+
+PYTHONPATH=. ./unstructured/ingest/main.py \
+  local \
+  --num-processes "$max_processes" \
+  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
+  --output-dir "$OUTPUT_DIR" \
+  --strategy fast \
+  --verbose \
+  --reprocess \
+  --input-path example-docs/fake-memo.pdf \
+  --work-dir "$WORK_DIR" \
+  gcs \
+  --token "$GCP_INGEST_SERVICE_KEY_FILE" \
+  --remote-url "$DESTINATION_GCS"
+
+# Simply check the number of files uploaded
+expected_num_files=1
+#num_files_in_s3=$(aws s3 ls "$DESTINATION_S3/example-docs/" --region us-east-2 | wc -l)
+#if [ "$num_files_in_s3" -ne "$expected_num_files" ]; then
+#  echo "Expected $expected_num_files files to be uploaded to s3, but found $num_files_in_s3 files."
+#  exit 1
+#fi
diff --git a/test_unstructured_ingest/test-ingest-gcs.sh b/test_unstructured_ingest/test-ingest-gcs.sh
index 4ce6cf227d..2efe7cc257 100755
--- a/test_unstructured_ingest/test-ingest-gcs.sh
+++ b/test_unstructured_ingest/test-ingest-gcs.sh
@@ -29,7 +29,7 @@ fi
 
 # Create temporary service key file
 GCP_INGEST_SERVICE_KEY_FILE=$(mktemp)
-echo "$GCP_INGEST_SERVICE_KEY" >"$GCP_INGEST_SERVICE_KEY_FILE"
+echo "$GCP_INGEST_SERVICE_KEY" > "$GCP_INGEST_SERVICE_KEY_FILE"
 
 PYTHONPATH=. ./unstructured/ingest/main.py \
   gcs \
diff --git a/test_unstructured_ingest/test-ingest-s3-dest.sh b/test_unstructured_ingest/test-ingest-s3-dest.sh
index d2031f1291..ab6e8e2353 100755
--- a/test_unstructured_ingest/test-ingest-s3-dest.sh
+++ b/test_unstructured_ingest/test-ingest-s3-dest.sh
@@ -9,12 +9,16 @@ OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
 WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
 max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
 DESTINATION_S3="s3://utic-dev-tech-fixtures/utic-ingest-test-fixtures-output/$(date +%s)/"
+CI=${CI:-"false"}
 
 # shellcheck disable=SC1091
 source "$SCRIPT_DIR"/cleanup.sh
 function cleanup() {
   cleanup_dir "$OUTPUT_DIR"
   cleanup_dir "$WORK_DIR"
+  if [ "$CI" == "true" ]; then
+    cleanup_dir "$DOWNLOAD_DIR"
+  fi
 
   if aws s3 ls "$DESTINATION_S3" --region us-east-2; then
     echo "deleting destination s3 location: $DESTINATION_S3"
diff --git a/unstructured/ingest/cli/cmds/__init__.py b/unstructured/ingest/cli/cmds/__init__.py
index befd85beea..7a75a5666b 100644
--- a/unstructured/ingest/cli/cmds/__init__.py
+++ b/unstructured/ingest/cli/cmds/__init__.py
@@ -21,6 +21,7 @@
 from .elasticsearch import get_base_src_cmd as elasticsearch_base_src_cmd
 from .fsspec import get_base_dest_cmd as fsspec_base_dest_cmd
 from .fsspec import get_base_src_cmd as fsspec_base_src_cmd
+from .gcs import get_base_dest_cmd as gcs_base_dest_cmd
 from .gcs import get_base_src_cmd as gcs_base_src_cmd
 from .github import get_base_src_cmd as github_base_src_cmd
 from .gitlab import get_base_src_cmd as gitlab_base_src_cmd
@@ -84,6 +85,7 @@
     box_base_dest_cmd,
     dropbox_base_dest_cmd,
     fsspec_base_dest_cmd,
+    gcs_base_dest_cmd,
     s3_base_dest_cmd,
     azure_cognitive_search_base_dest_cmd,
     delta_table_dest_cmd,
diff --git a/unstructured/ingest/cli/cmds/gcs.py b/unstructured/ingest/cli/cmds/gcs.py
index ccec32491b..cb6ea80436 100644
--- a/unstructured/ingest/cli/cmds/gcs.py
+++ b/unstructured/ingest/cli/cmds/gcs.py
@@ -9,6 +9,8 @@
 )
 from unstructured.ingest.interfaces import BaseConfig
 
+CMD_NAME = "gcs"
+
 
 @dataclass
 class GcsCliConfig(BaseConfig, CliMixin):
@@ -29,5 +31,12 @@ def get_cli_options() -> t.List[click.Option]:
 
 
 def get_base_src_cmd() -> BaseSrcCmd:
-    cmd_cls = BaseSrcCmd(cmd_name="gcs", cli_config=GcsCliConfig, is_fsspec=True)
+    cmd_cls = BaseSrcCmd(cmd_name=CMD_NAME, cli_config=GcsCliConfig, is_fsspec=True)
+    return cmd_cls
+
+
+def get_base_dest_cmd():
+    from unstructured.ingest.cli.base.dest import BaseDestCmd
+
+    cmd_cls = BaseDestCmd(cmd_name=CMD_NAME, cli_config=GcsCliConfig, is_fsspec=True)
     return cmd_cls
diff --git a/unstructured/ingest/connector/gcs.py b/unstructured/ingest/connector/gcs.py
index 1a75fef2ea..992bb42c5a 100644
--- a/unstructured/ingest/connector/gcs.py
+++ b/unstructured/ingest/connector/gcs.py
@@ -35,6 +35,7 @@ def __post_init__(self):
         self.ingest_doc_cls: Type[GcsIngestDoc] = GcsIngestDoc
 
 
+@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
 @dataclass
 class GcsDestinationConnector(FsspecDestinationConnector):
     connector_config: SimpleGcsConfig

From bbbe539de45c766d23b78bf76ddf8a904669c112 Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Wed, 18 Oct 2023 15:09:36 -0400
Subject: [PATCH 08/38] finish setting up e2e test for gcs

---
 .../test-ingest-gcs-dest.sh | 18 ++++++++++++------
 unstructured/ingest/connector/fsspec.py | 12 ++++++++----
 unstructured/ingest/connector/local.py | 2 +-
 unstructured/ingest/interfaces.py | 9 +++++++++
 4 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/test_unstructured_ingest/test-ingest-gcs-dest.sh b/test_unstructured_ingest/test-ingest-gcs-dest.sh
index 247d813e30..2437b68051 100755
--- a/test_unstructured_ingest/test-ingest-gcs-dest.sh
+++ b/test_unstructured_ingest/test-ingest-gcs-dest.sh
@@ -8,7 +8,7 @@ OUTPUT_FOLDER_NAME=gcs-dest
 OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
 WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
 max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
-DESTINATION_GCS="gs://utic-ingest-test-fixtures-output/$(date +%s)/"
+DESTINATION_GCS="gs://utic-test-ingest-fixtures-output/$(date +%s)"
 CI=${CI:-"false"}
 
 if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
@@ -28,6 +28,12 @@ function cleanup() {
   if [ "$CI" == "true" ]; then
     cleanup_dir "$DOWNLOAD_DIR"
   fi
+
+  if gcloud storage ls "$DESTINATION_GCS"; then
+    echo "deleting $DESTINATION_GCS"
+    gcloud storage rm --recursive "$DESTINATION_GCS"
+  fi
+
 }
 
 trap cleanup EXIT
@@ -48,8 +54,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
 
 # Simply check the number of files uploaded
 expected_num_files=1
-#num_files_in_s3=$(aws s3 ls "$DESTINATION_S3/example-docs/" --region us-east-2 | wc -l)
-#if [ "$num_files_in_s3" -ne "$expected_num_files" ]; then
-#  echo "Expected $expected_num_files files to be uploaded to s3, but found $num_files_in_s3 files."
-#  exit 1
-#fi
+num_files_in_gcs=$(gcloud storage ls "$DESTINATION_GCS"/example-docs/ | wc -l )
+if [ "$num_files_in_gcs" -ne "$expected_num_files" ]; then
+  echo "Expected $expected_num_files files to be uploaded to gcs, but found $num_files_in_gcs files."
+  exit 1
+fi
diff --git a/unstructured/ingest/connector/fsspec.py b/unstructured/ingest/connector/fsspec.py
index 2ffbebca55..1b3cf5bfbc 100644
--- a/unstructured/ingest/connector/fsspec.py
+++ b/unstructured/ingest/connector/fsspec.py
@@ -256,9 +256,13 @@ def write_dict(
 
         logger.info(f"Writing content using filesystem: {type(fs).__name__}")
 
-        dest_folder = self.connector_config.path_without_protocol
-        dest_output_path = str(PurePath(dest_folder, filename)) if filename else dest_folder
-        full_dest_path = f"{self.connector_config.protocol}://{dest_output_path}"
+        output_folder = self.connector_config.path_without_protocol
+        output_folder = os.path.join(output_folder)  # Make sure folder ends with file seperator
+        filename = (
+            filename.strip(os.sep) if filename else filename
+        )  # Make sure filename doesn't begin with file seperator
+        output_path = str(PurePath(output_folder, filename)) if filename else output_folder
+        full_dest_path = f"{self.connector_config.protocol}://{output_path}"
         logger.debug(f"uploading content to {full_dest_path}")
         fs.write_text(
             full_dest_path,
@@ -269,7 +273,7 @@ def write_dict(
 
     def write(self, docs: t.List[BaseIngestDoc]) -> None:
         for doc in docs:
-            file_path = doc.base_filename
+            file_path = doc.base_output_filename
             filename = file_path if file_path else None
             with open(doc._output_filename) as json_file:
                 logger.debug(f"uploading content from {doc._output_filename}")
diff --git a/unstructured/ingest/connector/local.py b/unstructured/ingest/connector/local.py
index f58d379ae9..7c753acb5d 100644
--- a/unstructured/ingest/connector/local.py
+++ b/unstructured/ingest/connector/local.py
@@ -78,7 +78,7 @@ def _output_filename(self) -> Path:
         """
         input_path = Path(self.connector_config.input_path)
         basename = (
-            f"{Path(self.path).name}.json"
+            f"{self.base_filename}.json"
             if input_path.is_file()
             else f"{Path(self.path).relative_to(input_path)}.json"
         )
diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py
index 6c0d79d5ae..382962ef3c 100644
--- a/unstructured/ingest/interfaces.py
+++ b/unstructured/ingest/interfaces.py
@@ -341,6 +341,15 @@ def base_filename(self) -> t.Optional[str]:
             return base_path
         return None
 
+    @property
+    def base_output_filename(self) -> t.Optional[str]:
+        if self.processor_config.output_dir and self._output_filename:
+            output_path = str(Path(self.processor_config.output_dir).resolve())
+            full_path = str(self._output_filename)
+            base_path = full_path.replace(output_path, "")
+            return base_path
+        return None
+
     @property
     @abstractmethod
     def _output_filename(self):

From 85c5cb8582b295d86495f321b65a5b917be76a15 Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Wed, 18 Oct 2023 15:13:00 -0400
Subject: [PATCH 09/38] update changelog

---
 CHANGELOG.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 30cce43b31..550f85e1c7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -35,6 +35,11 @@
 ### Enhancements
 
 * **Add CI evaluation workflow** Adds evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
+* **Fsspec downstream connectors** New destination connector added to ingest CLI, users may now use `unstructured-ingest` to write to any of the following:
+  * Azure
+  * Box
+  * Dropbox
+  * Google Cloud Service
 
 ### Features

From 50bc26334d209b2045cf771826c420eb02100945 Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Tue, 24 Oct 2023 10:26:07 -0400
Subject: [PATCH 10/38] Add cloud login for az and gcloud in CI

---
 .github/workflows/ci.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5678813004..83a24ebc94 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -301,6 +301,20 @@ jobs:
       run: |
         source .venv/bin/activate
         PYTHONPATH=. pytest test_unstructured_ingest/unit
+    - name: 'Google Cloud Auth'
+      uses: 'google-github-actions/auth@v1'
+      with:
+        create_credentials_file: 'true'
+        workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
+        service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }}
+    - name: 'Set up Cloud SDK'
+      uses: 'google-github-actions/setup-gcloud@v1'
+    - name: 'Az CLI login'
+      uses: azure/login@v1
+      with:
+        client-id: ${{ secrets.AZURE_CLIENT_ID }}
+        tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+        subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
     - name: Test (end-to-end)
       env:
         AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }}

From 20a207c25f276a156e370203c494c5e7be1c35ce Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Tue, 24 Oct 2023 10:30:23 -0400
Subject: [PATCH 11/38] Add dest tests to ingest script

---
 test_unstructured_ingest/test-ingest.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh
index 1a9379f173..423eff8e04 100755
--- a/test_unstructured_ingest/test-ingest.sh
+++ b/test_unstructured_ingest/test-ingest.sh
@@ -9,6 +9,11 @@ cd "$SCRIPT_DIR"/.. || exit 1
 export OMP_THREAD_LIMIT=1
 
 all_tests=(
+'test-ingest-azure-dest.sh'
+'test-ingest-box-dest.sh'
+'test-ingest-dropbox-dest.sh'
+'test-ingest-gcs-dest.sh'
+'test-ingest-s3-dest.sh'
 'test-ingest-s3.sh'
 'test-ingest-s3-minio.sh'
 'test-ingest-azure.sh'

From 88a6d38177c48b293f9f2b1e1b4a2f43343405e0 Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Tue, 24 Oct 2023 12:54:56 -0400
Subject: [PATCH 12/38] Add permissions to CI

---
 .github/workflows/ci.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 83a24ebc94..cf51ae1d41 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -13,6 +13,9 @@ env:
 
 jobs:
   setup:
+    permissions:
+      contents: 'read'
+      id-token: 'write'
     strategy:
       matrix:
         python-version: ["3.8","3.9","3.10","3.11"]

From a1e743a1c0ab65473ae064864358cf8fb98838c2 Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Tue, 24 Oct 2023 13:51:39 -0400
Subject: [PATCH 13/38] Debugging CI

---
 .github/workflows/ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cf51ae1d41..1395d574d2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -13,9 +13,6 @@ env:
 
 jobs:
   setup:
-    permissions:
-      contents: 'read'
-      id-token: 'write'
     strategy:
       matrix:
         python-version: ["3.8","3.9","3.10","3.11"]
@@ -271,6 +268,9 @@
 
 
   test_ingest:
+    permissions:
+      contents: 'read'
+      id-token: 'write'
     strategy:
       matrix:
        python-version: ["3.8","3.9","3.10","3.11"]

From e17cc43ba956eb0fbae13c782a348c24f8ad320c Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Tue, 24 Oct 2023 15:01:11 -0400
Subject: [PATCH 14/38] Add generic kwargs input for all writers

---
 unstructured/ingest/runner/writers/azure.py | 1 +
 unstructured/ingest/runner/writers/box.py   | 1 +
 unstructured/ingest/runner/writers/gcs.py   | 1 +
 3 files changed, 3 insertions(+)

diff --git a/unstructured/ingest/runner/writers/azure.py b/unstructured/ingest/runner/writers/azure.py
index a0ba68ce7f..306825eb2f 100644
--- a/unstructured/ingest/runner/writers/azure.py
+++ b/unstructured/ingest/runner/writers/azure.py
@@ -10,6 +10,7 @@ def azure_writer(
     connection_string: t.Optional[str] = None,
     overwrite: bool = False,
     verbose: bool = False,
+    **kwargs,
 ) -> BaseDestinationConnector:
     from unstructured.ingest.connector.azure import (
         AzureBlobStorageDestinationConnector,
diff --git a/unstructured/ingest/runner/writers/box.py b/unstructured/ingest/runner/writers/box.py
index f23e273b9f..8dfb0bf901 100644
--- a/unstructured/ingest/runner/writers/box.py
+++ b/unstructured/ingest/runner/writers/box.py
@@ -9,6 +9,7 @@ def box_writer(
     remote_url: str,
     box_app_config: t.Optional[str],
     verbose: bool = False,
+    **kwargs,
 ) -> BaseDestinationConnector:
     import boxsdk
 
diff --git a/unstructured/ingest/runner/writers/gcs.py b/unstructured/ingest/runner/writers/gcs.py
index c221fe959f..3f0000d26a 100644
--- a/unstructured/ingest/runner/writers/gcs.py
+++ b/unstructured/ingest/runner/writers/gcs.py
@@ -7,6 +7,7 @@ def gcs_writer(
     remote_url: str,
     token: t.Optional[str],
     verbose: bool = False,
+    **kwargs,
 ) -> BaseDestinationConnector:
     from unstructured.ingest.connector.fsspec import FsspecWriteConfig
     from unstructured.ingest.connector.gcs import (

From c36a734c3c5aa3a3ce865016886960e0de436424 Mon Sep 17 00:00:00 2001
From: Roman Isecke
Date: Tue, 24 Oct 2023 15:57:04 -0400
Subject: [PATCH 15/38] debugging CI

---
 .github/workflows/ci.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1395d574d2..5aebbd44b1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -304,6 +304,8 @@ jobs: run: | source .venv/bin/activate PYTHONPATH=. pytest test_unstructured_ingest/unit + # actions/checkout MUST come before auth + - uses: 'actions/checkout@v4' - name: 'Google Cloud Auth' uses: 'google-github-actions/auth@v1' with: From 84dfbdb51b7d43e9d73d223ac1be508c6aee36d3 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Tue, 24 Oct 2023 16:22:50 -0400 Subject: [PATCH 16/38] debugging CI --- .github/workflows/ci.yml | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5aebbd44b1..db112a84c8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -279,7 +279,22 @@ jobs: NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup_ingest, lint] steps: - - uses: actions/checkout@v3 + # actions/checkout MUST come before auth + - uses: 'actions/checkout@v4' + - name: 'Google Cloud Auth' + uses: 'google-github-actions/auth@v1' + with: + create_credentials_file: 'true' + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v1' + - name: 'Az CLI login' + uses: azure/login@v1 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -304,22 +319,6 @@ jobs: run: | source .venv/bin/activate PYTHONPATH=. pytest test_unstructured_ingest/unit - # actions/checkout MUST come before auth - - uses: 'actions/checkout@v4' - - name: 'Google Cloud Auth' - uses: 'google-github-actions/auth@v1' - with: - create_credentials_file: 'true' - workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} - service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} - - name: 'Set up Cloud SDK' - uses: 'google-github-actions/setup-gcloud@v1' - - name: 'Az CLI login' - uses: azure/login@v1 - with: - client-id: ${{ secrets.AZURE_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - name: Test (end-to-end) env: AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }} From 279c4bd5d29265f42568f2aeed6117b509c408d2 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Tue, 24 Oct 2023 16:57:22 -0400 Subject: [PATCH 17/38] debugging CI --- .github/workflows/ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db112a84c8..137addfcdf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -284,7 +284,6 @@ jobs: - name: 'Google Cloud Auth' uses: 'google-github-actions/auth@v1' with: - create_credentials_file: 'true' workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} - name: 'Set up Cloud SDK' From 00d4aa4c0d34737caef395601274ed416e6ac181 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Wed, 25 Oct 2023 10:01:38 -0400 Subject: [PATCH 18/38] Add cloud auth to upgest ingest job --- .../workflows/ingest-test-fixtures-update-pr.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml 
b/.github/workflows/ingest-test-fixtures-update-pr.yml index 913ee0db18..d88f1ad140 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -80,7 +80,21 @@ jobs: NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup_ingest] steps: - - uses: actions/checkout@v3 + # actions/checkout MUST come before auth + - uses: 'actions/checkout@v4' + - name: 'Google Cloud Auth' + uses: 'google-github-actions/auth@v1' + with: + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v1' + - name: 'Az CLI login' + uses: azure/login@v1 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - uses: actions/cache/restore@v3 id: virtualenv-cache with: From a0e4c8c11d0d14e903719a46b9b2c70ccf91d345 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Wed, 25 Oct 2023 10:10:28 -0400 Subject: [PATCH 19/38] Add permissions to upgest ingest job --- .github/workflows/ingest-test-fixtures-update-pr.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index d88f1ad140..73f4aa5076 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -75,6 +75,9 @@ jobs: make install-all-ingest update-fixtures-and-pr: + permissions: + contents: 'read' + id-token: 'write' runs-on: ubuntu-latest-m env: NLTK_DATA: ${{ github.workspace }}/nltk_data From 275062f2e7e797a0168a311fa6f2872fee0c626a Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Wed, 25 Oct 2023 10:39:11 -0400 Subject: [PATCH 20/38] debugging CI --- .github/workflows/ingest-test-fixtures-update-pr.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 73f4aa5076..ff48f25238 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -98,6 +98,10 @@ jobs: client-id: ${{ secrets.AZURE_CLIENT_ID }} tenant-id: ${{ secrets.AZURE_TENANT_ID }} subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + - name: Azure CLI script + uses: azure/CLI@v1 + with: + azcliversion: latest - uses: actions/cache/restore@v3 id: virtualenv-cache with: From 0cf5fb22235d6ebab44429e1503c62a065ae7552 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Wed, 25 Oct 2023 10:52:45 -0400 Subject: [PATCH 21/38] debugging CI --- .../ingest-test-fixtures-update-pr.yml | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index ff48f25238..e6312c6c68 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -98,10 +98,6 @@ jobs: client-id: ${{ secrets.AZURE_CLIENT_ID }} tenant-id: ${{ secrets.AZURE_TENANT_ID }} subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - - name: Azure CLI script - uses: azure/CLI@v1 - with: - azcliversion: latest - uses: actions/cache/restore@v3 id: virtualenv-cache with: @@ -160,15 +156,18 @@ jobs: OCR_AGENT: "tesseract" OVERWRITE_FIXTURES: "true" CI: "true" - run: | - source .venv/bin/activate - sudo 
apt-get update - sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc - sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 - sudo apt-get install -y tesseract-ocr - sudo apt-get install -y tesseract-ocr-kor - tesseract --version - ./test_unstructured_ingest/test-ingest.sh + uses: azure/CLI@v1 + with: + azcliversion: latest + inlineScript: | + source .venv/bin/activate + sudo apt-get update + sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc + sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 + sudo apt-get install -y tesseract-ocr + sudo apt-get install -y tesseract-ocr-kor + tesseract --version + ./test_unstructured_ingest/test-ingest.sh - name: Save branch name to environment file id: branch From e54c1b9fedcd6b5ef35b7065e1a9866be512069d Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Wed, 25 Oct 2023 11:14:36 -0400 Subject: [PATCH 22/38] debugging CI --- .../workflows/ingest-test-fixtures-update-pr.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index e6312c6c68..ede3b43884 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -83,7 +83,12 @@ jobs: NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup_ingest] steps: - # actions/checkout MUST come before auth + - name: 'Az CLI login' + uses: azure/login@v1 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - uses: 'actions/checkout@v4' - name: 'Google Cloud Auth' uses: 'google-github-actions/auth@v1' @@ -92,12 +97,6 @@ jobs: service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} - name: 'Set up Cloud SDK' uses: 'google-github-actions/setup-gcloud@v1' - - name: 'Az CLI login' - uses: azure/login@v1 - with: - client-id: ${{ secrets.AZURE_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - uses: actions/cache/restore@v3 id: virtualenv-cache with: From ef659fd6554238a378f822f9821c8d7d65758592 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Wed, 25 Oct 2023 11:25:57 -0400 Subject: [PATCH 23/38] debugging CI --- .github/workflows/ingest-test-fixtures-update-pr.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index ede3b43884..57693b466b 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -83,6 +83,14 @@ jobs: NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup_ingest] steps: + - name: Install Azure cli + run: | + sudo apt-get install ca-certificates curl apt-transport-https lsb-release gnupg + curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null + AZ_REPO=$(lsb_release -cs) + echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ $AZ_REPO main" | sudo tee /etc/apt/sources.list.d/azure-cli.list + sudo apt-get update + sudo apt-get install azure-cli - name: 'Az CLI login' uses: azure/login@v1 with: From e5bf53334797fb333a6a96e566cb37a734cb4c40 Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Wed, 25 Oct 2023 08:41:18 -0700 Subject: [PATCH 24/38] move permissions to top level --- .github/workflows/ci.yml | 7 ++++--- 1 file 
changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 137addfcdf..8991e539fb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,6 +11,10 @@ on: env: GHA_CACHE_KEY_VERSION: "v1" +permissions: + id-token: write + contents: read + jobs: setup: strategy: @@ -268,9 +272,6 @@ jobs: test_ingest: - permissions: - contents: 'read' - id-token: 'write' strategy: matrix: python-version: ["3.8","3.9","3.10","3.11"] From de47529e37022dda61e45847cbe67014b7c87201 Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Wed, 25 Oct 2023 08:41:57 -0700 Subject: [PATCH 25/38] move permissions to top level --- .github/workflows/ingest-test-fixtures-update-pr.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 57693b466b..e01324996a 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -8,6 +8,10 @@ env: GHA_CACHE_KEY_VERSION: "v1" PYTHON_VERSION: "3.10" +permissions: + id-token: write + contents: read + jobs: setup: runs-on: ubuntu-latest @@ -75,9 +79,6 @@ jobs: make install-all-ingest update-fixtures-and-pr: - permissions: - contents: 'read' - id-token: 'write' runs-on: ubuntu-latest-m env: NLTK_DATA: ${{ github.workspace }}/nltk_data From 718b789de3faeafb08b9e4c41a498924819e0829 Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Wed, 25 Oct 2023 08:47:37 -0700 Subject: [PATCH 26/38] bump version --- CHANGELOG.md | 14 +++++++------- unstructured/__version__.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 550f85e1c7..450276e9e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.28-dev4 +## 0.10.28-dev5 ### Enhancements @@ -6,6 +6,11 @@ * **Add table structure evaluation helpers** Adds functions to evaluate the similarity between predicted table structure and actual table structure. * **Use `yolox` by default for table extraction when partitioning pdf/image** `yolox` model provides higher recall of the table regions than the quantized version and it is now the default element detection model when `infer_table_structure=True` for partitioning pdf/image files * **Remove pdfminer elements from inside tables** Previously, when using `hi_res` some elements where extracted using pdfminer too, so we removed pdfminer from the tables pipeline to avoid duplicated elements. +* **Fsspec downstream connectors** New destination connector added to ingest CLI, users may now use `unstructured-ingest` to write to any of the following: + * Azure + * Box + * Dropbox + * Google Cloud Service ### Features @@ -35,11 +40,6 @@ ### Enhancements * **Add CI evaluation workflow** Adds evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance. -* **Fsspec downstream connectors** New destination connector added to ingest CLI, users may now use `unstructured-ingest` to write to any of the following: - * Azure - * Box - * Dropbox - * Google Cloud Service ### Features @@ -1614,4 +1614,4 @@ of an email. 
## 0.2.0 -* Initial release of unstructured +* Initial release of unstructured \ No newline at end of file diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 606e14436c..e24bf1b0e8 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.28-dev4" # pragma: no cover +__version__ = "0.10.28-dev5" # pragma: no cover From d4742e603b73651f94189ec1a439e0145f0919b5 Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Wed, 25 Oct 2023 09:26:11 -0700 Subject: [PATCH 27/38] activate gcp credentials --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8991e539fb..788a1c0ac0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -287,6 +287,8 @@ jobs: with: workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} + create_credentials_file: true + activate_credentials_file: true - name: 'Set up Cloud SDK' uses: 'google-github-actions/setup-gcloud@v1' - name: 'Az CLI login' From 80aae6614693b68593a201af7dfa1ca02175e4ad Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Wed, 25 Oct 2023 09:27:44 -0700 Subject: [PATCH 28/38] test logins --- .github/workflows/ci.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 788a1c0ac0..9f85b55e0a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,6 +16,25 @@ permissions: contents: read jobs: + test_logins: + runs-on: ubuntu-latest + steps: + - uses: 'actions/checkout@v4' + - name: 'Google Cloud Auth' + uses: 'google-github-actions/auth@v1' + with: + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} + create_credentials_file: true + activate_credentials_file: true + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v1' + - name: 'Az CLI login' + uses: azure/login@v1 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} setup: strategy: matrix: From 71ccbc16ac84a58458899b215699f5d86652aeba Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Wed, 25 Oct 2023 09:30:52 -0700 Subject: [PATCH 29/38] add login command --- .github/workflows/ci.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9f85b55e0a..9adbb81a77 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,9 +26,12 @@ jobs: workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} create_credentials_file: true - activate_credentials_file: true - name: 'Set up Cloud SDK' uses: 'google-github-actions/setup-gcloud@v1' + - id: 'gcloud' + name: 'gcloud' + run: |- + gcloud auth login --brief --cred-file="${{ steps.auth.outputs.credentials_file_path }}" - name: 'Az CLI login' uses: azure/login@v1 with: From 8c366a17354dc3eaae2f87e95a23b88d8e3a9157 Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Wed, 25 Oct 2023 09:33:25 -0700 Subject: [PATCH 30/38] update path to credentials file --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9adbb81a77..874a1e6a66 100644 --- a/.github/workflows/ci.yml +++ 
b/.github/workflows/ci.yml @@ -22,6 +22,7 @@ jobs: - uses: 'actions/checkout@v4' - name: 'Google Cloud Auth' uses: 'google-github-actions/auth@v1' + id: gauth with: workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} @@ -31,7 +32,7 @@ jobs: - id: 'gcloud' name: 'gcloud' run: |- - gcloud auth login --brief --cred-file="${{ steps.auth.outputs.credentials_file_path }}" + gcloud auth login --brief --cred-file="${{ steps.gauth.outputs.credentials_file_path }}" - name: 'Az CLI login' uses: azure/login@v1 with: From d17a588890b390e4b281bbd0ce83ec40a892f3ba Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Wed, 25 Oct 2023 09:37:14 -0700 Subject: [PATCH 31/38] add test commnds --- .github/workflows/ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 874a1e6a66..ba1896a85f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,12 +33,18 @@ jobs: name: 'gcloud' run: |- gcloud auth login --brief --cred-file="${{ steps.gauth.outputs.credentials_file_path }}" + - name: 'run gcloud command' + run: |- + gcloud projects list - name: 'Az CLI login' uses: azure/login@v1 with: client-id: ${{ secrets.AZURE_CLIENT_ID }} tenant-id: ${{ secrets.AZURE_TENANT_ID }} subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + - name: 'azure test command' + run: |- + az account show setup: strategy: matrix: From e3c2df77d54a091260ce1104ee045f365e6af4b4 Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Wed, 25 Oct 2023 09:58:33 -0700 Subject: [PATCH 32/38] remove extra steps for gcloud --- .github/workflows/ci.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ba1896a85f..a67ec9d57e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,13 +26,6 @@ jobs: with: workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} - create_credentials_file: true - - name: 'Set up Cloud SDK' - uses: 'google-github-actions/setup-gcloud@v1' - - id: 'gcloud' - name: 'gcloud' - run: |- - gcloud auth login --brief --cred-file="${{ steps.gauth.outputs.credentials_file_path }}" - name: 'run gcloud command' run: |- gcloud projects list From f77322318d867abe30bd08bed086388b0f0443d7 Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Wed, 25 Oct 2023 10:04:08 -0700 Subject: [PATCH 33/38] add back setup cloud sdk --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a67ec9d57e..38160bbdc6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,6 +26,8 @@ jobs: with: workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v1' - name: 'run gcloud command' run: |- gcloud projects list From 37edcd3d570e9581df949a2d856a2f1e192871e0 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Thu, 26 Oct 2023 09:50:44 -0400 Subject: [PATCH 34/38] debugging CI --- .../ingest-test-fixtures-update-pr.yml | 75 ++++++++++++------- 1 file changed, 49 insertions(+), 26 deletions(-) diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index e01324996a..1dfd48c40e 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ 
b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -13,6 +13,30 @@ permissions: contents: read jobs: + test_logins: + runs-on: ubuntu-latest + steps: + - uses: 'actions/checkout@v4' + - name: 'Google Cloud Auth' + uses: 'google-github-actions/auth@v1' + id: gauth + with: + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v1' + - name: 'run gcloud command' + run: |- + gcloud projects list + - name: 'Az CLI login' + uses: azure/login@v1 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + - name: 'azure test command' + run: |- + az account show setup: runs-on: ubuntu-latest if: | @@ -84,28 +108,30 @@ jobs: NLTK_DATA: ${{ github.workspace }}/nltk_data needs: [setup_ingest] steps: - - name: Install Azure cli - run: | - sudo apt-get install ca-certificates curl apt-transport-https lsb-release gnupg - curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null - AZ_REPO=$(lsb_release -cs) - echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ $AZ_REPO main" | sudo tee /etc/apt/sources.list.d/azure-cli.list - sudo apt-get update - sudo apt-get install azure-cli - - name: 'Az CLI login' - uses: azure/login@v1 - with: - client-id: ${{ secrets.AZURE_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + # actions/checkout MUST come before auth - uses: 'actions/checkout@v4' - name: 'Google Cloud Auth' uses: 'google-github-actions/auth@v1' with: workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} + create_credentials_file: true + activate_credentials_file: true - name: 'Set up Cloud SDK' uses: 'google-github-actions/setup-gcloud@v1' + - name: 'Az CLI login' + uses: azure/login@v1 + with: + client-id: ${{ secrets.AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.AZURE_TENANT_ID }} + subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + - name: Get full Python version + id: full-python-version + run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT - uses: actions/cache/restore@v3 id: virtualenv-cache with: @@ -164,18 +190,15 @@ jobs: OCR_AGENT: "tesseract" OVERWRITE_FIXTURES: "true" CI: "true" - uses: azure/CLI@v1 - with: - azcliversion: latest - inlineScript: | - source .venv/bin/activate - sudo apt-get update - sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc - sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 - sudo apt-get install -y tesseract-ocr - sudo apt-get install -y tesseract-ocr-kor - tesseract --version - ./test_unstructured_ingest/test-ingest.sh + run: | + source .venv/bin/activate + sudo apt-get update + sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc + sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 + sudo apt-get install -y tesseract-ocr + sudo apt-get install -y tesseract-ocr-kor + tesseract --version + ./test_unstructured_ingest/test-ingest.sh - name: Save branch name to environment file id: branch From 9deb94a601a88b379b213216237d729719124007 Mon Sep 17 00:00:00 2001 
From: Trevor Bossert Date: Thu, 26 Oct 2023 09:37:34 -0700 Subject: [PATCH 35/38] set environment for azure federated login --- .github/workflows/ci.yml | 1 + .github/workflows/ingest-test-fixtures-update-pr.yml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 38160bbdc6..effb786452 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -296,6 +296,7 @@ jobs: test_ingest: + environment: ci strategy: matrix: python-version: ["3.8","3.9","3.10","3.11"] diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 1dfd48c40e..96ba805b36 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -15,6 +15,7 @@ permissions: jobs: test_logins: runs-on: ubuntu-latest + environment: ci steps: - uses: 'actions/checkout@v4' - name: 'Google Cloud Auth' @@ -103,6 +104,7 @@ jobs: make install-all-ingest update-fixtures-and-pr: + environment: ci runs-on: ubuntu-latest-m env: NLTK_DATA: ${{ github.workspace }}/nltk_data From 482ccc0cd7f3455c51411f98ac1f80d7a73e8814 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Thu, 26 Oct 2023 15:06:28 -0400 Subject: [PATCH 36/38] Fix s3 dest test --- test_unstructured_ingest/test-ingest-s3-dest.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_unstructured_ingest/test-ingest-s3-dest.sh b/test_unstructured_ingest/test-ingest-s3-dest.sh index ab6e8e2353..1d1db411eb 100755 --- a/test_unstructured_ingest/test-ingest-s3-dest.sh +++ b/test_unstructured_ingest/test-ingest-s3-dest.sh @@ -44,8 +44,10 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ # Simply check the number of files uploaded expected_num_files=1 -num_files_in_s3=$(aws s3 ls "$DESTINATION_S3/example-docs/" --region us-east-2 | wc -l) +num_files_in_s3=$(aws s3 ls "${DESTINATION_S3}example-docs/" --region us-east-2 | grep "\.json$" | wc -l) if [ "$num_files_in_s3" -ne "$expected_num_files" ]; then echo "Expected $expected_num_files files to be uploaded to s3, but found $num_files_in_s3 files." exit 1 +else + echo "Expected number of files found: $num_files_in_s3/$expected_num_files" fi From 3a350c8b3bac7b51ba42f3629d70db3e6651239a Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Thu, 26 Oct 2023 16:01:30 -0400 Subject: [PATCH 37/38] fix shellcheck --- test_unstructured_ingest/test-ingest-s3-dest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_unstructured_ingest/test-ingest-s3-dest.sh b/test_unstructured_ingest/test-ingest-s3-dest.sh index 1d1db411eb..3160a0d917 100755 --- a/test_unstructured_ingest/test-ingest-s3-dest.sh +++ b/test_unstructured_ingest/test-ingest-s3-dest.sh @@ -44,7 +44,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ # Simply check the number of files uploaded expected_num_files=1 -num_files_in_s3=$(aws s3 ls "${DESTINATION_S3}example-docs/" --region us-east-2 | grep "\.json$" | wc -l) +num_files_in_s3=$(aws s3 ls "${DESTINATION_S3}example-docs/" --region us-east-2 | grep -c "\.json$") if [ "$num_files_in_s3" -ne "$expected_num_files" ]; then echo "Expected $expected_num_files files to be uploaded to s3, but found $num_files_in_s3 files." 
   exit 1

From 463f4c3ea12b6f141d3b067c74a86b62b1309aae Mon Sep 17 00:00:00 2001
From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Date: Mon, 30 Oct 2023 08:53:00 -0700
Subject: [PATCH 38/38] expand fsspec downstream connectors <- Ingest test fixtures update (#1938)

This pull request includes updated ingest test fixtures.
Please review and merge if appropriate.

Co-authored-by: rbiseck3
---
 .../dropbox/{test-input => }/handbook-1p.docx.json            | 0
 .../dropbox/{test-input => }/nested-1/ideas-page.html.json    | 0
 .../dropbox/{test-input => }/nested-2/ideas-page.html.json    | 0
 .../dropbox/{test-input => }/science-exploration-1p.pptx.json | 0
 .../{ => example-docs}/fake-html-cp1252.html.json             | 0
 .../language-docs}/UDHR_first_article_all.txt.json            | 0
 test_unstructured_ingest/metrics/aggregate-scores-cct.tsv     | 4 ++--
 test_unstructured_ingest/metrics/all-docs-cct.tsv             | 2 +-
 8 files changed, 3 insertions(+), 3 deletions(-)
 rename test_unstructured_ingest/expected-structured-output/dropbox/{test-input => }/handbook-1p.docx.json (100%)
 rename test_unstructured_ingest/expected-structured-output/dropbox/{test-input => }/nested-1/ideas-page.html.json (100%)
 rename test_unstructured_ingest/expected-structured-output/dropbox/{test-input => }/nested-2/ideas-page.html.json (100%)
 rename test_unstructured_ingest/expected-structured-output/dropbox/{test-input => }/science-exploration-1p.pptx.json (100%)
 rename test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/{ => example-docs}/fake-html-cp1252.html.json (100%)
 rename test_unstructured_ingest/expected-structured-output/local-single-file/{ => example-docs/language-docs}/UDHR_first_article_all.txt.json (100%)

diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/test-input/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json
similarity index 100%
rename from test_unstructured_ingest/expected-structured-output/dropbox/test-input/handbook-1p.docx.json
rename to test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json
diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/test-input/nested-1/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json
similarity index 100%
rename from test_unstructured_ingest/expected-structured-output/dropbox/test-input/nested-1/ideas-page.html.json
rename to test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json
diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/test-input/nested-2/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json
similarity index 100%
rename from test_unstructured_ingest/expected-structured-output/dropbox/test-input/nested-2/ideas-page.html.json
rename to test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json
diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/test-input/science-exploration-1p.pptx.json b/test_unstructured_ingest/expected-structured-output/dropbox/science-exploration-1p.pptx.json
similarity index 100%
rename from test_unstructured_ingest/expected-structured-output/dropbox/test-input/science-exploration-1p.pptx.json
rename to test_unstructured_ingest/expected-structured-output/dropbox/science-exploration-1p.pptx.json
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/example-docs/fake-html-cp1252.html.json
similarity index 100%
rename from test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json
rename to test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/example-docs/fake-html-cp1252.html.json
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json b/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json
similarity index 100%
rename from test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json
rename to test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/language-docs/UDHR_first_article_all.txt.json
diff --git a/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv b/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv
index 2553a0f90e..0054e9a7e4 100644
--- a/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv
+++ b/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv
@@ -1,3 +1,3 @@
 strategy       average  sample_sd  population_sd  count
-cct-accuracy   0.777    0.088      0.072          3
-cct-%missing   0.087    0.045      0.037          3
+cct-accuracy   0.798    0.083      0.072          4
+cct-%missing   0.087    0.037      0.032          4
diff --git a/test_unstructured_ingest/metrics/all-docs-cct.tsv b/test_unstructured_ingest/metrics/all-docs-cct.tsv
index f9be87f7ab..15a1d1d045 100644
--- a/test_unstructured_ingest/metrics/all-docs-cct.tsv
+++ b/test_unstructured_ingest/metrics/all-docs-cct.tsv
@@ -1,4 +1,4 @@
 filename                     connector  cct-accuracy  cct-%missing
 science-exploration-1p.pptx  box        0.861         0.09
 example-10k.html             local      0.686         0.04
-IRS-form-1987.pdf            azure      0.783         0.13
+IRS-form-1987.pdf            azure      0.783         0.13
\ No newline at end of file
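
For reference, a consolidated sketch of the CI login configuration that this series converges on is shown below. It is an illustrative summary assembled from the diffs above rather than an additional patch: top-level OIDC permissions, checkout before the auth actions, federated Google and Azure logins, and the smoke-test commands from the test_logins job. Secret names, action versions, and the `ci` environment are taken from the preceding patches; the job shown is a simplified stand-in, not the full test_ingest matrix.

# Illustrative consolidation of the auth steps introduced in PATCH 10 through PATCH 35.
permissions:
  id-token: write   # required for OIDC-federated gcloud auth and az login
  contents: read

jobs:
  test_logins:
    runs-on: ubuntu-latest
    environment: ci   # azure/login federation is scoped to this environment
    steps:
      # actions/checkout MUST come before auth
      - uses: actions/checkout@v4
      - name: Google Cloud Auth
        uses: google-github-actions/auth@v1
        with:
          workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }}
      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v1
      - name: Run gcloud smoke test
        run: gcloud projects list
      - name: Az CLI login
        uses: azure/login@v1
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
      - name: Run azure smoke test
        run: az account show

Ordering matters here: azure/login depends on the job-level `environment: ci` and the workflow-level `id-token: write` permission for federated credentials, which is what the "move permissions to top level" and "set environment for azure federated login" patches add.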