From 0f2a5318e2879821fb0ae83965bf7c7784e374cf Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Mon, 1 Jul 2024 09:50:39 -0400 Subject: [PATCH 1/3] Fix dependencies and make entrire registried accessible from import connectors package --- unstructured/ingest/v2/cli/base/dest.py | 2 +- unstructured/ingest/v2/cli/base/src.py | 2 +- unstructured/ingest/v2/cli/cmds/chroma.py | 2 +- .../ingest/v2/cli/configs/partition.py | 2 +- unstructured/ingest/v2/cli/utils.py | 59 +++++++++++++++++++ .../v2/processes/connectors/__init__.py | 45 ++++++++++++++ .../ingest/v2/processes/connectors/astra.py | 16 ++--- .../ingest/v2/processes/connectors/chroma.py | 16 ++--- .../v2/processes/connectors/elasticsearch.py | 32 ++++------ .../processes/connectors/fsspec/__init__.py | 36 +++++++++++ .../v2/processes/connectors/fsspec/azure.py | 28 ++++----- .../v2/processes/connectors/fsspec/box.py | 28 ++++----- .../v2/processes/connectors/fsspec/dropbox.py | 28 ++++----- .../v2/processes/connectors/fsspec/gcs.py | 28 ++++----- .../v2/processes/connectors/fsspec/s3.py | 28 ++++----- .../v2/processes/connectors/fsspec/sftp.py | 28 ++++----- .../v2/processes/connectors/google_drive.py | 16 ++--- .../ingest/v2/processes/connectors/local.py | 22 +++---- .../v2/processes/connectors/onedrive.py | 16 ++--- .../v2/processes/connectors/opensearch.py | 34 +++++------ .../v2/processes/connectors/weaviate.py | 16 ++--- 21 files changed, 268 insertions(+), 216 deletions(-) diff --git a/unstructured/ingest/v2/cli/base/dest.py b/unstructured/ingest/v2/cli/base/dest.py index 1b27a2b4c7..3b8b8a5caf 100644 --- a/unstructured/ingest/v2/cli/base/dest.py +++ b/unstructured/ingest/v2/cli/base/dest.py @@ -4,9 +4,9 @@ import click -from unstructured.ingest.cli.utils import conform_click_options from unstructured.ingest.v2.cli.base.cmd import BaseCmd from unstructured.ingest.v2.cli.interfaces import CliConfig +from unstructured.ingest.v2.cli.utils import conform_click_options from unstructured.ingest.v2.logger import logger diff --git a/unstructured/ingest/v2/cli/base/src.py b/unstructured/ingest/v2/cli/base/src.py index 0674f187ff..f35d5b8e69 100644 --- a/unstructured/ingest/v2/cli/base/src.py +++ b/unstructured/ingest/v2/cli/base/src.py @@ -4,7 +4,6 @@ import click -from unstructured.ingest.cli.utils import Group, conform_click_options from unstructured.ingest.v2.cli.base.cmd import BaseCmd from unstructured.ingest.v2.cli.configs import ( ChunkerCliConfig, @@ -13,6 +12,7 @@ ProcessorCliConfig, ) from unstructured.ingest.v2.cli.interfaces import CliConfig +from unstructured.ingest.v2.cli.utils import Group, conform_click_options from unstructured.ingest.v2.logger import logger diff --git a/unstructured/ingest/v2/cli/cmds/chroma.py b/unstructured/ingest/v2/cli/cmds/chroma.py index f661ec56a3..c138163515 100644 --- a/unstructured/ingest/v2/cli/cmds/chroma.py +++ b/unstructured/ingest/v2/cli/cmds/chroma.py @@ -2,9 +2,9 @@ import click -from unstructured.ingest.cli.interfaces import Dict from unstructured.ingest.v2.cli.base import DestCmd from unstructured.ingest.v2.cli.interfaces import CliConfig +from unstructured.ingest.v2.cli.utils import Dict from unstructured.ingest.v2.processes.connectors.chroma import CONNECTOR_TYPE diff --git a/unstructured/ingest/v2/cli/configs/partition.py b/unstructured/ingest/v2/cli/configs/partition.py index ae21206357..5ec5c0dbe5 100644 --- a/unstructured/ingest/v2/cli/configs/partition.py +++ b/unstructured/ingest/v2/cli/configs/partition.py @@ -2,8 +2,8 @@ import click -from unstructured.ingest.cli.interfaces import DelimitedString, Dict from unstructured.ingest.v2.cli.interfaces import CliConfig +from unstructured.ingest.v2.cli.utils import DelimitedString, Dict @dataclass diff --git a/unstructured/ingest/v2/cli/utils.py b/unstructured/ingest/v2/cli/utils.py index 15b9f14202..a061d24854 100644 --- a/unstructured/ingest/v2/cli/utils.py +++ b/unstructured/ingest/v2/cli/utils.py @@ -3,6 +3,7 @@ import sys from dataclasses import fields, is_dataclass from gettext import gettext, ngettext +from gettext import gettext as _ from pathlib import Path from typing import Any, ForwardRef, Optional, Type, TypeVar, Union, get_args, get_origin @@ -12,6 +13,13 @@ from unstructured.ingest.v2.logger import logger +def conform_click_options(options: dict): + # Click sets all multiple fields as tuple, this needs to be updated to list + for k, v in options.items(): + if isinstance(v, tuple): + options[k] = list(v) + + class Dict(click.ParamType): name = "dict" @@ -179,3 +187,54 @@ def is_subclass(instance, class_type) -> bool: adjusted_dict = conform_dict(inner_d=flat_data, inner_config=config) return config.from_dict(adjusted_dict, apply_name_overload=False) + + +class Group(click.Group): + def parse_args(self, ctx, args): + """ + This allows for subcommands to be called with the --help flag without breaking + if parent command is missing any of its required parameters + """ + + try: + return super().parse_args(ctx, args) + except click.MissingParameter: + if "--help" not in args: + raise + + # remove the required params so that help can display + for param in self.params: + param.required = False + return super().parse_args(ctx, args) + + def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None: + """ + Copy of the original click.Group format_commands() method but replacing + 'Commands' -> 'Destinations' + """ + commands = [] + for subcommand in self.list_commands(ctx): + cmd = self.get_command(ctx, subcommand) + # What is this, the tool lied about a command. Ignore it + if cmd is None: + continue + if cmd.hidden: + continue + + commands.append((subcommand, cmd)) + + # allow for 3 times the default spacing + if len(commands): + if formatter.width: + limit = formatter.width - 6 - max(len(cmd[0]) for cmd in commands) + else: + limit = -6 - max(len(cmd[0]) for cmd in commands) + + rows = [] + for subcommand, cmd in commands: + help = cmd.get_short_help_str(limit) + rows.append((subcommand, help)) + + if rows: + with formatter.section(_("Destinations")): + formatter.write_dl(rows) diff --git a/unstructured/ingest/v2/processes/connectors/__init__.py b/unstructured/ingest/v2/processes/connectors/__init__.py index 9d48db4f9f..43683ce9d9 100644 --- a/unstructured/ingest/v2/processes/connectors/__init__.py +++ b/unstructured/ingest/v2/processes/connectors/__init__.py @@ -1 +1,46 @@ from __future__ import annotations + +from unstructured.ingest.v2.processes.connector_registry import ( + add_destination_entry, + add_source_entry, +) + +from .astra import CONNECTOR_TYPE as ASTRA_CONNECTOR_TYPE +from .astra import astra_destination_entry +from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE +from .chroma import chroma_destination_entry +from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE +from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry +from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE +from .google_drive import google_drive_source_entry +from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE +from .local import local_destination_entry, local_source_entry +from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE +from .onedrive import onedrive_source_entry +from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE +from .opensearch import opensearch_destination_entry, opensearch_source_entry +from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE +from .weaviate import weaviate_destination_entry + +add_destination_entry(destination_type=ASTRA_CONNECTOR_TYPE, entry=astra_destination_entry) + +add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry) + +add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry) +add_destination_entry( + destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry +) + +add_source_entry(source_type=GOOGLE_DRIVE_CONNECTOR_TYPE, entry=google_drive_source_entry) + +add_source_entry(source_type=LOCAL_CONNECTOR_TYPE, entry=local_source_entry) +add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destination_entry) + +add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry) + +add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry) +add_destination_entry( + destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry +) + +add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry) diff --git a/unstructured/ingest/v2/processes/connectors/astra.py b/unstructured/ingest/v2/processes/connectors/astra.py index 33207207d9..e6ae5c5777 100644 --- a/unstructured/ingest/v2/processes/connectors/astra.py +++ b/unstructured/ingest/v2/processes/connectors/astra.py @@ -20,7 +20,6 @@ from unstructured.ingest.v2.logger import logger from unstructured.ingest.v2.processes.connector_registry import ( DestinationRegistryEntry, - add_destination_entry, ) from unstructured.utils import requires_dependencies @@ -143,13 +142,10 @@ def run(self, contents: list[UploadContent], **kwargs: Any) -> None: collection.insert_many(chunk) -add_destination_entry( - destination_type=CONNECTOR_TYPE, - entry=DestinationRegistryEntry( - connection_config=AstraConnectionConfig, - upload_stager_config=AstraUploadStagerConfig, - upload_stager=AstraUploadStager, - uploader_config=AstraUploaderConfig, - uploader=AstraUploader, - ), +astra_destination_entry = DestinationRegistryEntry( + connection_config=AstraConnectionConfig, + upload_stager_config=AstraUploadStagerConfig, + upload_stager=AstraUploadStager, + uploader_config=AstraUploaderConfig, + uploader=AstraUploader, ) diff --git a/unstructured/ingest/v2/processes/connectors/chroma.py b/unstructured/ingest/v2/processes/connectors/chroma.py index 3cd493dd0b..d4891b023b 100644 --- a/unstructured/ingest/v2/processes/connectors/chroma.py +++ b/unstructured/ingest/v2/processes/connectors/chroma.py @@ -23,7 +23,6 @@ from unstructured.ingest.v2.logger import logger from unstructured.ingest.v2.processes.connector_registry import ( DestinationRegistryEntry, - add_destination_entry, ) from unstructured.staging.base import flatten_dict from unstructured.utils import requires_dependencies @@ -197,13 +196,10 @@ def run(self, contents: list[UploadContent], **kwargs: Any) -> None: self.upsert_batch(collection, self.prepare_chroma_list(chunk)) -add_destination_entry( - destination_type=CONNECTOR_TYPE, - entry=DestinationRegistryEntry( - connection_config=ChromaConnectionConfig, - uploader=ChromaUploader, - uploader_config=ChromaUploaderConfig, - upload_stager=ChromaUploadStager, - upload_stager_config=ChromaUploadStagerConfig, - ), +chroma_destination_entry = DestinationRegistryEntry( + connection_config=ChromaConnectionConfig, + uploader=ChromaUploader, + uploader_config=ChromaUploaderConfig, + upload_stager=ChromaUploadStager, + upload_stager_config=ChromaUploadStagerConfig, ) diff --git a/unstructured/ingest/v2/processes/connectors/elasticsearch.py b/unstructured/ingest/v2/processes/connectors/elasticsearch.py index 4f34b938cb..4a45bae1b9 100644 --- a/unstructured/ingest/v2/processes/connectors/elasticsearch.py +++ b/unstructured/ingest/v2/processes/connectors/elasticsearch.py @@ -31,8 +31,6 @@ from unstructured.ingest.v2.processes.connector_registry import ( DestinationRegistryEntry, SourceRegistryEntry, - add_destination_entry, - add_source_entry, ) from unstructured.staging.base import flatten_dict from unstructured.utils import requires_dependencies @@ -386,24 +384,18 @@ def run(self, contents: list[UploadContent], **kwargs: Any) -> None: ) -add_source_entry( - source_type=CONNECTOR_TYPE, - entry=SourceRegistryEntry( - connection_config=ElasticsearchConnectionConfig, - indexer=ElasticsearchIndexer, - indexer_config=ElasticsearchIndexerConfig, - downloader=ElasticsearchDownloader, - downloader_config=ElasticsearchDownloaderConfig, - ), +elasticsearch_source_entry = SourceRegistryEntry( + connection_config=ElasticsearchConnectionConfig, + indexer=ElasticsearchIndexer, + indexer_config=ElasticsearchIndexerConfig, + downloader=ElasticsearchDownloader, + downloader_config=ElasticsearchDownloaderConfig, ) -add_destination_entry( - destination_type=CONNECTOR_TYPE, - entry=DestinationRegistryEntry( - connection_config=ElasticsearchConnectionConfig, - upload_stager_config=ElasticsearchUploadStagerConfig, - upload_stager=ElasticsearchUploadStager, - uploader_config=ElasticsearchUploaderConfig, - uploader=ElasticsearchUploader, - ), +elasticsearch_destination_entry = DestinationRegistryEntry( + connection_config=ElasticsearchConnectionConfig, + upload_stager_config=ElasticsearchUploadStagerConfig, + upload_stager=ElasticsearchUploadStager, + uploader_config=ElasticsearchUploaderConfig, + uploader=ElasticsearchUploader, ) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/__init__.py b/unstructured/ingest/v2/processes/connectors/fsspec/__init__.py index 9d48db4f9f..eacc0df966 100644 --- a/unstructured/ingest/v2/processes/connectors/fsspec/__init__.py +++ b/unstructured/ingest/v2/processes/connectors/fsspec/__init__.py @@ -1 +1,37 @@ from __future__ import annotations + +from unstructured.ingest.v2.processes.connector_registry import ( + add_destination_entry, + add_source_entry, +) + +from .azure import CONNECTOR_TYPE as AZURE_CONNECTOR_TYPE +from .azure import azure_destination_entry, azure_source_entry +from .box import CONNECTOR_TYPE as BOX_CONNECTOR_TYPE +from .box import box_destination_entry, box_source_entry +from .dropbox import CONNECTOR_TYPE as DROPBOX_CONNECTOR_TYPE +from .dropbox import dropbox_destination_entry, dropbox_source_entry +from .gcs import CONNECTOR_TYPE as GCS_CONNECTOR_TYPE +from .gcs import gcs_destination_entry, gcs_source_entry +from .s3 import CONNECTOR_TYPE as S3_CONNECTOR_TYPE +from .s3 import s3_destination_entry, s3_source_entry +from .sftp import CONNECTOR_TYPE as SFTP_CONNECTOR_TYPE +from .sftp import sftp_destination_entry, sftp_source_entry + +add_source_entry(source_type=AZURE_CONNECTOR_TYPE, entry=azure_source_entry) +add_destination_entry(destination_type=AZURE_CONNECTOR_TYPE, entry=azure_destination_entry) + +add_source_entry(source_type=BOX_CONNECTOR_TYPE, entry=box_source_entry) +add_destination_entry(destination_type=BOX_CONNECTOR_TYPE, entry=box_destination_entry) + +add_source_entry(source_type=DROPBOX_CONNECTOR_TYPE, entry=dropbox_source_entry) +add_destination_entry(destination_type=DROPBOX_CONNECTOR_TYPE, entry=dropbox_destination_entry) + +add_source_entry(source_type=GCS_CONNECTOR_TYPE, entry=gcs_source_entry) +add_destination_entry(destination_type=GCS_CONNECTOR_TYPE, entry=gcs_destination_entry) + +add_source_entry(source_type=S3_CONNECTOR_TYPE, entry=s3_source_entry) +add_destination_entry(destination_type=S3_CONNECTOR_TYPE, entry=s3_destination_entry) + +add_source_entry(source_type=SFTP_CONNECTOR_TYPE, entry=sftp_source_entry) +add_destination_entry(destination_type=SFTP_CONNECTOR_TYPE, entry=sftp_destination_entry) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/azure.py b/unstructured/ingest/v2/processes/connectors/fsspec/azure.py index 4ff1d2f9ab..8dd7566008 100644 --- a/unstructured/ingest/v2/processes/connectors/fsspec/azure.py +++ b/unstructured/ingest/v2/processes/connectors/fsspec/azure.py @@ -9,8 +9,6 @@ from unstructured.ingest.v2.processes.connector_registry import ( DestinationRegistryEntry, SourceRegistryEntry, - add_destination_entry, - add_source_entry, ) from unstructured.ingest.v2.processes.connectors.fsspec.fsspec import ( FsspecAccessConfig, @@ -131,22 +129,16 @@ async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> Non return await super().run_async(path=path, file_data=file_data, **kwargs) -add_source_entry( - source_type=CONNECTOR_TYPE, - entry=SourceRegistryEntry( - indexer=AzureIndexer, - indexer_config=AzureIndexerConfig, - downloader=AzureDownloader, - downloader_config=AzureDownloaderConfig, - connection_config=AzureConnectionConfig, - ), +azure_source_entry = SourceRegistryEntry( + indexer=AzureIndexer, + indexer_config=AzureIndexerConfig, + downloader=AzureDownloader, + downloader_config=AzureDownloaderConfig, + connection_config=AzureConnectionConfig, ) -add_destination_entry( - destination_type=CONNECTOR_TYPE, - entry=DestinationRegistryEntry( - uploader=AzureUploader, - uploader_config=AzureUploaderConfig, - connection_config=AzureConnectionConfig, - ), +azure_destination_entry = DestinationRegistryEntry( + uploader=AzureUploader, + uploader_config=AzureUploaderConfig, + connection_config=AzureConnectionConfig, ) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/box.py b/unstructured/ingest/v2/processes/connectors/fsspec/box.py index ce2907c15e..77d60c79e1 100644 --- a/unstructured/ingest/v2/processes/connectors/fsspec/box.py +++ b/unstructured/ingest/v2/processes/connectors/fsspec/box.py @@ -9,8 +9,6 @@ from unstructured.ingest.v2.processes.connector_registry import ( DestinationRegistryEntry, SourceRegistryEntry, - add_destination_entry, - add_source_entry, ) from unstructured.ingest.v2.processes.connectors.fsspec.fsspec import ( FsspecAccessConfig, @@ -118,22 +116,16 @@ async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> Non return await super().run_async(path=path, file_data=file_data, **kwargs) -add_source_entry( - source_type=CONNECTOR_TYPE, - entry=SourceRegistryEntry( - indexer=BoxIndexer, - indexer_config=BoxIndexerConfig, - downloader=BoxDownloader, - downloader_config=BoxDownloaderConfig, - connection_config=BoxConnectionConfig, - ), +box_source_entry = SourceRegistryEntry( + indexer=BoxIndexer, + indexer_config=BoxIndexerConfig, + downloader=BoxDownloader, + downloader_config=BoxDownloaderConfig, + connection_config=BoxConnectionConfig, ) -add_destination_entry( - destination_type=CONNECTOR_TYPE, - entry=DestinationRegistryEntry( - uploader=BoxUploader, - uploader_config=BoxUploaderConfig, - connection_config=BoxConnectionConfig, - ), +box_destination_entry = DestinationRegistryEntry( + uploader=BoxUploader, + uploader_config=BoxUploaderConfig, + connection_config=BoxConnectionConfig, ) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/dropbox.py b/unstructured/ingest/v2/processes/connectors/fsspec/dropbox.py index 78fd6ec54b..96dc3ba712 100644 --- a/unstructured/ingest/v2/processes/connectors/fsspec/dropbox.py +++ b/unstructured/ingest/v2/processes/connectors/fsspec/dropbox.py @@ -9,8 +9,6 @@ from unstructured.ingest.v2.processes.connector_registry import ( DestinationRegistryEntry, SourceRegistryEntry, - add_destination_entry, - add_source_entry, ) from unstructured.ingest.v2.processes.connectors.fsspec.fsspec import ( FsspecAccessConfig, @@ -117,22 +115,16 @@ async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> Non return await super().run_async(path=path, file_data=file_data, **kwargs) -add_source_entry( - source_type=CONNECTOR_TYPE, - entry=SourceRegistryEntry( - indexer=DropboxIndexer, - indexer_config=DropboxIndexerConfig, - downloader=DropboxDownloader, - downloader_config=DropboxDownloaderConfig, - connection_config=DropboxConnectionConfig, - ), +dropbox_source_entry = SourceRegistryEntry( + indexer=DropboxIndexer, + indexer_config=DropboxIndexerConfig, + downloader=DropboxDownloader, + downloader_config=DropboxDownloaderConfig, + connection_config=DropboxConnectionConfig, ) -add_destination_entry( - destination_type=CONNECTOR_TYPE, - entry=DestinationRegistryEntry( - uploader=DropboxUploader, - uploader_config=DropboxUploaderConfig, - connection_config=DropboxConnectionConfig, - ), +dropbox_destination_entry = DestinationRegistryEntry( + uploader=DropboxUploader, + uploader_config=DropboxUploaderConfig, + connection_config=DropboxConnectionConfig, ) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/gcs.py b/unstructured/ingest/v2/processes/connectors/fsspec/gcs.py index e3bebbae47..2c51f1c121 100644 --- a/unstructured/ingest/v2/processes/connectors/fsspec/gcs.py +++ b/unstructured/ingest/v2/processes/connectors/fsspec/gcs.py @@ -10,8 +10,6 @@ from unstructured.ingest.v2.processes.connector_registry import ( DestinationRegistryEntry, SourceRegistryEntry, - add_destination_entry, - add_source_entry, ) from unstructured.ingest.v2.processes.connectors.fsspec.fsspec import ( FsspecAccessConfig, @@ -128,22 +126,16 @@ async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> Non return await super().run_async(path=path, file_data=file_data, **kwargs) -add_source_entry( - source_type=CONNECTOR_TYPE, - entry=SourceRegistryEntry( - indexer=GcsIndexer, - indexer_config=GcsIndexerConfig, - downloader=GcsDownloader, - downloader_config=GcsDownloaderConfig, - connection_config=GcsConnectionConfig, - ), +gcs_source_entry = SourceRegistryEntry( + indexer=GcsIndexer, + indexer_config=GcsIndexerConfig, + downloader=GcsDownloader, + downloader_config=GcsDownloaderConfig, + connection_config=GcsConnectionConfig, ) -add_destination_entry( - destination_type=CONNECTOR_TYPE, - entry=DestinationRegistryEntry( - uploader=GcsUploader, - uploader_config=GcsUploaderConfig, - connection_config=GcsConnectionConfig, - ), +gcs_destination_entry = DestinationRegistryEntry( + uploader=GcsUploader, + uploader_config=GcsUploaderConfig, + connection_config=GcsConnectionConfig, ) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/s3.py b/unstructured/ingest/v2/processes/connectors/fsspec/s3.py index ceb99e3c93..dd8b24409f 100644 --- a/unstructured/ingest/v2/processes/connectors/fsspec/s3.py +++ b/unstructured/ingest/v2/processes/connectors/fsspec/s3.py @@ -10,8 +10,6 @@ from unstructured.ingest.v2.processes.connector_registry import ( DestinationRegistryEntry, SourceRegistryEntry, - add_destination_entry, - add_source_entry, ) from unstructured.ingest.v2.processes.connectors.fsspec.fsspec import ( FsspecAccessConfig, @@ -143,22 +141,16 @@ async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> Non return await super().run_async(path=path, file_data=file_data, **kwargs) -add_source_entry( - source_type=CONNECTOR_TYPE, - entry=SourceRegistryEntry( - indexer=S3Indexer, - indexer_config=S3IndexerConfig, - downloader=S3Downloader, - downloader_config=S3DownloaderConfig, - connection_config=S3ConnectionConfig, - ), +s3_source_entry = SourceRegistryEntry( + indexer=S3Indexer, + indexer_config=S3IndexerConfig, + downloader=S3Downloader, + downloader_config=S3DownloaderConfig, + connection_config=S3ConnectionConfig, ) -add_destination_entry( - destination_type=CONNECTOR_TYPE, - entry=DestinationRegistryEntry( - uploader=S3Uploader, - uploader_config=S3UploaderConfig, - connection_config=S3ConnectionConfig, - ), +s3_destination_entry = DestinationRegistryEntry( + uploader=S3Uploader, + uploader_config=S3UploaderConfig, + connection_config=S3ConnectionConfig, ) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/sftp.py b/unstructured/ingest/v2/processes/connectors/fsspec/sftp.py index d4403dd344..d73a22195f 100644 --- a/unstructured/ingest/v2/processes/connectors/fsspec/sftp.py +++ b/unstructured/ingest/v2/processes/connectors/fsspec/sftp.py @@ -11,8 +11,6 @@ from unstructured.ingest.v2.processes.connector_registry import ( DestinationRegistryEntry, SourceRegistryEntry, - add_destination_entry, - add_source_entry, ) from unstructured.ingest.v2.processes.connectors.fsspec.fsspec import ( FsspecAccessConfig, @@ -153,22 +151,16 @@ async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> Non return await super().run_async(path=path, file_data=file_data, **kwargs) -add_source_entry( - source_type=CONNECTOR_TYPE, - entry=SourceRegistryEntry( - indexer=SftpIndexer, - indexer_config=SftpIndexerConfig, - downloader=SftpDownloader, - downloader_config=SftpDownloaderConfig, - connection_config=SftpConnectionConfig, - ), +sftp_source_entry = SourceRegistryEntry( + indexer=SftpIndexer, + indexer_config=SftpIndexerConfig, + downloader=SftpDownloader, + downloader_config=SftpDownloaderConfig, + connection_config=SftpConnectionConfig, ) -add_destination_entry( - destination_type=CONNECTOR_TYPE, - entry=DestinationRegistryEntry( - uploader=SftpUploader, - uploader_config=SftpUploaderConfig, - connection_config=SftpConnectionConfig, - ), +sftp_destination_entry = DestinationRegistryEntry( + uploader=SftpUploader, + uploader_config=SftpUploaderConfig, + connection_config=SftpConnectionConfig, ) diff --git a/unstructured/ingest/v2/processes/connectors/google_drive.py b/unstructured/ingest/v2/processes/connectors/google_drive.py index e07d20f1eb..817b7ffece 100644 --- a/unstructured/ingest/v2/processes/connectors/google_drive.py +++ b/unstructured/ingest/v2/processes/connectors/google_drive.py @@ -26,7 +26,6 @@ from unstructured.ingest.v2.logger import logger from unstructured.ingest.v2.processes.connector_registry import ( SourceRegistryEntry, - add_source_entry, ) from unstructured.utils import requires_dependencies @@ -345,13 +344,10 @@ def run(self, file_data: FileData, **kwargs: Any) -> download_responses: return self._write_file(file_data=file_data, file_contents=file_contents) -add_source_entry( - source_type=CONNECTOR_TYPE, - entry=SourceRegistryEntry( - connection_config=GoogleDriveConnectionConfig, - indexer_config=GoogleDriveIndexerConfig, - indexer=GoogleDriveIndexer, - downloader_config=GoogleDriveDownloaderConfig, - downloader=GoogleDriveDownloader, - ), +google_drive_source_entry = SourceRegistryEntry( + connection_config=GoogleDriveConnectionConfig, + indexer_config=GoogleDriveIndexerConfig, + indexer=GoogleDriveIndexer, + downloader_config=GoogleDriveDownloaderConfig, + downloader=GoogleDriveDownloader, ) diff --git a/unstructured/ingest/v2/processes/connectors/local.py b/unstructured/ingest/v2/processes/connectors/local.py index 7715fc1f84..811606d79c 100644 --- a/unstructured/ingest/v2/processes/connectors/local.py +++ b/unstructured/ingest/v2/processes/connectors/local.py @@ -25,8 +25,6 @@ from unstructured.ingest.v2.processes.connector_registry import ( DestinationRegistryEntry, SourceRegistryEntry, - add_destination_entry, - add_source_entry, ) CONNECTOR_TYPE = "local" @@ -192,18 +190,14 @@ def run(self, contents: list[UploadContent], **kwargs: Any) -> None: shutil.copy(src=str(content.path), dst=str(final_path)) -add_source_entry( - source_type=CONNECTOR_TYPE, - entry=SourceRegistryEntry( - indexer=LocalIndexer, - indexer_config=LocalIndexerConfig, - downloader=LocalDownloader, - downloader_config=LocalDownloaderConfig, - connection_config=LocalConnectionConfig, - ), +local_source_entry = SourceRegistryEntry( + indexer=LocalIndexer, + indexer_config=LocalIndexerConfig, + downloader=LocalDownloader, + downloader_config=LocalDownloaderConfig, + connection_config=LocalConnectionConfig, ) -add_destination_entry( - destination_type=CONNECTOR_TYPE, - entry=DestinationRegistryEntry(uploader=LocalUploader, uploader_config=LocalUploaderConfig), +local_destination_entry = DestinationRegistryEntry( + uploader=LocalUploader, uploader_config=LocalUploaderConfig ) diff --git a/unstructured/ingest/v2/processes/connectors/onedrive.py b/unstructured/ingest/v2/processes/connectors/onedrive.py index 43ae6a98c3..348b7467bf 100644 --- a/unstructured/ingest/v2/processes/connectors/onedrive.py +++ b/unstructured/ingest/v2/processes/connectors/onedrive.py @@ -25,7 +25,6 @@ from unstructured.ingest.v2.logger import logger from unstructured.ingest.v2.processes.connector_registry import ( SourceRegistryEntry, - add_source_entry, ) from unstructured.utils import requires_dependencies @@ -231,13 +230,10 @@ def run(self, file_data: FileData, **kwargs: Any) -> download_responses: return DownloadResponse(file_data=file_data, path=download_path) -add_source_entry( - source_type=CONNECTOR_TYPE, - entry=SourceRegistryEntry( - connection_config=OnedriveConnectionConfig, - indexer_config=OnedriveIndexerConfig, - indexer=OnedriveIndexer, - downloader_config=OnedriveDownloaderConfig, - downloader=OnedriveDownloader, - ), +onedrive_source_entry = SourceRegistryEntry( + connection_config=OnedriveConnectionConfig, + indexer_config=OnedriveIndexerConfig, + indexer=OnedriveIndexer, + downloader_config=OnedriveDownloaderConfig, + downloader=OnedriveDownloader, ) diff --git a/unstructured/ingest/v2/processes/connectors/opensearch.py b/unstructured/ingest/v2/processes/connectors/opensearch.py index 34cc96f550..0933cd1fad 100644 --- a/unstructured/ingest/v2/processes/connectors/opensearch.py +++ b/unstructured/ingest/v2/processes/connectors/opensearch.py @@ -13,8 +13,6 @@ from unstructured.ingest.v2.processes.connector_registry import ( DestinationRegistryEntry, SourceRegistryEntry, - add_destination_entry, - add_source_entry, ) from unstructured.ingest.v2.processes.connectors.elasticsearch import ( ElasticsearchDownloader, @@ -139,23 +137,19 @@ def load_parallel_bulk(self): return parallel_bulk -add_source_entry( - source_type=CONNECTOR_TYPE, - entry=SourceRegistryEntry( - connection_config=OpenSearchConnectionConfig, - indexer=OpenSearchIndexer, - indexer_config=ElasticsearchIndexerConfig, - downloader=OpenSearchDownloader, - downloader_config=ElasticsearchDownloaderConfig, - ), +opensearch_source_entry = SourceRegistryEntry( + connection_config=OpenSearchConnectionConfig, + indexer=OpenSearchIndexer, + indexer_config=ElasticsearchIndexerConfig, + downloader=OpenSearchDownloader, + downloader_config=ElasticsearchDownloaderConfig, ) -add_destination_entry( - destination_type=CONNECTOR_TYPE, - entry=DestinationRegistryEntry( - connection_config=OpenSearchConnectionConfig, - upload_stager_config=ElasticsearchUploadStagerConfig, - upload_stager=ElasticsearchUploadStager, - uploader_config=ElasticsearchUploaderConfig, - uploader=OpenSearchUploader, - ), + + +opensearch_destination_entry = DestinationRegistryEntry( + connection_config=OpenSearchConnectionConfig, + upload_stager_config=ElasticsearchUploadStagerConfig, + upload_stager=ElasticsearchUploadStager, + uploader_config=ElasticsearchUploaderConfig, + uploader=OpenSearchUploader, ) diff --git a/unstructured/ingest/v2/processes/connectors/weaviate.py b/unstructured/ingest/v2/processes/connectors/weaviate.py index 8e8672b3aa..f89cedb3af 100644 --- a/unstructured/ingest/v2/processes/connectors/weaviate.py +++ b/unstructured/ingest/v2/processes/connectors/weaviate.py @@ -20,7 +20,6 @@ from unstructured.ingest.v2.logger import logger from unstructured.ingest.v2.processes.connector_registry import ( DestinationRegistryEntry, - add_destination_entry, ) if TYPE_CHECKING: @@ -224,13 +223,10 @@ def run(self, contents: list[UploadContent], **kwargs: Any) -> None: ) -add_destination_entry( - destination_type=CONNECTOR_TYPE, - entry=DestinationRegistryEntry( - connection_config=WeaviateConnectionConfig, - uploader=WeaviateUploader, - uploader_config=WeaviateUploaderConfig, - upload_stager=WeaviateUploadStager, - upload_stager_config=WeaviateUploadStagerConfig, - ), +weaviate_destination_entry = DestinationRegistryEntry( + connection_config=WeaviateConnectionConfig, + uploader=WeaviateUploader, + uploader_config=WeaviateUploaderConfig, + upload_stager=WeaviateUploadStager, + upload_stager_config=WeaviateUploadStagerConfig, ) From 3c64593d7792248ed51e107b8d8c7eb63b56c4d8 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Mon, 1 Jul 2024 10:06:08 -0400 Subject: [PATCH 2/3] Don't remove unused import to pull in fsspec connectors into registry --- unstructured/ingest/v2/processes/connectors/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unstructured/ingest/v2/processes/connectors/__init__.py b/unstructured/ingest/v2/processes/connectors/__init__.py index 43683ce9d9..a661953f13 100644 --- a/unstructured/ingest/v2/processes/connectors/__init__.py +++ b/unstructured/ingest/v2/processes/connectors/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +import unstructured.ingest.v2.processes.connectors.fsspec # noqa: F401 from unstructured.ingest.v2.processes.connector_registry import ( add_destination_entry, add_source_entry, From 85dc8f64375a378d7794423e478bf08bec35a63a Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Mon, 1 Jul 2024 15:02:20 -0400 Subject: [PATCH 3/3] bump changelog --- CHANGELOG.md | 2 +- unstructured/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1abbc38e3f..b3bd8e630a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.10-dev4 +## 0.14.10-dev5 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 70050cabeb..b952b69f8c 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.10-dev4" # pragma: no cover +__version__ = "0.14.10-dev5" # pragma: no cover