Skip to content

Commit

Permalink
bugfix/isolate ingest v2 dependencies (#3327)
Browse files Browse the repository at this point in the history
### Description
This PR handles two things:
* Exposing all the connectors via the connector registries by simply
importing the connector module. This should be safe assuming all
connector specific dependencies themselves are imported in the methods
where they are used and wrapped in `@requires_dependencies` decorator
* Remove any import that pulls from the v2 ingest.cli package
  • Loading branch information
rbiseck3 authored Jul 2, 2024
1 parent 5d89b41 commit c28deff
Show file tree
Hide file tree
Showing 23 changed files with 271 additions and 218 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.14.10-dev4
## 0.14.10-dev5

### Enhancements

Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.14.10-dev4" # pragma: no cover
__version__ = "0.14.10-dev5" # pragma: no cover
2 changes: 1 addition & 1 deletion unstructured/ingest/v2/cli/base/dest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

import click

from unstructured.ingest.cli.utils import conform_click_options
from unstructured.ingest.v2.cli.base.cmd import BaseCmd
from unstructured.ingest.v2.cli.interfaces import CliConfig
from unstructured.ingest.v2.cli.utils import conform_click_options
from unstructured.ingest.v2.logger import logger


Expand Down
2 changes: 1 addition & 1 deletion unstructured/ingest/v2/cli/base/src.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import click

from unstructured.ingest.cli.utils import Group, conform_click_options
from unstructured.ingest.v2.cli.base.cmd import BaseCmd
from unstructured.ingest.v2.cli.configs import (
ChunkerCliConfig,
Expand All @@ -13,6 +12,7 @@
ProcessorCliConfig,
)
from unstructured.ingest.v2.cli.interfaces import CliConfig
from unstructured.ingest.v2.cli.utils import Group, conform_click_options
from unstructured.ingest.v2.logger import logger


Expand Down
2 changes: 1 addition & 1 deletion unstructured/ingest/v2/cli/cmds/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

import click

from unstructured.ingest.cli.interfaces import Dict
from unstructured.ingest.v2.cli.base import DestCmd
from unstructured.ingest.v2.cli.interfaces import CliConfig
from unstructured.ingest.v2.cli.utils import Dict
from unstructured.ingest.v2.processes.connectors.chroma import CONNECTOR_TYPE


Expand Down
2 changes: 1 addition & 1 deletion unstructured/ingest/v2/cli/configs/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import click

from unstructured.ingest.cli.interfaces import DelimitedString, Dict
from unstructured.ingest.v2.cli.interfaces import CliConfig
from unstructured.ingest.v2.cli.utils import DelimitedString, Dict


@dataclass
Expand Down
59 changes: 59 additions & 0 deletions unstructured/ingest/v2/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
from dataclasses import fields, is_dataclass
from gettext import gettext, ngettext
from gettext import gettext as _
from pathlib import Path
from typing import Any, ForwardRef, Optional, Type, TypeVar, Union, get_args, get_origin

Expand All @@ -12,6 +13,13 @@
from unstructured.ingest.v2.logger import logger


def conform_click_options(options: dict):
# Click sets all multiple fields as tuple, this needs to be updated to list
for k, v in options.items():
if isinstance(v, tuple):
options[k] = list(v)


class Dict(click.ParamType):
name = "dict"

Expand Down Expand Up @@ -179,3 +187,54 @@ def is_subclass(instance, class_type) -> bool:

adjusted_dict = conform_dict(inner_d=flat_data, inner_config=config)
return config.from_dict(adjusted_dict, apply_name_overload=False)


class Group(click.Group):
def parse_args(self, ctx, args):
"""
This allows for subcommands to be called with the --help flag without breaking
if parent command is missing any of its required parameters
"""

try:
return super().parse_args(ctx, args)
except click.MissingParameter:
if "--help" not in args:
raise

# remove the required params so that help can display
for param in self.params:
param.required = False
return super().parse_args(ctx, args)

def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None:
"""
Copy of the original click.Group format_commands() method but replacing
'Commands' -> 'Destinations'
"""
commands = []
for subcommand in self.list_commands(ctx):
cmd = self.get_command(ctx, subcommand)
# What is this, the tool lied about a command. Ignore it
if cmd is None:
continue
if cmd.hidden:
continue

commands.append((subcommand, cmd))

# allow for 3 times the default spacing
if len(commands):
if formatter.width:
limit = formatter.width - 6 - max(len(cmd[0]) for cmd in commands)
else:
limit = -6 - max(len(cmd[0]) for cmd in commands)

rows = []
for subcommand, cmd in commands:
help = cmd.get_short_help_str(limit)
rows.append((subcommand, help))

if rows:
with formatter.section(_("Destinations")):
formatter.write_dl(rows)
46 changes: 46 additions & 0 deletions unstructured/ingest/v2/processes/connectors/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,47 @@
from __future__ import annotations

import unstructured.ingest.v2.processes.connectors.fsspec # noqa: F401
from unstructured.ingest.v2.processes.connector_registry import (
add_destination_entry,
add_source_entry,
)

from .astra import CONNECTOR_TYPE as ASTRA_CONNECTOR_TYPE
from .astra import astra_destination_entry
from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
from .chroma import chroma_destination_entry
from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
from .google_drive import google_drive_source_entry
from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
from .local import local_destination_entry, local_source_entry
from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
from .onedrive import onedrive_source_entry
from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
from .opensearch import opensearch_destination_entry, opensearch_source_entry
from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
from .weaviate import weaviate_destination_entry

add_destination_entry(destination_type=ASTRA_CONNECTOR_TYPE, entry=astra_destination_entry)

add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)

add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
add_destination_entry(
destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
)

add_source_entry(source_type=GOOGLE_DRIVE_CONNECTOR_TYPE, entry=google_drive_source_entry)

add_source_entry(source_type=LOCAL_CONNECTOR_TYPE, entry=local_source_entry)
add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destination_entry)

add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)

add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
add_destination_entry(
destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
)

add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
16 changes: 6 additions & 10 deletions unstructured/ingest/v2/processes/connectors/astra.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from unstructured.ingest.v2.logger import logger
from unstructured.ingest.v2.processes.connector_registry import (
DestinationRegistryEntry,
add_destination_entry,
)
from unstructured.utils import requires_dependencies

Expand Down Expand Up @@ -143,13 +142,10 @@ def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
collection.insert_many(chunk)


add_destination_entry(
destination_type=CONNECTOR_TYPE,
entry=DestinationRegistryEntry(
connection_config=AstraConnectionConfig,
upload_stager_config=AstraUploadStagerConfig,
upload_stager=AstraUploadStager,
uploader_config=AstraUploaderConfig,
uploader=AstraUploader,
),
astra_destination_entry = DestinationRegistryEntry(
connection_config=AstraConnectionConfig,
upload_stager_config=AstraUploadStagerConfig,
upload_stager=AstraUploadStager,
uploader_config=AstraUploaderConfig,
uploader=AstraUploader,
)
16 changes: 6 additions & 10 deletions unstructured/ingest/v2/processes/connectors/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
from unstructured.ingest.v2.logger import logger
from unstructured.ingest.v2.processes.connector_registry import (
DestinationRegistryEntry,
add_destination_entry,
)
from unstructured.staging.base import flatten_dict
from unstructured.utils import requires_dependencies
Expand Down Expand Up @@ -197,13 +196,10 @@ def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
self.upsert_batch(collection, self.prepare_chroma_list(chunk))


add_destination_entry(
destination_type=CONNECTOR_TYPE,
entry=DestinationRegistryEntry(
connection_config=ChromaConnectionConfig,
uploader=ChromaUploader,
uploader_config=ChromaUploaderConfig,
upload_stager=ChromaUploadStager,
upload_stager_config=ChromaUploadStagerConfig,
),
chroma_destination_entry = DestinationRegistryEntry(
connection_config=ChromaConnectionConfig,
uploader=ChromaUploader,
uploader_config=ChromaUploaderConfig,
upload_stager=ChromaUploadStager,
upload_stager_config=ChromaUploadStagerConfig,
)
32 changes: 12 additions & 20 deletions unstructured/ingest/v2/processes/connectors/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@
from unstructured.ingest.v2.processes.connector_registry import (
DestinationRegistryEntry,
SourceRegistryEntry,
add_destination_entry,
add_source_entry,
)
from unstructured.staging.base import flatten_dict
from unstructured.utils import requires_dependencies
Expand Down Expand Up @@ -386,24 +384,18 @@ def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
)


add_source_entry(
source_type=CONNECTOR_TYPE,
entry=SourceRegistryEntry(
connection_config=ElasticsearchConnectionConfig,
indexer=ElasticsearchIndexer,
indexer_config=ElasticsearchIndexerConfig,
downloader=ElasticsearchDownloader,
downloader_config=ElasticsearchDownloaderConfig,
),
elasticsearch_source_entry = SourceRegistryEntry(
connection_config=ElasticsearchConnectionConfig,
indexer=ElasticsearchIndexer,
indexer_config=ElasticsearchIndexerConfig,
downloader=ElasticsearchDownloader,
downloader_config=ElasticsearchDownloaderConfig,
)

add_destination_entry(
destination_type=CONNECTOR_TYPE,
entry=DestinationRegistryEntry(
connection_config=ElasticsearchConnectionConfig,
upload_stager_config=ElasticsearchUploadStagerConfig,
upload_stager=ElasticsearchUploadStager,
uploader_config=ElasticsearchUploaderConfig,
uploader=ElasticsearchUploader,
),
elasticsearch_destination_entry = DestinationRegistryEntry(
connection_config=ElasticsearchConnectionConfig,
upload_stager_config=ElasticsearchUploadStagerConfig,
upload_stager=ElasticsearchUploadStager,
uploader_config=ElasticsearchUploaderConfig,
uploader=ElasticsearchUploader,
)
36 changes: 36 additions & 0 deletions unstructured/ingest/v2/processes/connectors/fsspec/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,37 @@
from __future__ import annotations

from unstructured.ingest.v2.processes.connector_registry import (
add_destination_entry,
add_source_entry,
)

from .azure import CONNECTOR_TYPE as AZURE_CONNECTOR_TYPE
from .azure import azure_destination_entry, azure_source_entry
from .box import CONNECTOR_TYPE as BOX_CONNECTOR_TYPE
from .box import box_destination_entry, box_source_entry
from .dropbox import CONNECTOR_TYPE as DROPBOX_CONNECTOR_TYPE
from .dropbox import dropbox_destination_entry, dropbox_source_entry
from .gcs import CONNECTOR_TYPE as GCS_CONNECTOR_TYPE
from .gcs import gcs_destination_entry, gcs_source_entry
from .s3 import CONNECTOR_TYPE as S3_CONNECTOR_TYPE
from .s3 import s3_destination_entry, s3_source_entry
from .sftp import CONNECTOR_TYPE as SFTP_CONNECTOR_TYPE
from .sftp import sftp_destination_entry, sftp_source_entry

add_source_entry(source_type=AZURE_CONNECTOR_TYPE, entry=azure_source_entry)
add_destination_entry(destination_type=AZURE_CONNECTOR_TYPE, entry=azure_destination_entry)

add_source_entry(source_type=BOX_CONNECTOR_TYPE, entry=box_source_entry)
add_destination_entry(destination_type=BOX_CONNECTOR_TYPE, entry=box_destination_entry)

add_source_entry(source_type=DROPBOX_CONNECTOR_TYPE, entry=dropbox_source_entry)
add_destination_entry(destination_type=DROPBOX_CONNECTOR_TYPE, entry=dropbox_destination_entry)

add_source_entry(source_type=GCS_CONNECTOR_TYPE, entry=gcs_source_entry)
add_destination_entry(destination_type=GCS_CONNECTOR_TYPE, entry=gcs_destination_entry)

add_source_entry(source_type=S3_CONNECTOR_TYPE, entry=s3_source_entry)
add_destination_entry(destination_type=S3_CONNECTOR_TYPE, entry=s3_destination_entry)

add_source_entry(source_type=SFTP_CONNECTOR_TYPE, entry=sftp_source_entry)
add_destination_entry(destination_type=SFTP_CONNECTOR_TYPE, entry=sftp_destination_entry)
28 changes: 10 additions & 18 deletions unstructured/ingest/v2/processes/connectors/fsspec/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
from unstructured.ingest.v2.processes.connector_registry import (
DestinationRegistryEntry,
SourceRegistryEntry,
add_destination_entry,
add_source_entry,
)
from unstructured.ingest.v2.processes.connectors.fsspec.fsspec import (
FsspecAccessConfig,
Expand Down Expand Up @@ -131,22 +129,16 @@ async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> Non
return await super().run_async(path=path, file_data=file_data, **kwargs)


add_source_entry(
source_type=CONNECTOR_TYPE,
entry=SourceRegistryEntry(
indexer=AzureIndexer,
indexer_config=AzureIndexerConfig,
downloader=AzureDownloader,
downloader_config=AzureDownloaderConfig,
connection_config=AzureConnectionConfig,
),
azure_source_entry = SourceRegistryEntry(
indexer=AzureIndexer,
indexer_config=AzureIndexerConfig,
downloader=AzureDownloader,
downloader_config=AzureDownloaderConfig,
connection_config=AzureConnectionConfig,
)

add_destination_entry(
destination_type=CONNECTOR_TYPE,
entry=DestinationRegistryEntry(
uploader=AzureUploader,
uploader_config=AzureUploaderConfig,
connection_config=AzureConnectionConfig,
),
azure_destination_entry = DestinationRegistryEntry(
uploader=AzureUploader,
uploader_config=AzureUploaderConfig,
connection_config=AzureConnectionConfig,
)
Loading

0 comments on commit c28deff

Please sign in to comment.