Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bugfix/isolate ingest v2 dependencies #3327

Merged
merged 3 commits into from
Jul 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.14.10-dev4
## 0.14.10-dev5

### Enhancements

Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.14.10-dev4" # pragma: no cover
__version__ = "0.14.10-dev5" # pragma: no cover
2 changes: 1 addition & 1 deletion unstructured/ingest/v2/cli/base/dest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

import click

from unstructured.ingest.cli.utils import conform_click_options
from unstructured.ingest.v2.cli.base.cmd import BaseCmd
from unstructured.ingest.v2.cli.interfaces import CliConfig
from unstructured.ingest.v2.cli.utils import conform_click_options
from unstructured.ingest.v2.logger import logger


Expand Down
2 changes: 1 addition & 1 deletion unstructured/ingest/v2/cli/base/src.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import click

from unstructured.ingest.cli.utils import Group, conform_click_options
from unstructured.ingest.v2.cli.base.cmd import BaseCmd
from unstructured.ingest.v2.cli.configs import (
ChunkerCliConfig,
Expand All @@ -13,6 +12,7 @@
ProcessorCliConfig,
)
from unstructured.ingest.v2.cli.interfaces import CliConfig
from unstructured.ingest.v2.cli.utils import Group, conform_click_options
from unstructured.ingest.v2.logger import logger


Expand Down
2 changes: 1 addition & 1 deletion unstructured/ingest/v2/cli/cmds/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

import click

from unstructured.ingest.cli.interfaces import Dict
from unstructured.ingest.v2.cli.base import DestCmd
from unstructured.ingest.v2.cli.interfaces import CliConfig
from unstructured.ingest.v2.cli.utils import Dict
from unstructured.ingest.v2.processes.connectors.chroma import CONNECTOR_TYPE


Expand Down
2 changes: 1 addition & 1 deletion unstructured/ingest/v2/cli/configs/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import click

from unstructured.ingest.cli.interfaces import DelimitedString, Dict
from unstructured.ingest.v2.cli.interfaces import CliConfig
from unstructured.ingest.v2.cli.utils import DelimitedString, Dict


@dataclass
Expand Down
59 changes: 59 additions & 0 deletions unstructured/ingest/v2/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
from dataclasses import fields, is_dataclass
from gettext import gettext, ngettext
from gettext import gettext as _
rbiseck3 marked this conversation as resolved.
Show resolved Hide resolved
from pathlib import Path
from typing import Any, ForwardRef, Optional, Type, TypeVar, Union, get_args, get_origin

Expand All @@ -12,6 +13,13 @@
from unstructured.ingest.v2.logger import logger


def conform_click_options(options: dict):
# Click sets all multiple fields as tuple, this needs to be updated to list
for k, v in options.items():
if isinstance(v, tuple):
options[k] = list(v)


class Dict(click.ParamType):
name = "dict"

Expand Down Expand Up @@ -179,3 +187,54 @@ def is_subclass(instance, class_type) -> bool:

adjusted_dict = conform_dict(inner_d=flat_data, inner_config=config)
return config.from_dict(adjusted_dict, apply_name_overload=False)


class Group(click.Group):
def parse_args(self, ctx, args):
"""
This allows for subcommands to be called with the --help flag without breaking
if parent command is missing any of its required parameters
"""

try:
return super().parse_args(ctx, args)
except click.MissingParameter:
if "--help" not in args:
raise

# remove the required params so that help can display
for param in self.params:
param.required = False
return super().parse_args(ctx, args)

def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None:
"""
Copy of the original click.Group format_commands() method but replacing
'Commands' -> 'Destinations'
"""
commands = []
for subcommand in self.list_commands(ctx):
cmd = self.get_command(ctx, subcommand)
# What is this, the tool lied about a command. Ignore it
if cmd is None:
continue
if cmd.hidden:
continue

commands.append((subcommand, cmd))

# allow for 3 times the default spacing
if len(commands):
if formatter.width:
limit = formatter.width - 6 - max(len(cmd[0]) for cmd in commands)
else:
limit = -6 - max(len(cmd[0]) for cmd in commands)

rows = []
for subcommand, cmd in commands:
help = cmd.get_short_help_str(limit)
rows.append((subcommand, help))

if rows:
with formatter.section(_("Destinations")):
formatter.write_dl(rows)
46 changes: 46 additions & 0 deletions unstructured/ingest/v2/processes/connectors/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,47 @@
from __future__ import annotations

import unstructured.ingest.v2.processes.connectors.fsspec # noqa: F401
from unstructured.ingest.v2.processes.connector_registry import (
add_destination_entry,
add_source_entry,
)

from .astra import CONNECTOR_TYPE as ASTRA_CONNECTOR_TYPE
from .astra import astra_destination_entry
from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
from .chroma import chroma_destination_entry
from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
from .google_drive import google_drive_source_entry
from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
from .local import local_destination_entry, local_source_entry
from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
from .onedrive import onedrive_source_entry
from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
from .opensearch import opensearch_destination_entry, opensearch_source_entry
from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
from .weaviate import weaviate_destination_entry

add_destination_entry(destination_type=ASTRA_CONNECTOR_TYPE, entry=astra_destination_entry)

add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)

add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
add_destination_entry(
destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
)

add_source_entry(source_type=GOOGLE_DRIVE_CONNECTOR_TYPE, entry=google_drive_source_entry)

add_source_entry(source_type=LOCAL_CONNECTOR_TYPE, entry=local_source_entry)
add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destination_entry)

add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)

add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
add_destination_entry(
destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
)

add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
16 changes: 6 additions & 10 deletions unstructured/ingest/v2/processes/connectors/astra.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from unstructured.ingest.v2.logger import logger
from unstructured.ingest.v2.processes.connector_registry import (
DestinationRegistryEntry,
add_destination_entry,
)
from unstructured.utils import requires_dependencies

Expand Down Expand Up @@ -143,13 +142,10 @@ def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
collection.insert_many(chunk)


add_destination_entry(
destination_type=CONNECTOR_TYPE,
entry=DestinationRegistryEntry(
connection_config=AstraConnectionConfig,
upload_stager_config=AstraUploadStagerConfig,
upload_stager=AstraUploadStager,
uploader_config=AstraUploaderConfig,
uploader=AstraUploader,
),
astra_destination_entry = DestinationRegistryEntry(
connection_config=AstraConnectionConfig,
upload_stager_config=AstraUploadStagerConfig,
upload_stager=AstraUploadStager,
uploader_config=AstraUploaderConfig,
uploader=AstraUploader,
)
16 changes: 6 additions & 10 deletions unstructured/ingest/v2/processes/connectors/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
from unstructured.ingest.v2.logger import logger
from unstructured.ingest.v2.processes.connector_registry import (
DestinationRegistryEntry,
add_destination_entry,
)
from unstructured.staging.base import flatten_dict
from unstructured.utils import requires_dependencies
Expand Down Expand Up @@ -197,13 +196,10 @@ def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
self.upsert_batch(collection, self.prepare_chroma_list(chunk))


add_destination_entry(
destination_type=CONNECTOR_TYPE,
entry=DestinationRegistryEntry(
connection_config=ChromaConnectionConfig,
uploader=ChromaUploader,
uploader_config=ChromaUploaderConfig,
upload_stager=ChromaUploadStager,
upload_stager_config=ChromaUploadStagerConfig,
),
chroma_destination_entry = DestinationRegistryEntry(
connection_config=ChromaConnectionConfig,
uploader=ChromaUploader,
uploader_config=ChromaUploaderConfig,
upload_stager=ChromaUploadStager,
upload_stager_config=ChromaUploadStagerConfig,
)
32 changes: 12 additions & 20 deletions unstructured/ingest/v2/processes/connectors/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@
from unstructured.ingest.v2.processes.connector_registry import (
DestinationRegistryEntry,
SourceRegistryEntry,
add_destination_entry,
add_source_entry,
)
from unstructured.staging.base import flatten_dict
from unstructured.utils import requires_dependencies
Expand Down Expand Up @@ -386,24 +384,18 @@ def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
)


add_source_entry(
source_type=CONNECTOR_TYPE,
entry=SourceRegistryEntry(
connection_config=ElasticsearchConnectionConfig,
indexer=ElasticsearchIndexer,
indexer_config=ElasticsearchIndexerConfig,
downloader=ElasticsearchDownloader,
downloader_config=ElasticsearchDownloaderConfig,
),
elasticsearch_source_entry = SourceRegistryEntry(
connection_config=ElasticsearchConnectionConfig,
indexer=ElasticsearchIndexer,
indexer_config=ElasticsearchIndexerConfig,
downloader=ElasticsearchDownloader,
downloader_config=ElasticsearchDownloaderConfig,
)

add_destination_entry(
destination_type=CONNECTOR_TYPE,
entry=DestinationRegistryEntry(
connection_config=ElasticsearchConnectionConfig,
upload_stager_config=ElasticsearchUploadStagerConfig,
upload_stager=ElasticsearchUploadStager,
uploader_config=ElasticsearchUploaderConfig,
uploader=ElasticsearchUploader,
),
elasticsearch_destination_entry = DestinationRegistryEntry(
connection_config=ElasticsearchConnectionConfig,
upload_stager_config=ElasticsearchUploadStagerConfig,
upload_stager=ElasticsearchUploadStager,
uploader_config=ElasticsearchUploaderConfig,
uploader=ElasticsearchUploader,
)
36 changes: 36 additions & 0 deletions unstructured/ingest/v2/processes/connectors/fsspec/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,37 @@
from __future__ import annotations

from unstructured.ingest.v2.processes.connector_registry import (
add_destination_entry,
add_source_entry,
)

from .azure import CONNECTOR_TYPE as AZURE_CONNECTOR_TYPE
from .azure import azure_destination_entry, azure_source_entry
from .box import CONNECTOR_TYPE as BOX_CONNECTOR_TYPE
from .box import box_destination_entry, box_source_entry
from .dropbox import CONNECTOR_TYPE as DROPBOX_CONNECTOR_TYPE
from .dropbox import dropbox_destination_entry, dropbox_source_entry
from .gcs import CONNECTOR_TYPE as GCS_CONNECTOR_TYPE
from .gcs import gcs_destination_entry, gcs_source_entry
from .s3 import CONNECTOR_TYPE as S3_CONNECTOR_TYPE
from .s3 import s3_destination_entry, s3_source_entry
from .sftp import CONNECTOR_TYPE as SFTP_CONNECTOR_TYPE
from .sftp import sftp_destination_entry, sftp_source_entry

add_source_entry(source_type=AZURE_CONNECTOR_TYPE, entry=azure_source_entry)
add_destination_entry(destination_type=AZURE_CONNECTOR_TYPE, entry=azure_destination_entry)

add_source_entry(source_type=BOX_CONNECTOR_TYPE, entry=box_source_entry)
add_destination_entry(destination_type=BOX_CONNECTOR_TYPE, entry=box_destination_entry)

add_source_entry(source_type=DROPBOX_CONNECTOR_TYPE, entry=dropbox_source_entry)
add_destination_entry(destination_type=DROPBOX_CONNECTOR_TYPE, entry=dropbox_destination_entry)

add_source_entry(source_type=GCS_CONNECTOR_TYPE, entry=gcs_source_entry)
add_destination_entry(destination_type=GCS_CONNECTOR_TYPE, entry=gcs_destination_entry)

add_source_entry(source_type=S3_CONNECTOR_TYPE, entry=s3_source_entry)
add_destination_entry(destination_type=S3_CONNECTOR_TYPE, entry=s3_destination_entry)

add_source_entry(source_type=SFTP_CONNECTOR_TYPE, entry=sftp_source_entry)
add_destination_entry(destination_type=SFTP_CONNECTOR_TYPE, entry=sftp_destination_entry)
28 changes: 10 additions & 18 deletions unstructured/ingest/v2/processes/connectors/fsspec/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
from unstructured.ingest.v2.processes.connector_registry import (
DestinationRegistryEntry,
SourceRegistryEntry,
add_destination_entry,
add_source_entry,
)
from unstructured.ingest.v2.processes.connectors.fsspec.fsspec import (
FsspecAccessConfig,
Expand Down Expand Up @@ -131,22 +129,16 @@ async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> Non
return await super().run_async(path=path, file_data=file_data, **kwargs)


add_source_entry(
source_type=CONNECTOR_TYPE,
entry=SourceRegistryEntry(
indexer=AzureIndexer,
indexer_config=AzureIndexerConfig,
downloader=AzureDownloader,
downloader_config=AzureDownloaderConfig,
connection_config=AzureConnectionConfig,
),
azure_source_entry = SourceRegistryEntry(
indexer=AzureIndexer,
indexer_config=AzureIndexerConfig,
downloader=AzureDownloader,
downloader_config=AzureDownloaderConfig,
connection_config=AzureConnectionConfig,
)

add_destination_entry(
destination_type=CONNECTOR_TYPE,
entry=DestinationRegistryEntry(
uploader=AzureUploader,
uploader_config=AzureUploaderConfig,
connection_config=AzureConnectionConfig,
),
azure_destination_entry = DestinationRegistryEntry(
uploader=AzureUploader,
uploader_config=AzureUploaderConfig,
connection_config=AzureConnectionConfig,
)
Loading
Loading