Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: allow AstraDB to prevent indexing on metadata columns with long text #3003

Merged
merged 8 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.13.8-dev2
## 0.13.8-dev3

### Enhancements

Expand All @@ -11,6 +11,7 @@
* **Add missing starting_page_num param to partition_image**
* **Make the filename and file params for partition_image and partition_pdf match the other partitioners**
* **Re-apply: skip accuracy calculation feature** Overwritten by mistake
* **AstraDB: option to prevent indexing metadata**

## 0.13.7

Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ include requirements/huggingface.in

# Ingest extras
include requirements/ingest/airtable.in
include requirements/ingest/astra.in
include requirements/ingest/azure-cognitive-search.in
include requirements/ingest/azure.in
include requirements/ingest/biomed.in
Expand Down
3 changes: 2 additions & 1 deletion test_unstructured_ingest/dest/astra.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--token "$ASTRA_DB_TOKEN" \
--api-endpoint "$ASTRA_DB_ENDPOINT" \
--collection-name "$COLLECTION_NAME" \
--embedding-dimension "$EMBEDDING_DIMENSION"
--embedding-dimension "$EMBEDDING_DIMENSION" \
--requested-indexing-policy '{"deny": "metadata"}'
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@erichare Can you shed some light on why this --requested-indexing-policy results in an error? Maybe there is better JSON to test this attribute.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@erichare Switched this PR from draft to normal, just in case you couldn't access the draft.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@potter-potter Apologies for the delay - I was out most of yesterday. Let me test that shortly. Offhand that looks right, assuming it gets converted into a dictionary correctly, but it's a fairly deeply nested object on the Astra Data API side, so I may have missed a top-level attribute. Checking it out shortly...

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@potter-potter I opened a small bugfix PR to merge into this branch (I didn't have write access to it). It was my mistake - I was passing a single field, but the API expects a list of fields even if there is just a single one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@erichare your fix looks good.


python "$SCRIPT_DIR"/python/test-ingest-astra-output.py \
--token "$ASTRA_DB_TOKEN" \
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.8-dev2" # pragma: no cover
__version__ = "0.13.8-dev3" # pragma: no cover
18 changes: 17 additions & 1 deletion unstructured/ingest/cli/cmds/astra.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import click

from unstructured.ingest.cli.interfaces import CliConfig
from unstructured.ingest.cli.interfaces import CliConfig, Dict
from unstructured.ingest.connector.astra import AstraWriteConfig, SimpleAstraConfig


Expand Down Expand Up @@ -38,10 +38,26 @@ def get_cli_options() -> t.List[click.Option]:
),
click.Option(
["--embedding-dimension"],
required=True,
default=384,
type=int,
help="The dimensionality of the embeddings",
),
click.Option(
["--namespace"],
required=False,
default=None,
type=str,
help="The Astra DB namespace to write into.",
),
click.Option(
["--requested-indexing-policy"],
required=False,
default=None,
type=Dict(),
help="The indexing policy to use for the collection."
'example: \'{"deny": "metadata"}\' ',
),
]
return options

Expand Down
30 changes: 19 additions & 11 deletions unstructured/ingest/connector/astra.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from unstructured.__version__ import __version__ as integration_version
from unstructured.ingest.enhanced_dataclass import enhanced_field
from unstructured.ingest.enhanced_dataclass.core import _asdict
from unstructured.ingest.error import DestinationConnectionError, SourceConnectionNetworkError
from unstructured.ingest.error import DestinationConnectionError
from unstructured.ingest.interfaces import (
AccessConfig,
BaseConnectorConfig,
Expand All @@ -15,7 +15,6 @@
)
from unstructured.ingest.logger import logger
from unstructured.ingest.utils.data_prep import chunk_generator
from unstructured.staging.base import flatten_dict
from unstructured.utils import requires_dependencies

if t.TYPE_CHECKING:
Expand All @@ -26,15 +25,17 @@

@dataclass
class AstraAccessConfig(AccessConfig):
token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
api_endpoint: t.Optional[str] = enhanced_field(default=None, sensitive=True)
token: str = enhanced_field(sensitive=True)
api_endpoint: str = enhanced_field(sensitive=True)


@dataclass
class SimpleAstraConfig(BaseConnectorConfig):
access_config: AstraAccessConfig
collection_name: str
embedding_dimension: int
namespace: t.Optional[str] = None
requested_indexing_policy: t.Optional[t.Dict[str, t.Any]] = None


@dataclass
Expand Down Expand Up @@ -69,20 +70,29 @@ def astra_db_collection(self) -> "AstraDBCollection":
if self._astra_db_collection is None:
from astrapy.db import AstraDB

# Get the collection_name and embedding dimension
collection_name = self.connector_config.collection_name
embedding_dimension = self.connector_config.embedding_dimension
requested_indexing_policy = self.connector_config.requested_indexing_policy

# If the user has requested an indexing policy, pass it to the AstraDB
options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None

# Build the Astra DB object.
# caller_name/version for AstraDB tracking
self._astra_db = AstraDB(
api_endpoint=self.connector_config.access_config.api_endpoint,
token=self.connector_config.access_config.token,
namespace=self.connector_config.namespace,
caller_name=integration_name,
caller_version=integration_version,
)

# Create and connect to the newly created collection
self._astra_db_collection = self._astra_db.create_collection(
collection_name=self.connector_config.collection_name,
dimension=self.connector_config.embedding_dimension,
options={"indexing": {"deny": NON_INDEXED_FIELDS}},
collection_name=collection_name,
dimension=embedding_dimension,
options=options,
)
return self._astra_db_collection

Expand All @@ -97,7 +107,7 @@ def check_connection(self):
_ = self.astra_db_collection
except Exception as e:
logger.error(f"Failed to validate connection {e}", exc_info=True)
raise SourceConnectionNetworkError(f"failed to validate connection: {e}")
raise DestinationConnectionError(f"failed to validate connection: {e}")

def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra.")
Expand All @@ -111,7 +121,5 @@ def normalize_dict(self, element_dict: dict) -> dict:
return {
"$vector": element_dict.pop("embeddings", None),
"content": element_dict.pop("text", None),
"metadata": flatten_dict(
element_dict, separator="-", flatten_lists=True, remove_none=True
),
"metadata": element_dict,
}