Skip to content

Commit

Permalink
bug CORE-4225: mongodb url bug (Unstructured-IO#2662)
Browse files Browse the repository at this point in the history
The MongoDB redact method was created because we wanted part of the URL
to be exposed to the user during logging. Thus it did not use the
dataclass `enhanced_field(sensitive=True)` solution.

This commit changes it to use our standard redaction solution. This also
minimizes the amount of work to be done in platform.
  • Loading branch information
potter-potter authored and kaaloo committed Apr 8, 2024
1 parent 00a49ea commit bd26697
Show file tree
Hide file tree
Showing 7 changed files with 15 additions and 83 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.13.0-dev12
## 0.13.0-dev13

### Enhancements

Expand All @@ -20,6 +20,7 @@
* **Fix OneDrive dates with inconsistent formatting** Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string. See previous fix for SharePoint
* **Adds tracking for AstraDB** Adds tracking info so AstraDB can see what source called their api.
* **Support AWS Bedrock Embeddings in ingest CLI** The configs required to instantiate the bedrock embedding class are now exposed in the api and the version of boto being used meets the minimum requirement to introduce the bedrock runtime required to hit the service.
* **Change MongoDB redacting** The original solution for redacting secrets was causing issues in platform. This fix uses our standard logging redaction solution.

## 0.12.6

Expand Down
3 changes: 2 additions & 1 deletion test_unstructured_ingest/dest/vectara.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ RANDOM_SUFFIX=$((RANDOM % 100000 + 1))
CORPUS_NAME="test-corpus-vectara-"$RANDOM_SUFFIX

# Expected size of the uploaded document
EXPECTED_CORPUS_SIZE=8830076
EXPECTED_CORPUS_SIZE=8842684

if [ -z "$VECTARA_OAUTH_CLIENT_ID" ] && [ -z "$VECTARA_OAUTH_SECRET" ] && [ -z "$VECTARA_CUSTOMER_ID" ]; then
echo "Skipping VECTARA ingest test because VECTARA_OAUTH_CLIENT_ID, VECTARA_OAUTH_SECRET, or VECTARA_CUSTOMER_ID env var is not set."
Expand Down Expand Up @@ -89,5 +89,6 @@ if [ "$corpus_size" == "$EXPECTED_CORPUS_SIZE" ]; then
echo "Corpus size is as expected: $corpus_size"
else
echo "Corpus size is not as expected: $corpus_size"
echo "vs $EXPECTED_CORPUS_SIZE"
exit 1
fi
16 changes: 0 additions & 16 deletions test_unstructured_ingest/unit/test_common.py

This file was deleted.

2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.0-dev12" # pragma: no cover
__version__ = "0.13.0-dev13" # pragma: no cover
13 changes: 1 addition & 12 deletions unstructured/ingest/cli/common.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,7 @@
import logging

from unstructured.ingest.logger import ingest_log_streaming_init, logger


def options_redactions(options: dict) -> dict:
# handle any logic needed to redact not already caught by the logging filter
options = options.copy()
if "uri" in options and options["uri"].startswith("mongodb"):
from unstructured.ingest.connector.mongodb import redact

options["uri"] = redact(options["uri"])
return options
from unstructured.ingest.logger import ingest_log_streaming_init


def log_options(options: dict, verbose=False):
ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
logger.debug(f"options: {options_redactions(options)}")
59 changes: 8 additions & 51 deletions unstructured/ingest/connector/mongodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
from dataclasses import dataclass, field
from pathlib import Path

from dataclasses_json.core import Json

from unstructured.__version__ import __version__ as unstructured_version
from unstructured.ingest.enhanced_dataclass import enhanced_field
from unstructured.ingest.enhanced_dataclass.core import _asdict
from unstructured.ingest.error import DestinationConnectionError, SourceConnectionError, WriteError
from unstructured.ingest.interfaces import (
AccessConfig,
BaseConnectorConfig,
BaseDestinationConnector,
BaseIngestDocBatch,
Expand All @@ -34,72 +34,29 @@ def parse_userinfo(userinfo: str) -> t.Tuple[str, str]:
return user, passwd


def redact(uri: str, redacted_text="***REDACTED***") -> str:
"""
Cherry pick code from pymongo.uri_parser.parse_uri to only extract password and
redact without needing to import pymongo library
"""

SCHEME = "mongodb://"
SRV_SCHEME = "mongodb+srv://"
if uri.startswith(SCHEME):
scheme_free = uri[len(SCHEME) :] # noqa: E203
elif uri.startswith(SRV_SCHEME):
scheme_free = uri[len(SRV_SCHEME) :] # noqa: E203
else:
raise ValueError(f"Invalid URI scheme: URI must begin with '{SCHEME}' or '{SRV_SCHEME}'")

passwd = None

host_part, _, path_part = scheme_free.partition("/")
if not host_part:
host_part = path_part
path_part = ""

if not path_part:
# There was no slash in scheme_free, check for a sole "?".
host_part, _, _ = host_part.partition("?")

if "@" in host_part:
userinfo, _, hosts = host_part.rpartition("@")
_, passwd = parse_userinfo(userinfo)

if passwd:
uri = uri.replace(passwd, redacted_text)
return uri
@dataclass
class MongoDBAccessConfig(AccessConfig):
uri: t.Optional[str] = enhanced_field(sensitive=True, default=None)


@dataclass
class SimpleMongoDBConfig(BaseConnectorConfig):
uri: t.Optional[str] = None
access_config: MongoDBAccessConfig
host: t.Optional[str] = None
database: t.Optional[str] = None
collection: t.Optional[str] = None
port: int = 27017
batch_size: int = 100

def to_dict(
self, redact_sensitive=False, redacted_text="***REDACTED***", **kwargs
) -> t.Dict[str, Json]:
d = super().to_dict(
redact_sensitive=redact_sensitive, redacted_text=redacted_text, **kwargs
)
if redact_sensitive:
if self.host:
d["host"] = redact(uri=self.host, redacted_text=redacted_text)
if self.uri:
d["uri"] = redact(uri=self.uri, redacted_text=redacted_text)
return d

@requires_dependencies(["pymongo"], extras="mongodb")
def generate_client(self) -> "MongoClient":
from pymongo import MongoClient
from pymongo.driver_info import DriverInfo
from pymongo.server_api import ServerApi

if self.uri:
if self.access_config.uri:
return MongoClient(
self.uri,
self.access_config.uri,
server_api=ServerApi(version=SERVER_API_VERSION),
driver=DriverInfo(name="unstructured", version=unstructured_version),
)
Expand Down
2 changes: 1 addition & 1 deletion unstructured/ingest/runner/mongodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class MongoDBRunner(Runner):

def update_read_config(self):
hashed_dir_name = hashlib.sha256(
str(self.connector_config.uri).encode("utf-8"),
str(self.connector_config.access_config.uri).encode("utf-8"),
)
self.read_config.download_dir = update_download_dir_hash(
connector_name="mongodb",
Expand Down

0 comments on commit bd26697

Please sign in to comment.