From bd266979f93deb013f260e26b0230d6f637a0a68 Mon Sep 17 00:00:00 2001 From: David Potter Date: Thu, 28 Mar 2024 11:38:50 -0700 Subject: [PATCH] bug CORE-4225: mongodb url bug (#2662) The mongodb redact method was created because we wanted part of the url to be exposed to the user during logging. Thus it did not use the dataclass `enhanced_field(sensitive=True)` solution. This changes it to use our standard redacted solution. This also minimizes the amount of work to be done in platform. --- CHANGELOG.md | 3 +- test_unstructured_ingest/dest/vectara.sh | 3 +- test_unstructured_ingest/unit/test_common.py | 16 ------ unstructured/__version__.py | 2 +- unstructured/ingest/cli/common.py | 13 +---- unstructured/ingest/connector/mongodb.py | 59 +++----------------- unstructured/ingest/runner/mongodb.py | 2 +- 7 files changed, 15 insertions(+), 83 deletions(-) delete mode 100644 test_unstructured_ingest/unit/test_common.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a78768256..fbc9ace711 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.13.0-dev12 +## 0.13.0-dev13 ### Enhancements @@ -20,6 +20,7 @@ * **Fix OneDrive dates with inconsistent formatting** Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string. See previous fix for SharePoint * **Adds tracking for AstraDB** Adds tracking info so AstraDB can see what source called their api. * **Support AWS Bedrock Embeddings in ingest CLI** The configs required to instantiate the bedrock embedding class are now exposed in the api and the version of boto being used meets the minimum requirement to introduce the bedrock runtime required to hit the service. +* **Change MongoDB redacting** Original redact secrets solution is causing issues in platform. This fix uses our standard logging redact solution. ## 0.12.6 diff --git a/test_unstructured_ingest/dest/vectara.sh b/test_unstructured_ingest/dest/vectara.sh index 861946a4bd..0db1674800 100755 --- a/test_unstructured_ingest/dest/vectara.sh +++ b/test_unstructured_ingest/dest/vectara.sh @@ -14,7 +14,7 @@ RANDOM_SUFFIX=$((RANDOM % 100000 + 1)) CORPUS_NAME="test-corpus-vectara-"$RANDOM_SUFFIX # Expected size of the uploaded document -EXPECTED_CORPUS_SIZE=8830076 +EXPECTED_CORPUS_SIZE=8842684 if [ -z "$VECTARA_OAUTH_CLIENT_ID" ] && [ -z "$VECTARA_OAUTH_SECRET" ] && [ -z "$VECTARA_CUSTOMER_ID" ]; then echo "Skipping VECTARA ingest test because VECTARA_OAUTH_CLIENT_ID, VECTARA_OAUTH_SECRET, or VECTARA_CUSTOMER_ID env var is not set." @@ -89,5 +89,6 @@ if [ "$corpus_size" == "$EXPECTED_CORPUS_SIZE" ]; then echo "Corpus size is as expected: $corpus_size" else echo "Corpus size is not as expected: $corpus_size" + echo "vs $EXPECTED_CORPUS_SIZE" exit 1 fi diff --git a/test_unstructured_ingest/unit/test_common.py b/test_unstructured_ingest/unit/test_common.py deleted file mode 100644 index 49a1b56069..0000000000 --- a/test_unstructured_ingest/unit/test_common.py +++ /dev/null @@ -1,16 +0,0 @@ -from unstructured.ingest.cli.common import options_redactions - - -def test_options_redactions(): - given_options = { - "uri": "mongodb+srv://myDatabaseUser:D1fficultP%40ssw0rd@mongodb0.example.com/" - "?authSource=admin&replicaSet=myRepl" - } - - when_options = options_redactions(options=given_options) - - assert given_options["uri"] != when_options["uri"] - assert ( - when_options["uri"] == "mongodb+srv://myDatabaseUser:***REDACTED***@mongodb0.example.com/" - "?authSource=admin&replicaSet=myRepl" - ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 4309d46594..4a1fda063f 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.0-dev12" # pragma: no cover +__version__ = "0.13.0-dev13" # pragma: no cover diff --git a/unstructured/ingest/cli/common.py b/unstructured/ingest/cli/common.py index cc8b058002..53dacafaf9 100644 --- a/unstructured/ingest/cli/common.py +++ b/unstructured/ingest/cli/common.py @@ -1,18 +1,7 @@ import logging -from unstructured.ingest.logger import ingest_log_streaming_init, logger - - -def options_redactions(options: dict) -> dict: - # handle any logic needed to redact not already caught by the logging filter - options = options.copy() - if "uri" in options and options["uri"].startswith("mongodb"): - from unstructured.ingest.connector.mongodb import redact - - options["uri"] = redact(options["uri"]) - return options +from unstructured.ingest.logger import ingest_log_streaming_init def log_options(options: dict, verbose=False): ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - logger.debug(f"options: {options_redactions(options)}") diff --git a/unstructured/ingest/connector/mongodb.py b/unstructured/ingest/connector/mongodb.py index e29ecf6211..ae73ecbec5 100644 --- a/unstructured/ingest/connector/mongodb.py +++ b/unstructured/ingest/connector/mongodb.py @@ -3,12 +3,12 @@ from dataclasses import dataclass, field from pathlib import Path -from dataclasses_json.core import Json - from unstructured.__version__ import __version__ as unstructured_version +from unstructured.ingest.enhanced_dataclass import enhanced_field from unstructured.ingest.enhanced_dataclass.core import _asdict from unstructured.ingest.error import DestinationConnectionError, SourceConnectionError, WriteError from unstructured.ingest.interfaces import ( + AccessConfig, BaseConnectorConfig, BaseDestinationConnector, BaseIngestDocBatch, @@ -34,72 +34,29 @@ def parse_userinfo(userinfo: str) -> t.Tuple[str, str]: return user, passwd -def redact(uri: str, redacted_text="***REDACTED***") -> str: - """ - Cherry pick code from pymongo.uri_parser.parse_uri to only extract password and - redact without needing to import pymongo library - """ - - SCHEME = "mongodb://" - SRV_SCHEME = "mongodb+srv://" - if uri.startswith(SCHEME): - scheme_free = uri[len(SCHEME) :] # noqa: E203 - elif uri.startswith(SRV_SCHEME): - scheme_free = uri[len(SRV_SCHEME) :] # noqa: E203 - else: - raise ValueError(f"Invalid URI scheme: URI must begin with '{SCHEME}' or '{SRV_SCHEME}'") - - passwd = None - - host_part, _, path_part = scheme_free.partition("/") - if not host_part: - host_part = path_part - path_part = "" - - if not path_part: - # There was no slash in scheme_free, check for a sole "?". - host_part, _, _ = host_part.partition("?") - - if "@" in host_part: - userinfo, _, hosts = host_part.rpartition("@") - _, passwd = parse_userinfo(userinfo) - - if passwd: - uri = uri.replace(passwd, redacted_text) - return uri +@dataclass +class MongoDBAccessConfig(AccessConfig): + uri: t.Optional[str] = enhanced_field(sensitive=True, default=None) @dataclass class SimpleMongoDBConfig(BaseConnectorConfig): - uri: t.Optional[str] = None + access_config: MongoDBAccessConfig host: t.Optional[str] = None database: t.Optional[str] = None collection: t.Optional[str] = None port: int = 27017 batch_size: int = 100 - def to_dict( - self, redact_sensitive=False, redacted_text="***REDACTED***", **kwargs - ) -> t.Dict[str, Json]: - d = super().to_dict( - redact_sensitive=redact_sensitive, redacted_text=redacted_text, **kwargs - ) - if redact_sensitive: - if self.host: - d["host"] = redact(uri=self.host, redacted_text=redacted_text) - if self.uri: - d["uri"] = redact(uri=self.uri, redacted_text=redacted_text) - return d - @requires_dependencies(["pymongo"], extras="mongodb") def generate_client(self) -> "MongoClient": from pymongo import MongoClient from pymongo.driver_info import DriverInfo from pymongo.server_api import ServerApi - if self.uri: + if self.access_config.uri: return MongoClient( - self.uri, + self.access_config.uri, server_api=ServerApi(version=SERVER_API_VERSION), driver=DriverInfo(name="unstructured", version=unstructured_version), ) diff --git a/unstructured/ingest/runner/mongodb.py b/unstructured/ingest/runner/mongodb.py index 13cf0330d4..bdde249cde 100644 --- a/unstructured/ingest/runner/mongodb.py +++ b/unstructured/ingest/runner/mongodb.py @@ -17,7 +17,7 @@ class MongoDBRunner(Runner): def update_read_config(self): hashed_dir_name = hashlib.sha256( - str(self.connector_config.uri).encode("utf-8"), + str(self.connector_config.access_config.uri).encode("utf-8"), ) self.read_config.download_dir = update_download_dir_hash( connector_name="mongodb",