Skip to content

Commit

Permalink
Fix: MongoDB connector URI password redaction, basic unit tests for G…
Browse files Browse the repository at this point in the history
…it connector (#2268)

MongoDB connector:
Issue:
[MongoDB
documentation](https://www.mongodb.com/docs/manual/reference/connection-string/)
states that characters `$ : / ? # [ ] @` must be percent-encoded. A URI
whose password contains any of these special characters was not redacted.

Fix:
This fix removes the use of `unquote_plus` on the password, so that the
detected password matches the (still percent-encoded) one inside the URI
and is successfully replaced.

Git connector:
Added very basic unit tests for the repository filtering methods. Their
impact is rather minimal, but they showcase a current limitation in the
`is_file_type_supported` method.
  • Loading branch information
jakub-sandomierz-deepsense-ai authored Jan 8, 2024
1 parent e65a44e commit 0ca154a
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
### Fixes

* **Fix unequal row-length in HTMLTable.text_as_html.** Fixes to other aspects of partition_html() in v0.11 allowed unequal cell-counts in table rows. Make the cells in each row correspond 1:1 with cells in the original table row. This fix also removes "noise" cells resulting from HTML-formatting whitespace and eliminates the "column-shifting" of cells that previously resulted from noise-cells.
* **Fix MongoDB connector URI password redaction.** MongoDB documentation states that characters `$ : / ? # [ ] @` must be percent-encoded. URIs with passwords containing such special characters were not redacted.

## 0.11.8

Expand Down
16 changes: 16 additions & 0 deletions test_unstructured_ingest/unit/test_common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from unstructured.ingest.cli.common import options_redactions


def test_options_redactions():
    """Check that a percent-encoded password inside a MongoDB URI is redacted."""
    # The password contains "@", which must be written as "%40" inside a
    # connection string; the redaction has to cope with that encoded form.
    given_options = {
        "uri": "mongodb+srv://myDatabaseUser:D1fficultP%40ssw0rd@mongodb0.example.com/"
        "?authSource=admin&replicaSet=myRepl"
    }

    when_options = options_redactions(options=given_options)

    # The returned URI must differ from the input ...
    assert given_options["uri"] != when_options["uri"]
    # ... and only the password portion may have been replaced.
    assert (
        when_options["uri"] == "mongodb+srv://myDatabaseUser:***REDACTED***@mongodb0.example.com/"
        "?authSource=admin&replicaSet=myRepl"
    )
61 changes: 61 additions & 0 deletions test_unstructured_ingest/unit/test_connector_git.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from pathlib import Path

import pytest

from unstructured.ingest.connector.git import GitAccessConfig, GitSourceConnector, SimpleGitConfig


@pytest.mark.parametrize(
    ("given_file_path", "then_is_supported"),
    [
        # File names expected to be accepted.
        *(
            (Path("src/submodule") / file_name, True)
            for file_name in (
                "document.md",
                "document.txt",
                "document.pdf",
                "document.doc",
                "document.docx",
                "document.eml",
                "document.html",
                "document.png",
                "document.jpg",
                "document.ppt",
                "document.pptx",
                "document.xml",
            )
        ),
        # File names expected to be rejected, including extension-less ones.
        *(
            (Path("src/submodule") / file_name, False)
            for file_name in (
                "code.py",
                "Dockerfile",
                "Makefile",
                "LICENSE",
            )
        ),
    ],
)
def test_connector_supports_file(given_file_path, then_is_supported):
    """Compare ``is_file_type_supported`` against the expected verdict for each path."""
    verdict = GitSourceConnector.is_file_type_supported(str(given_file_path))

    assert verdict == then_is_supported


class FakeGitSourceConnectorImpl(GitSourceConnector):
    """Test double for ``GitSourceConnector`` with a stubbed ``get_ingest_docs``.

    NOTE(review): presumably needed because the base class cannot be
    instantiated without this method being overridden -- confirm.
    """

    def get_ingest_docs(self):
        """Stub: does nothing and returns ``None``."""
        return None


@pytest.mark.parametrize(
    ("given_file_path", "given_file_glob", "then_matches_glob"),
    [
        (Path("LICENSE"), None, True),
        (Path("Makefile"), ["Makefile"], True),
        (Path("src/my/super/module/main.py"), ["**/*.py"], True),
        (Path("src/my/super/module/main.pyc"), ["**/*.py"], False),
    ],
)
def test_connector_does_path_match_glob(given_file_path, given_file_glob, then_matches_glob):
    """Compare ``does_path_match_glob`` against the expected result for each path/glob pair."""
    # Placeholder URL and token: the test only calls does_path_match_glob.
    access = GitAccessConfig(access_token="some_fake_token")
    config = SimpleGitConfig(
        url="some_fake_url",
        access_config=access,
        file_glob=given_file_glob,
    )
    connector = FakeGitSourceConnectorImpl(
        processor_config=None,
        read_config=None,
        connector_config=config,
    )

    assert connector.does_path_match_glob(str(given_file_path)) == then_matches_glob
3 changes: 2 additions & 1 deletion unstructured/ingest/connector/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ def initialize(self):
def check_connection(self):
    """No-op: performs no check and returns ``None``."""
    return None

def is_file_type_supported(self, path: str) -> bool:
@staticmethod
def is_file_type_supported(path: str) -> bool:
# Workaround to ensure that auto.partition isn't fed with .yaml, .py, etc. files
# TODO: What to do with no filenames? e.g. LICENSE, Makefile, etc.
supported = path.endswith(
Expand Down
3 changes: 1 addition & 2 deletions unstructured/ingest/connector/mongodb.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import json
import typing as t
from dataclasses import dataclass, field
from urllib.parse import unquote_plus

from dataclasses_json.core import Json

Expand All @@ -24,7 +23,7 @@

def parse_userinfo(userinfo: str) -> t.Tuple[str, str]:
    """Split the userinfo part of a URI into ``(user, password)``.

    The split happens at the first ``:``; anything after it (including further
    colons) is the password. When there is no ``:``, the password is ``""``.

    Both parts are returned exactly as written in ``userinfo`` -- they are NOT
    percent-decoded -- so the password can be matched verbatim against the
    original URI text.
    """
    user, _, passwd = userinfo.partition(":")
    return user, passwd


def redact(uri: str, redacted_text="***REDACTED***") -> str:
Expand Down

0 comments on commit 0ca154a

Please sign in to comment.