From 6f57edffea799086bbb66da1a02eb8d50e628a8a Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 31 May 2024 14:22:20 +0100 Subject: [PATCH 1/4] Handle punctuation in user dir search properly We want searches such as "user-1" to find users with ID of `@user-1:example.com`. --- .../storage/databases/main/user_directory.py | 64 +++++++++++++++++-- tests/handlers/test_user_directory.py | 40 ++++++++++++ tests/storage/test_user_directory.py | 4 ++ 3 files changed, 102 insertions(+), 6 deletions(-) diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 0513e7dc06e..863c46e16ee 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -1281,7 +1281,7 @@ def _parse_words_with_regex(search_term: str) -> List[str]: Break down search term into words, when we don't have ICU available. See: `_parse_words` """ - return re.findall(r"([\w\-]+)", search_term, re.UNICODE) + return re.findall(r"([\w-]+)", search_term, re.UNICODE) def _parse_words_with_icu(search_term: str) -> List[str]: @@ -1303,15 +1303,67 @@ def _parse_words_with_icu(search_term: str) -> List[str]: if j < 0: break - result = search_term[i:j] + result = search_term[i:j].strip() + results.append(result) + + i = j + + # libicu will break up words that have punctuation in them, but to handle + # cases where user IDs have '-', '.' and '_' in them we want to *not* break + # those into words and instead allow the DB to tokenise them how it wants. + # + # In particular, user-71 in postgres gets tokenised to "user, -71", and this + # will not match a query for "user, 71". + new_results: List[str] = [] + i = 0 + while i < len(results): + curr = results[i] + + prev = None + next = None + if i > 0: + prev = results[i - 1] + if i + 1 < len(results): + next = results[i + 1] + + i += 1 # libicu considers spaces and punctuation between words as words, but we don't # want to include those in results as they would result in syntax errors in SQL # queries (e.g. "foo bar" would result in the search query including "foo & & # bar"). - if len(re.findall(r"([\w\-]+)", result, re.UNICODE)): - results.append(result) + if not curr: + continue + + if curr in ["-", ".", "_"]: + prefix = "" + suffix = "" + + # Check if the next item is a word, and if so use it as the suffix. + # We check for if its a word as we don't want to concatenate + # multiple punctuation marks. + if next is not None and re.match(r"\w", next): + suffix = next + i += 1 # We're using next, so we skip it in the outer loop. + else: + # We want to avoid creating terms like "user-", as we should + # strip trailing punctuation. + continue - i = j + if prev and re.match(r"\w", prev) and new_results: + prefix = new_results[-1] + new_results.pop() + + # We might not have a prefix here, but that's fine as we want to + # ensure that we don't strip preceding punctuation e.g. '-71' + # shouldn't be converted to '71'. + + new_results.append(f"{prefix}{curr}{suffix}") + continue + elif not re.match(r"\w", curr): + # Ignore other punctuation + continue + + new_results.append(curr) - return results + return new_results diff --git a/tests/handlers/test_user_directory.py b/tests/handlers/test_user_directory.py index 77c6cac4497..b9122e1577f 100644 --- a/tests/handlers/test_user_directory.py +++ b/tests/handlers/test_user_directory.py @@ -21,6 +21,7 @@ from unittest.mock import AsyncMock, Mock, patch from urllib.parse import quote + from twisted.test.proto_helpers import MemoryReactor import synapse.rest.admin @@ -1061,6 +1062,45 @@ def test_ignore_display_names_with_null_codepoints(self) -> None: {alice: ProfileInfo(display_name=None, avatar_url=MXC_DUMMY)}, ) + def test_search_punctuation(self) -> None: + """Test that you can search for a user that includes punctuation""" + + searching_user = self.register_user("searcher", "password") + searching_user_tok = self.login("searcher", "password") + + room_id = self.helper.create_room_as( + searching_user, + room_version=RoomVersions.V1.identifier, + tok=searching_user_tok, + ) + + # We want to test searching for users of the form e.g. "user-1", with + # various punctuation. We also test both where the prefix is numeric and + # alphanumeric, as e.g. postgres tokenises "user-1" as "user" and "-1". + i = 1 + for char in ["-", ".", "_"]: + for use_numeric in [False, True]: + if use_numeric: + prefix1 = f"{i}" + prefix2 = f"{i+1}" + else: + prefix1 = f"a{i}" + prefix2 = f"a{i+1}" + + local_user_1 = self.register_user(f"user{char}{prefix1}", "password") + local_user_2 = self.register_user(f"user{char}{prefix2}", "password") + + self._add_user_to_room(room_id, RoomVersions.V1, local_user_1) + self._add_user_to_room(room_id, RoomVersions.V1, local_user_2) + + results = self.get_success( + self.handler.search_users(searching_user, local_user_1, 20) + )["results"] + received_user_id_ordering = [result["user_id"] for result in results] + self.assertSequenceEqual(received_user_id_ordering[:1], [local_user_1]) + + i += 2 + class TestUserDirSearchDisabled(unittest.HomeserverTestCase): servlets = [ diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index 156a610faa9..c26932069f8 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -711,6 +711,10 @@ def test_icu_word_boundary_punctuation(self) -> None: ), ) + self.assertEqual(_parse_words_with_icu("user-1"), ["user-1"]) + self.assertEqual(_parse_words_with_icu("user-ab"), ["user-ab"]) + self.assertEqual(_parse_words_with_icu("user.--1"), ["user", "-1"]) + def test_regex_word_boundary_punctuation(self) -> None: """ Tests the behaviour of punctuation with the non-ICU tokeniser From 67fdd88b2f3fe58a9f500352a1bf7445883d4c2e Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 4 Jun 2024 11:09:32 +0100 Subject: [PATCH 2/4] Newsfile --- changelog.d/17254.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/17254.bugfix diff --git a/changelog.d/17254.bugfix b/changelog.d/17254.bugfix new file mode 100644 index 00000000000..b0d61309e2c --- /dev/null +++ b/changelog.d/17254.bugfix @@ -0,0 +1 @@ +Fix searching for users with their exact localpart whose ID includes a hyphen. From d6245fd6b8da5fc64bc4ec46bf6209e14a8fbcf7 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 4 Jun 2024 11:13:05 +0100 Subject: [PATCH 3/4] Lint --- tests/handlers/test_user_directory.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/handlers/test_user_directory.py b/tests/handlers/test_user_directory.py index b9122e1577f..878d9683b6a 100644 --- a/tests/handlers/test_user_directory.py +++ b/tests/handlers/test_user_directory.py @@ -21,7 +21,6 @@ from unittest.mock import AsyncMock, Mock, patch from urllib.parse import quote - from twisted.test.proto_helpers import MemoryReactor import synapse.rest.admin From 39ae391c5f0925ef7b4516093278108e47a1505a Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 4 Jun 2024 13:57:05 +0100 Subject: [PATCH 4/4] Handle more punctuation properly --- synapse/storage/databases/main/user_directory.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 863c46e16ee..6e18f714d75 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -1303,8 +1303,10 @@ def _parse_words_with_icu(search_term: str) -> List[str]: if j < 0: break - result = search_term[i:j].strip() - results.append(result) + # We want to make sure that we split on `@` and `:` specifically, as + # they occur in user IDs. + for result in re.split(r"[@:]+", search_term[i:j]): + results.append(result.strip()) i = j