matrix-org · babolivier · Dec 12, 2022 · Nov 16, 2022 · Dec 6, 2022 · Dec 6, 2022
@@ -0,0 +1 @@
+Improve user search for international display names.
@@ -1,3 +1,10 @@
+matrix-synapse-py3 (1.74.0~rc1) UNRELEASED; urgency=medium
+
+  * New dependency on libicu-dev to provide improved results for user
+    search.
+
+ -- Synapse Packaging team <[email protected]>  Tue, 06 Dec 2022 15:28:10 +0000
+
 matrix-synapse-py3 (1.73.0) stable; urgency=medium
 
   * New Synapse release 1.73.0.

@@ -8,6 +8,8 @@ Build-Depends:
  dh-virtualenv (>= 1.1),
  libsystemd-dev,
  libpq-dev,
+ libicu-dev,
+ pkg-config,
  lsb-release,
  python3-dev,
  python3,

@@ -97,6 +97,8 @@ RUN \
     zlib1g-dev \
     git \
     curl \
+    libicu-dev \
+    pkg-config \
     && rm -rf /var/lib/apt/lists/*
 
 

@@ -84,6 +84,8 @@ RUN apt-get update -qq -o Acquire::Languages=none \
         python3-venv \
         sqlite3 \
         libpq-dev \
+        libicu-dev \
+        pkg-config \
         xmlsec1
 
 # Install rust and ensure it's in the PATH

@@ -208,6 +208,7 @@ hiredis = { version = "*", optional = true }
 Pympler = { version = "*", optional = true }
 parameterized = { version = ">=0.7.4", optional = true }
 idna = { version = ">=2.5", optional = true }
+pyicu = { version = ">=2.10.2", optional = true }
 
 [tool.poetry.extras]
 # NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified
@@ -230,6 +231,10 @@ redis = ["txredisapi", "hiredis"]
 # Required to use experimental `caches.track_memory_usage` config option.
 cache-memory = ["pympler"]
 test = ["parameterized", "idna"]
+# Allows for better search for international characters in the user directory. This
+# requires libicu's development headers installed on the system (e.g. libicu-dev on
+# Debian-based distributions).
+user-search = ["pyicu"]
 
 # The duplication here is awful. I hate hate hate hate hate it. However, for now I want
 # to ensure you can still `pip install matrix-synapse[all]` like today. Two motivations:
@@ -261,6 +266,8 @@ all = [
     "txredisapi", "hiredis",
     # cache-memory
     "pympler",
+    # improved user search
+    "pyicu",
     # omitted:
     #   - test: it's useful to have this separate from dev deps in the olddeps job
     #   - systemd: this is a system-based requirement

@@ -0,0 +1,25 @@
+# Copyright 2022 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Stub for PyICU.
+
+class Locale:
+    @staticmethod
+    def getDefault() -> Locale: ...
+
+class BreakIterator:
+    @staticmethod
+    def createWordInstance(locale: Locale) -> BreakIterator: ...
+    def setText(self, text: str) -> None: ...
+    def nextBoundary(self) -> int: ...
@@ -26,6 +26,14 @@
     cast,
 )
 
+try:
+    # Figure out if ICU support is available for searching users.
+    import icu
+
+    USE_ICU = True
+except ModuleNotFoundError:
+    USE_ICU = False
+
 from typing_extensions import TypedDict
 
 from synapse.api.errors import StoreError
@@ -900,7 +908,7 @@ def _parse_query_sqlite(search_term: str) -> str:
     """
 
     # Pull out the individual words, discarding any non-word characters.
-    results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
+    results = _parse_words(search_term)
     return " & ".join("(%s* OR %s)" % (result, result) for result in results)
 
 
@@ -910,12 +918,63 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
     We use this so that we can add prefix matching, which isn't something
     that is supported by default.
     """
-
-    # Pull out the individual words, discarding any non-word characters.
-    results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
+    results = _parse_words(search_term)
 
     both = " & ".join("(%s:* | %s)" % (result, result) for result in results)
     exact = " & ".join("%s" % (result,) for result in results)
     prefix = " & ".join("%s:*" % (result,) for result in results)
 
     return both, exact, prefix
+
+
+def _parse_words(search_term: str) -> List[str]:
+    """Split the provided search string into a list of its words.
+
+    If support for ICU (International Components for Unicode) is available, use it.
+    Otherwise, fall back to using a regex to detect word boundaries. This latter
+    solution works well enough for most latin-based languages, but doesn't work as well
+    with other languages.
+
+    Args:
+        search_term: The search string.
+
+    Returns:
+        A list of the words in the search string.
+    """
+    if USE_ICU:
+        return _parse_words_with_icu(search_term)
+
+    return re.findall(r"([\w\-]+)", search_term, re.UNICODE)
+
+
+def _parse_words_with_icu(search_term: str) -> List[str]:
+    """Break down the provided search string into its individual words using ICU
+    (International Components for Unicode).
+
+    Args:
+        search_term: The search string.
+
+    Returns:
+        A list of the words in the search string.
+    """
+    results = []
+    breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault())
+    breaker.setText(search_term)
+    i = 0
+    while True:
+        j = breaker.nextBoundary()
+        if j < 0:
+            break
+
+        result = search_term[i:j]
+
+        # libicu considers spaces and punctuation between words as words, but we don't
+        # want to include those in results as they would result in syntax errors in SQL
+        # queries (e.g. "foo bar" would result in the search query including "foo &  &
+        # bar").
+        if len(re.findall(r"([\w\-]+)", result, re.UNICODE)):
+            results.append(result)
+
+        i = j
+
+    return results
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import re
 from typing import Any, Dict, Set, Tuple
 from unittest import mock
 from unittest.mock import Mock, patch
@@ -30,6 +31,12 @@
 from tests.test_utils.event_injection import inject_member_event
 from tests.unittest import HomeserverTestCase, override_config
 
+try:
+    import icu
+except ImportError:
+    icu = None  # type: ignore
+
+
 ALICE = "@alice:a"
 BOB = "@bob:b"
 BOBBY = "@bobby:a"
@@ -467,3 +474,39 @@ def test_search_user_dir_stop_words(self) -> None:
             r["results"][0],
             {"user_id": BELA, "display_name": "Bela", "avatar_url": None},
         )
+
+
+class UserDirectoryICUTestCase(HomeserverTestCase):
+    if not icu:
+        skip = "Requires PyICU"
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        self.store = hs.get_datastores().main
+        self.user_dir_helper = GetUserDirectoryTables(self.store)
+
+    def test_icu_word_boundary(self) -> None:
+        """Tests that we correctly detect word boundaries when ICU (International
+        Components for Unicode) support is available.
+        """
+
+        display_name = "Gáo"
+
+        # This word is not broken down correctly by Python's regular expressions,
+        # likely because á is actually a lowercase a followed by a U+0301 combining
+        # acute accent. This is specifically something that ICU support fixes.
+        matches = re.findall(r"([\w\-]+)", display_name, re.UNICODE)
+        self.assertEqual(len(matches), 2)
+
+        self.get_success(
+            self.store.update_profile_in_user_dir(ALICE, display_name, None)
+        )
+        self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE,)))
+
+        # Check that searching for this user yields the correct result.
+        r = self.get_success(self.store.search_user_dir(BOB, display_name, 10))
+        self.assertFalse(r["limited"])
+        self.assertEqual(len(r["results"]), 1)
+        self.assertDictEqual(
+            r["results"][0],
+            {"user_id": ALICE, "display_name": display_name, "avatar_url": None},
+        )