Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Add optional ICU support for user search #14464

Merged
merged 12 commits into from
Dec 12, 2022
Merged
1 change: 1 addition & 0 deletions changelog.d/14464.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improve user search for international display names.
7 changes: 7 additions & 0 deletions debian/changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
matrix-synapse-py3 (1.74.0~rc1) UNRELEASED; urgency=medium

* New dependency on libicu-dev to provide improved results for user
search.

-- Synapse Packaging team <[email protected]> Tue, 06 Dec 2022 15:28:10 +0000

matrix-synapse-py3 (1.73.0) stable; urgency=medium

* New Synapse release 1.73.0.
Expand Down
2 changes: 2 additions & 0 deletions debian/control
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ Build-Depends:
dh-virtualenv (>= 1.1),
libsystemd-dev,
libpq-dev,
libicu-dev,
pkg-config,
lsb-release,
python3-dev,
python3,
Expand Down
2 changes: 2 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ RUN \
zlib1g-dev \
git \
curl \
libicu-dev \
pkg-config \
&& rm -rf /var/lib/apt/lists/*


Expand Down
2 changes: 2 additions & 0 deletions docker/Dockerfile-dhvirtualenv
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ RUN apt-get update -qq -o Acquire::Languages=none \
python3-venv \
sqlite3 \
libpq-dev \
libicu-dev \
pkg-config \
xmlsec1

# Install rust and ensure it's in the PATH
Expand Down
16 changes: 14 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ hiredis = { version = "*", optional = true }
Pympler = { version = "*", optional = true }
parameterized = { version = ">=0.7.4", optional = true }
idna = { version = ">=2.5", optional = true }
pyicu = { version = ">=2.10.2", optional = true }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that this is the latest version of the package, which can be a point of friction for downstream packagers.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I might have just used whatever I was using at the time I implemented this change (there doesn't seem to be any recent fix to packaging or typing on that package which would justify otherwise) so it should be fine to relax this constraint if necessary.


[tool.poetry.extras]
# NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified
Expand All @@ -230,6 +231,10 @@ redis = ["txredisapi", "hiredis"]
# Required to use experimental `caches.track_memory_usage` config option.
cache-memory = ["pympler"]
test = ["parameterized", "idna"]
# Allows for better search for international characters in the user directory. This
# requires libicu's development headers installed on the system (e.g. libicu-dev on
# Debian-based distributions).
user-search = ["pyicu"]

# The duplication here is awful. I hate hate hate hate hate it. However, for now I want
# to ensure you can still `pip install matrix-synapse[all]` like today. Two motivations:
Expand Down Expand Up @@ -261,6 +266,8 @@ all = [
"txredisapi", "hiredis",
# cache-memory
"pympler",
# improved user search
"pyicu",
# omitted:
# - test: it's useful to have this separate from dev deps in the olddeps job
# - systemd: this is a system-based requirement
Expand Down
25 changes: 25 additions & 0 deletions stubs/icu.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2022 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Stub for PyICU.

class Locale:
@staticmethod
def getDefault() -> Locale: ...

class BreakIterator:
@staticmethod
def createWordInstance(locale: Locale) -> BreakIterator: ...
def setText(self, text: str) -> None: ...
def nextBoundary(self) -> int: ...
67 changes: 63 additions & 4 deletions synapse/storage/databases/main/user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@
cast,
)

try:
# Figure out if ICU support is available for searching users.
import icu

USE_ICU = True
except ModuleNotFoundError:
USE_ICU = False

from typing_extensions import TypedDict

from synapse.api.errors import StoreError
Expand Down Expand Up @@ -900,7 +908,7 @@ def _parse_query_sqlite(search_term: str) -> str:
"""

# Pull out the individual words, discarding any non-word characters.
results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
results = _parse_words(search_term)
return " & ".join("(%s* OR %s)" % (result, result) for result in results)


Expand All @@ -910,12 +918,63 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
We use this so that we can add prefix matching, which isn't something
that is supported by default.
"""

# Pull out the individual words, discarding any non-word characters.
results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
results = _parse_words(search_term)

both = " & ".join("(%s:* | %s)" % (result, result) for result in results)
exact = " & ".join("%s" % (result,) for result in results)
prefix = " & ".join("%s:*" % (result,) for result in results)

return both, exact, prefix


def _parse_words(search_term: str) -> List[str]:
"""Split the provided search string into a list of its words.

If support for ICU (International Components for Unicode) is available, use it.
Otherwise, fall back to using a regex to detect word boundaries. This latter
solution works well enough for most latin-based languages, but doesn't work as well
with other languages.

Args:
search_term: The search string.

Returns:
A list of the words in the search string.
"""
if USE_ICU:
return _parse_words_with_icu(search_term)

return re.findall(r"([\w\-]+)", search_term, re.UNICODE)


def _parse_words_with_icu(search_term: str) -> List[str]:
"""Break down the provided search string into its individual words using ICU
(International Components for Unicode).

Args:
search_term: The search string.

Returns:
A list of the words in the search string.
"""
results = []
breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault())
breaker.setText(search_term)
i = 0
while True:
j = breaker.nextBoundary()
if j < 0:
break

result = search_term[i:j]

# libicu considers spaces and punctuation between words as words, but we don't
# want to include those in results as they would result in syntax errors in SQL
# queries (e.g. "foo bar" would result in the search query including "foo & &
# bar").
if len(re.findall(r"([\w\-]+)", result, re.UNICODE)):
results.append(result)
Comment on lines +971 to +976
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This'll consider - a word. Is that intentional?

Copy link
Contributor

@squahtx squahtx Dec 8, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nevermind, I realise it's taken from the previous code, so we've probably not regressed anything here.

Copy link
Contributor

@squahtx squahtx Dec 8, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When handling a word with a hyphen, postgres constructs the tsvector as follows:

synapse=# SELECT to_tsvector('simple', 'co-operative');
              to_tsvector
---------------------------------------
 'co':2 'co-operative':1 'operative':3
(1 row)

While we search for the following words:

>>> _parse_words_with_icu("co-operative")
['co', '-', 'operative']

Which to_tsquery interprets as:

synapse=# SELECT to_tsquery('simple', 'co & - & operative');
     to_tsquery
--------------------
 'co' & 'operative'
(1 row)

which should match. I'm not sure what happens on sqlite, but I'm less concerned there.

Note that previously, we'd search for:

synapse=# SELECT to_tsquery('simple', 'co-operative');
             to_tsquery
-------------------------------------
 'co-operative' & 'co' & 'operative'
(1 row)


i = j

return results
43 changes: 43 additions & 0 deletions tests/storage/test_user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import Any, Dict, Set, Tuple
from unittest import mock
from unittest.mock import Mock, patch
Expand All @@ -30,6 +31,12 @@
from tests.test_utils.event_injection import inject_member_event
from tests.unittest import HomeserverTestCase, override_config

try:
import icu
except ImportError:
icu = None # type: ignore


ALICE = "@alice:a"
BOB = "@bob:b"
BOBBY = "@bobby:a"
Expand Down Expand Up @@ -467,3 +474,39 @@ def test_search_user_dir_stop_words(self) -> None:
r["results"][0],
{"user_id": BELA, "display_name": "Bela", "avatar_url": None},
)


class UserDirectoryICUTestCase(HomeserverTestCase):
if not icu:
skip = "Requires PyICU"

def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
self.store = hs.get_datastores().main
self.user_dir_helper = GetUserDirectoryTables(self.store)

def test_icu_word_boundary(self) -> None:
"""Tests that we correctly detect word boundaries when ICU (International
Components for Unicode) support is available.
"""

display_name = "Gáo"

# This word is not broken down correctly by Python's regular expressions,
# likely because á is actually a lowercase a followed by a U+0301 combining
# acute accent. This is specifically something that ICU support fixes.
matches = re.findall(r"([\w\-]+)", display_name, re.UNICODE)
self.assertEqual(len(matches), 2)

self.get_success(
self.store.update_profile_in_user_dir(ALICE, display_name, None)
)
self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE,)))

# Check that searching for this user yields the correct result.
r = self.get_success(self.store.search_user_dir(BOB, display_name, 10))
self.assertFalse(r["limited"])
self.assertEqual(len(r["results"]), 1)
self.assertDictEqual(
r["results"][0],
{"user_id": ALICE, "display_name": display_name, "avatar_url": None},
)