Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Revert "Move glob_to_regex and re_word_boundary to matrix-python-common (#11505) #11527

Merged
merged 3 commits into from
Dec 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/11527.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Temporarily revert usage of `matrix-python-common`.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we nuke both changelogs after this merges so we don't add anything to the changes doc?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, I'll be cutting them out of the changes doc after.

3 changes: 1 addition & 2 deletions synapse/config/room_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@

from typing import List

from matrix_common.regex import glob_to_regex

from synapse.types import JsonDict
from synapse.util import glob_to_regex

from ._base import Config, ConfigError

Expand Down
3 changes: 1 addition & 2 deletions synapse/config/tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@
import os
from typing import List, Optional, Pattern

from matrix_common.regex import glob_to_regex

from OpenSSL import SSL, crypto
from twisted.internet._sslverify import Certificate, trustRootFromCertificates

from synapse.config._base import Config, ConfigError
from synapse.util import glob_to_regex

logger = logging.getLogger(__name__)

Expand Down
3 changes: 1 addition & 2 deletions synapse/federation/federation_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
Union,
)

from matrix_common.regex import glob_to_regex
from prometheus_client import Counter, Gauge, Histogram

from twisted.internet import defer
Expand Down Expand Up @@ -67,7 +66,7 @@
)
from synapse.storage.databases.main.lock import Lock
from synapse.types import JsonDict, get_domain_from_id
from synapse.util import json_decoder, unwrapFirstError
from synapse.util import glob_to_regex, json_decoder, unwrapFirstError
from synapse.util.async_helpers import Linearizer, concurrently_execute
from synapse.util.caches.response_cache import ResponseCache
from synapse.util.stringutils import parse_server_name
Expand Down
7 changes: 3 additions & 4 deletions synapse/push/push_rule_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,9 @@
import re
from typing import Any, Dict, List, Optional, Pattern, Tuple, Union

from matrix_common.regex import glob_to_regex, to_word_pattern

from synapse.events import EventBase
from synapse.types import JsonDict, UserID
from synapse.util import glob_to_regex, re_word_boundary
from synapse.util.caches.lrucache import LruCache

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -185,7 +184,7 @@ def _contains_display_name(self, display_name: Optional[str]) -> bool:
r = regex_cache.get((display_name, False, True), None)
if not r:
r1 = re.escape(display_name)
r1 = to_word_pattern(r1)
r1 = re_word_boundary(r1)
r = re.compile(r1, flags=re.IGNORECASE)
regex_cache[(display_name, False, True)] = r

Expand Down Expand Up @@ -214,7 +213,7 @@ def _glob_matches(glob: str, value: str, word_boundary: bool = False) -> bool:
try:
r = regex_cache.get((glob, True, word_boundary), None)
if not r:
r = glob_to_regex(glob, word_boundary=word_boundary)
r = glob_to_regex(glob, word_boundary)
regex_cache[(glob, True, word_boundary)] = r
return bool(r.search(value))
except re.error:
Expand Down
1 change: 0 additions & 1 deletion synapse/python_dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@
# with the latest security patches.
"cryptography>=3.4.7",
"ijson>=3.1",
"matrix-common==1.0.0",
]

CONDITIONAL_REQUIREMENTS = {
Expand Down
59 changes: 58 additions & 1 deletion synapse/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@

import json
import logging
import re
import typing
from typing import Any, Callable, Dict, Generator, Optional
from typing import Any, Callable, Dict, Generator, Optional, Pattern

import attr
from frozendict import frozendict
Expand All @@ -34,6 +35,9 @@
logger = logging.getLogger(__name__)


_WILDCARD_RUN = re.compile(r"([\?\*]+)")


def _reject_invalid_json(val: Any) -> None:
"""Do not allow Infinity, -Infinity, or NaN values in JSON."""
raise ValueError("Invalid JSON value: '%s'" % val)
Expand Down Expand Up @@ -181,3 +185,56 @@ def log_failure(
if not consumeErrors:
return failure
return None


def glob_to_regex(glob: str, word_boundary: bool = False) -> Pattern:
"""Converts a glob to a compiled regex object.

Args:
glob: pattern to match
word_boundary: If True, the pattern will be allowed to match at word boundaries
anywhere in the string. Otherwise, the pattern is anchored at the start and
end of the string.

Returns:
compiled regex pattern
"""

# Patterns with wildcards must be simplified to avoid performance cliffs
# - The glob `?**?**?` is equivalent to the glob `???*`
# - The glob `???*` is equivalent to the regex `.{3,}`
chunks = []
for chunk in _WILDCARD_RUN.split(glob):
# No wildcards? re.escape()
if not _WILDCARD_RUN.match(chunk):
chunks.append(re.escape(chunk))
continue

# Wildcards? Simplify.
qmarks = chunk.count("?")
if "*" in chunk:
chunks.append(".{%d,}" % qmarks)
else:
chunks.append(".{%d}" % qmarks)

res = "".join(chunks)

if word_boundary:
res = re_word_boundary(res)
else:
# \A anchors at start of string, \Z at end of string
res = r"\A" + res + r"\Z"

return re.compile(res, re.IGNORECASE)


def re_word_boundary(r: str) -> str:
"""
Adds word boundary characters to the start and end of an
expression to require that the match occur as a whole word,
but do so respecting the fact that strings starting or ending
with non-word characters will change word boundaries.
"""
# we can't use \b as it chokes on unicode. however \W seems to be okay
# as shorthand for [^0-9A-Za-z_].
return r"(^|\W)%s(\W|$)" % (r,)
59 changes: 59 additions & 0 deletions tests/util/test_glob_to_regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Copyright 2021 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from synapse.util import glob_to_regex

from tests.unittest import TestCase


class GlobToRegexTestCase(TestCase):
def test_literal_match(self):
"""patterns without wildcards should match"""
pat = glob_to_regex("foobaz")
self.assertTrue(
pat.match("FoobaZ"), "patterns should match and be case-insensitive"
)
self.assertFalse(
pat.match("x foobaz"), "pattern should not match at word boundaries"
)

def test_wildcard_match(self):
pat = glob_to_regex("f?o*baz")

self.assertTrue(
pat.match("FoobarbaZ"),
"* should match string and pattern should be case-insensitive",
)
self.assertTrue(pat.match("foobaz"), "* should match 0 characters")
self.assertFalse(pat.match("fooxaz"), "the character after * must match")
self.assertFalse(pat.match("fobbaz"), "? should not match 0 characters")
self.assertFalse(pat.match("fiiobaz"), "? should not match 2 characters")

def test_multi_wildcard(self):
"""patterns with multiple wildcards in a row should match"""
pat = glob_to_regex("**baz")
self.assertTrue(pat.match("agsgsbaz"), "** should match any string")
self.assertTrue(pat.match("baz"), "** should match the empty string")
self.assertEqual(pat.pattern, r"\A.{0,}baz\Z")

pat = glob_to_regex("*?baz")
self.assertTrue(pat.match("agsgsbaz"), "*? should match any string")
self.assertTrue(pat.match("abaz"), "*? should match a single char")
self.assertFalse(pat.match("baz"), "*? should not match the empty string")
self.assertEqual(pat.pattern, r"\A.{1,}baz\Z")

pat = glob_to_regex("a?*?*?baz")
self.assertTrue(pat.match("a g baz"), "?*?*? should match 3 chars")
self.assertFalse(pat.match("a..baz"), "?*?*? should not match 2 chars")
self.assertTrue(pat.match("a.gg.baz"), "?*?*? should match 4 chars")
self.assertEqual(pat.pattern, r"\Aa.{3,}baz\Z")