Skip to content

Commit

Permalink
generalize filter for sanitizers
Browse files Browse the repository at this point in the history
  • Loading branch information
biswajit-k committed Mar 11, 2023
1 parent a8bedb6 commit 6f6f637
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 77 deletions.
13 changes: 3 additions & 10 deletions nominatim/tokenizer/sanitizers/clean_housenumbers.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
expression that must match the full house number value.
"""
from typing import Callable, Iterator, List
import re

from nominatim.tokenizer.sanitizers.base import ProcessInfo
from nominatim.data.place_name import PlaceName
Expand All @@ -34,12 +33,10 @@
class _HousenumberSanitizer:

def __init__(self, config: SanitizerConfig) -> None:
self.filter_kind = config.get_filter_kind('housenumber')
self.filter_kind = config.get_filter('filter-kind', ['housenumber'])
self.split_regexp = config.get_delimiter()

nameregexps = config.get_string_list('convert-to-name', [])
self.is_name_regexp = [re.compile(r) for r in nameregexps]

self.filter_name = config.get_filter('convert-to-name', [])


def __call__(self, obj: ProcessInfo) -> None:
Expand All @@ -49,7 +46,7 @@ def __call__(self, obj: ProcessInfo) -> None:
new_address: List[PlaceName] = []
for item in obj.address:
if self.filter_kind(item.kind):
if self._treat_as_name(item.name):
if self.filter_name(item.name):
obj.names.append(item.clone(kind='housenumber'))
else:
new_address.extend(item.clone(kind='housenumber', name=n)
Expand All @@ -76,10 +73,6 @@ def _regularize(self, hnr: str) -> Iterator[str]:
yield hnr


def _treat_as_name(self, housenumber: str) -> bool:
return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)


def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a housenumber processing function.
"""
Expand Down
53 changes: 35 additions & 18 deletions nominatim/tokenizer/sanitizers/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,11 @@ def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Seque
Arguments:
param: Name of the configuration parameter.
default: Value to return, when the parameter is missing.
default: Takes a tuple or list of strings which will
be returned if the parameter is missing in the
sanitizer configuration.
Note that if this default parameter is not
provided then an empty list is returned.
Returns:
If the parameter value is a simple string, it is returned as a
Expand All @@ -44,7 +48,7 @@ def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Seque
values = self.data.get(param, None)

if values is None:
return None if default is None else list(default)
return list(default)

if isinstance(values, str):
return [values] if values else []
Expand Down Expand Up @@ -102,30 +106,43 @@ def get_delimiter(self, default: str = ',;') -> Pattern[str]:
return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))


def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
""" Return a filter function for the name kind from the 'filter-kind'
config parameter.
def get_filter(self, param: str, default: Optional[Sequence[str]] = None
) -> Callable[[Optional[str]], bool]:
""" Returns a filter function for the given parameter of the sanitizer
configuration.
If the 'filter-kind' parameter is empty, the filter lets all items
pass. If the parameter is a string, it is interpreted as a single
regular expression that must match the full kind string.
If the parameter is a list then
any of the regular expressions in the list must match to pass.
The value provided for the parameter in sanitizer configuration
should be a string or list of strings, where each string is a regular
expression. These regular expressions will later be used by the
filter function to filter strings.
Arguments:
default: Filters to be used, when the 'filter-kind' parameter
is not specified. If omitted then the default is to
let all names pass.
param: The parameter for which the filter function
will be created.
default: Takes a list of strings where each string
is a regular expression. These regular expressions will
be used by the filter function in case the given parameter
is missing in the sanitizer configuration.
Note that if this default is not provided, the filter function
lets all target strings pass, including None type.
Returns:
A filter function which takes a name string and returns
True when the item passes the filter.
A filter function that takes a target string as the argument and
returns True if it fully matches any of the regular expressions
otherwise returns False.
Note that if instead of target string, None type is provided then
it simply returns False.
"""
filters = self.get_string_list('filter-kind', default)
values = self.data.get(param, None)
filters = self.get_string_list(param) if values is not None else default

if not filters:
if filters is None:
return lambda _: True

regexes = [re.compile(regex) for regex in filters]

return lambda name: any(regex.fullmatch(name) for regex in regexes)
return (
lambda target: any(regex.fullmatch(target) for regex in regexes)
if target is not None
else False
)
50 changes: 19 additions & 31 deletions nominatim/tokenizer/sanitizers/delete_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,7 @@
"""
from typing import Callable, List, Optional, Pattern, Tuple, Sequence
import re
from typing import Callable, List, Tuple, Sequence

from nominatim.tokenizer.sanitizers.base import ProcessInfo
from nominatim.data.place_name import PlaceName
Expand All @@ -65,37 +64,37 @@ class _TagSanitizer:

def __init__(self, config: SanitizerConfig) -> None:
self.type = config.get('type', 'name')
self.filter_kind = config.get_filter_kind()
self.filter_kind = config.get_filter('filter-kind')
self.country_codes = config.get_string_list('country_code', [])
self.allowed_ranks = self._set_allowed_ranks( \
config.get_string_list('rank_address', ['0-30']))
self.filter_suffix = config.get_filter('suffix')
self.filter_name = config.get_filter('name')
self.allowed_ranks = self._set_allowed_ranks(
config.get_string_list("rank_address", ["0-30"])
)

self.has_country_code = config.get('country_code', None) is not None

suffixregexps = config.get_string_list('suffix', [r'[\s\S]*'])
self.suffix_regexp = [re.compile(r) for r in suffixregexps]

nameregexps = config.get_string_list('name', [r'[\s\S]*'])
self.name_regexp = [re.compile(r) for r in nameregexps]



def __call__(self, obj: ProcessInfo) -> None:
tags = obj.names if self.type == 'name' else obj.address

if (not tags or
self.has_country_code and
obj.place.country_code not in self.country_codes or
not self.allowed_ranks[obj.place.rank_address]):
if (
not tags
or not self.allowed_ranks[obj.place.rank_address]
or self.has_country_code
and obj.place.country_code not in self.country_codes
):
return

filtered_tags: List[PlaceName] = []

for tag in tags:

if (not self.filter_kind(tag.kind) or
not self._matches(tag.suffix, self.suffix_regexp) or
not self._matches(tag.name, self.name_regexp)):
if (
not self.filter_kind(tag.kind)
or not self.filter_suffix(tag.suffix)
or not self.filter_name(tag.name)
):
filtered_tags.append(tag)


Expand All @@ -117,7 +116,7 @@ def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:
for rank in ranks:
intvl = [int(x) for x in rank.split('-')]

start, end = (intvl[0], intvl[0]) if len(intvl) == 1 else (intvl[0], intvl[1])
start, end = intvl[0], intvl[0] if len(intvl) == 1 else intvl[1]

for i in range(start, end + 1):
allowed_ranks[i] = True
Expand All @@ -126,17 +125,6 @@ def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:
return tuple(allowed_ranks)


def _matches(self, value: Optional[str], patterns: List[Pattern[str]]) -> bool:
""" Returns True if the given value fully matches any of the regular
expression pattern in the list. Otherwise, returns False.
Note that if the value is None, it is taken as an empty string.
"""
target = '' if value is None else value
return any(r.fullmatch(target) is not None for r in patterns)



def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function to process removal of certain tags.
"""
Expand Down
2 changes: 1 addition & 1 deletion nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class _AnalyzerByLanguage:
"""

def __init__(self, config: SanitizerConfig) -> None:
self.filter_kind = config.get_filter_kind()
self.filter_kind = config.get_filter('filter-kind')
self.replace = config.get('mode', 'replace') != 'append'
self.whitelist = config.get('whitelist')

Expand Down
6 changes: 3 additions & 3 deletions test/python/tokenizer/sanitizers/test_delete_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ def run_sanitizer_on(self, country_code, rank_addr, suffix, **kwargs):

def test_string_arguments_pass(self):
res = self.run_sanitizer_on('de', '25-30', r'[\s\S]*',
name='foo', ref='foo', name_abc='bar', ref_abc='baz')
name_xyz='foo', ref_pqr='foo', name_abc='bar', ref_abc='baz')

assert res == []

Expand All @@ -302,7 +302,7 @@ def test_string_arguments_fail(self):

def test_list_arguments_pass(self):
res = self.run_sanitizer_on(['de', 'in'], ['20-28', '30'], [r'abc.*', r'[\s\S]*'],
name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')
name_xyz='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')

assert res == []

Expand All @@ -315,7 +315,7 @@ def test_list_arguments_fail(self):

def test_mix_arguments_pass(self):
res = self.run_sanitizer_on('de', ['10', '20-28', '30'], r'[\s\S]*',
name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')
name_abc='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')

assert res == []

Expand Down
49 changes: 35 additions & 14 deletions test/python/tokenizer/sanitizers/test_sanitizer_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@ def test_string_list_default_empty():
assert SanitizerConfig().get_string_list('op') == []


def test_string_list_default_none():
assert SanitizerConfig().get_string_list('op', default=None) is None


def test_string_list_default_something():
assert SanitizerConfig().get_string_list('op', default=['a', 'b']) == ['a', 'b']

Expand Down Expand Up @@ -78,36 +74,61 @@ def test_create_split_regex_empty_delimiter():
regex = SanitizerConfig({'delimiters': ''}).get_delimiter()


@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*'))
def test_create_kind_filter_no_params(inp):
filt = SanitizerConfig().get_filter_kind()
@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*', None))
def test_create_name_filter_no_param_no_default(inp):
filt = SanitizerConfig({'filter-kind': 'place'}).get_filter('name')

assert filt(inp)


@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
def test_create_kind_filter_default_positive(kind):
filt = SanitizerConfig({'name': 'abc'}).get_filter('filter-kind', ['.*de'])

assert filt(kind)

@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende', None))
def test_create_kind_filter_default_negetive(kind):
filt = SanitizerConfig().get_filter('filter-kind', ['.*fr'])

assert not filt(kind)


@pytest.mark.parametrize("kind", ("lang", "lang:de", "langxx"))
def test_create_kind_filter_custom_regex_positive(kind):
filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
filt = SanitizerConfig(
{
"name": "abc",
"filter-kind": "lang.*",
}
).get_filter("filter-kind", [".*fr"])

assert filt(kind)


@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
@pytest.mark.parametrize("kind", ("de ", "123", "", "bedece", None))
def test_create_kind_filter_custom_regex_negative(kind):
filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
filt = SanitizerConfig({"filter-kind": ".*de"}).get_filter("filter-kind")

assert not filt(kind)


@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
@pytest.mark.parametrize("kind", ("name", "fr", "name:fr", "frfr", "34"))
def test_create_kind_filter_many_positive(kind):
filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
filt = SanitizerConfig(
{
"name": "abc",
"filter-kind": [".*fr", "name", r"\d+"],
}
).get_filter("filter-kind")

assert filt(kind)


@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
@pytest.mark.parametrize("kind", ("name:de", "fridge", "a34", ".*", "\\", None))
def test_create_kind_filter_many_negative(kind):
filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
filt = SanitizerConfig({"filter-kind": [".*fr", "name", r"\d+"]}
).get_filter("filter-kind", [r"[\s\S]*"])


assert not filt(kind)

0 comments on commit 6f6f637

Please sign in to comment.