generalize filter for sanitizers

osm-search · Mar 11, 2023 · 6f6f637 · 6f6f637
1 parent a8bedb6
commit 6f6f637
Show file tree

Hide file tree

Showing 6 changed files with 96 additions and 77 deletions.
diff --git a/nominatim/tokenizer/sanitizers/clean_housenumbers.py b/nominatim/tokenizer/sanitizers/clean_housenumbers.py
@@ -25,7 +25,6 @@
                      expression that must match the full house number value.
 """
 from typing import Callable, Iterator, List
-import re
 
 from nominatim.tokenizer.sanitizers.base import ProcessInfo
 from nominatim.data.place_name import PlaceName
@@ -34,12 +33,10 @@
 class _HousenumberSanitizer:
 
     def __init__(self, config: SanitizerConfig) -> None:
-        self.filter_kind = config.get_filter_kind('housenumber')
+        self.filter_kind = config.get_filter('filter-kind', ['housenumber'])
         self.split_regexp = config.get_delimiter()
 
-        nameregexps = config.get_string_list('convert-to-name', [])
-        self.is_name_regexp = [re.compile(r) for r in nameregexps]
-
+        self.filter_name = config.get_filter('convert-to-name', [])
 
 
     def __call__(self, obj: ProcessInfo) -> None:
@@ -49,7 +46,7 @@ def __call__(self, obj: ProcessInfo) -> None:
         new_address: List[PlaceName] = []
         for item in obj.address:
             if self.filter_kind(item.kind):
-                if self._treat_as_name(item.name):
+                if self.filter_name(item.name):
                     obj.names.append(item.clone(kind='housenumber'))
                 else:
                     new_address.extend(item.clone(kind='housenumber', name=n)
@@ -76,10 +73,6 @@ def _regularize(self, hnr: str) -> Iterator[str]:
         yield hnr
 
 
-    def _treat_as_name(self, housenumber: str) -> bool:
-        return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)
-
-
 def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
     """ Create a housenumber processing function.
     """

diff --git a/nominatim/tokenizer/sanitizers/config.py b/nominatim/tokenizer/sanitizers/config.py
@@ -33,7 +33,11 @@ def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Seque
 
             Arguments:
                 param: Name of the configuration parameter.
-                default: Value to return, when the parameter is missing.
+                default: Takes a tuple or list of strings which will
+                         be returned if the parameter is missing in the
+                         sanitizer configuration.
+                         Note that if this default parameter is not
+                         provided then an empty list is returned.
 
             Returns:
                 If the parameter value is a simple string, it is returned as a
@@ -44,7 +48,7 @@ def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Seque
         values = self.data.get(param, None)
 
         if values is None:
-            return None if default is None else list(default)
+            return list(default)
 
         if isinstance(values, str):
             return [values] if values else []
@@ -102,30 +106,43 @@ def get_delimiter(self, default: str = ',;') -> Pattern[str]:
         return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
 
 
-    def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
-        """ Return a filter function for the name kind from the 'filter-kind'
-            config parameter.
+    def get_filter(self, param: str, default: Optional[Sequence[str]] = None
+            ) -> Callable[[Optional[str]], bool]:
+        """ Returns a filter function for the given parameter of the sanitizer
+            configuration.
 
-            If the 'filter-kind' parameter is empty, the filter lets all items
-            pass. If the parameter is a string, it is interpreted as a single
-            regular expression that must match the full kind string.
-            If the parameter is a list then
-            any of the regular expressions in the list must match to pass.
+            The value provided for the parameter in sanitizer configuration
+            should be a string or list of strings, where each string is a regular
+            expression. These regular expressions will later be used by the
+            filter function to filter strings.
 
             Arguments:
-                default: Filters to be used, when the 'filter-kind' parameter
-                         is not specified. If omitted then the default is to
-                         let all names pass.
+                param: The parameter for which the filter function
+                       will be created.
+                default: Takes a list of strings where each string
+                         is a regular expression.These regular expressions will
+                         be used by the filter function in case the given parameter
+                         is missing in the sanitizer configuration.
+                         Note that if this default is not provided, the filter function
+                         lets all target strings pass, including None type.
 
             Returns:
-                A filter function which takes a name string and returns
-                True when the item passes the filter.
+                A filter function that takes a target string as the argument and
+                returns True if the it fully matches any of the regular expressions
+                otherwise returns False.
+                Note that if instead of target string, None type is provided then
+                it simply returns False.
         """
-        filters = self.get_string_list('filter-kind', default)
+        values = self.data.get(param, None)
+        filters = self.get_string_list(param) if values is not None else default
 
-        if not filters:
+        if filters is None:
             return lambda _: True
 
         regexes = [re.compile(regex) for regex in filters]
 
-        return lambda name: any(regex.fullmatch(name) for regex in regexes)
+        return (
+            lambda target: any(regex.fullmatch(target) for regex in regexes)
+            if target is not None
+            else False
+        )
diff --git a/nominatim/tokenizer/sanitizers/delete_tags.py b/nominatim/tokenizer/sanitizers/delete_tags.py
@@ -54,8 +54,7 @@
 
 
 """
-from typing import Callable, List, Optional, Pattern, Tuple, Sequence
-import re
+from typing import Callable, List, Tuple, Sequence
 
 from nominatim.tokenizer.sanitizers.base import ProcessInfo
 from nominatim.data.place_name import PlaceName
@@ -65,37 +64,37 @@ class _TagSanitizer:
 
     def __init__(self, config: SanitizerConfig) -> None:
         self.type = config.get('type', 'name')
-        self.filter_kind = config.get_filter_kind()
+        self.filter_kind = config.get_filter('filter-kind')
         self.country_codes = config.get_string_list('country_code', [])
-        self.allowed_ranks = self._set_allowed_ranks( \
-                                            config.get_string_list('rank_address', ['0-30']))
+        self.filter_suffix = config.get_filter('suffix')
+        self.filter_name = config.get_filter('name')
+        self.allowed_ranks = self._set_allowed_ranks(
+            config.get_string_list("rank_address", ["0-30"])
+        )
 
         self.has_country_code = config.get('country_code', None) is not None
 
-        suffixregexps = config.get_string_list('suffix', [r'[\s\S]*'])
-        self.suffix_regexp = [re.compile(r) for r in suffixregexps]
-
-        nameregexps = config.get_string_list('name', [r'[\s\S]*'])
-        self.name_regexp = [re.compile(r) for r in nameregexps]
-
-
 
     def __call__(self, obj: ProcessInfo) -> None:
         tags = obj.names if self.type == 'name' else obj.address
 
-        if (not tags or
-             self.has_country_code and
-              obj.place.country_code not in self.country_codes or
-               not self.allowed_ranks[obj.place.rank_address]):
+        if (
+            not tags
+            or not self.allowed_ranks[obj.place.rank_address]
+            or self.has_country_code
+            and obj.place.country_code not in self.country_codes
+        ):
             return
 
         filtered_tags: List[PlaceName] = []
 
         for tag in tags:
 
-            if (not self.filter_kind(tag.kind) or
-                  not self._matches(tag.suffix, self.suffix_regexp) or
-                    not self._matches(tag.name, self.name_regexp)):
+            if (
+                not self.filter_kind(tag.kind)
+                or not self.filter_suffix(tag.suffix)
+                or not self.filter_name(tag.name)
+            ):
                 filtered_tags.append(tag)
 
 
@@ -117,7 +116,7 @@ def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:
         for rank in ranks:
             intvl = [int(x) for x in rank.split('-')]
 
-            start, end = (intvl[0], intvl[0]) if len(intvl) == 1 else (intvl[0], intvl[1])
+            start, end = intvl[0], intvl[0] if len(intvl) == 1 else intvl[1]
 
             for i in range(start, end + 1):
                 allowed_ranks[i] = True
@@ -126,17 +125,6 @@ def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:
         return tuple(allowed_ranks)
 
 
-    def _matches(self, value: Optional[str], patterns: List[Pattern[str]]) -> bool:
-        """ Returns True if the given value fully matches any of the regular
-            expression pattern in the list. Otherwise, returns False.
-
-            Note that if the value is None, it is taken as an empty string.
-        """
-        target = '' if value is None else value
-        return any(r.fullmatch(target) is not None for r in patterns)
-
-
-
 def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
     """ Create a function to process removal of certain tags.
     """

diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -41,7 +41,7 @@ class _AnalyzerByLanguage:
     """
 
     def __init__(self, config: SanitizerConfig) -> None:
-        self.filter_kind = config.get_filter_kind()
+        self.filter_kind = config.get_filter('filter-kind')
         self.replace = config.get('mode', 'replace') != 'append'
         self.whitelist = config.get('whitelist')
 

diff --git a/test/python/tokenizer/sanitizers/test_delete_tags.py b/test/python/tokenizer/sanitizers/test_delete_tags.py
@@ -289,7 +289,7 @@ def run_sanitizer_on(self, country_code, rank_addr, suffix, **kwargs):
 
     def test_string_arguments_pass(self):
         res = self.run_sanitizer_on('de', '25-30', r'[\s\S]*',
-                                    name='foo', ref='foo', name_abc='bar', ref_abc='baz')
+                                    name_xyz='foo', ref_pqr='foo', name_abc='bar', ref_abc='baz')
 
         assert res == []
 
@@ -302,7 +302,7 @@ def test_string_arguments_fail(self):
 
     def test_list_arguments_pass(self):
         res = self.run_sanitizer_on(['de', 'in'], ['20-28', '30'], [r'abc.*', r'[\s\S]*'],
-                                    name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')
+                                    name_xyz='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')
 
         assert res == []
 
@@ -315,7 +315,7 @@ def test_list_arguments_fail(self):
 
     def test_mix_arguments_pass(self):
         res = self.run_sanitizer_on('de', ['10', '20-28', '30'], r'[\s\S]*',
-                                    name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')
+                                    name_abc='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')
 
         assert res == []
 

diff --git a/test/python/tokenizer/sanitizers/test_sanitizer_config.py b/test/python/tokenizer/sanitizers/test_sanitizer_config.py
@@ -17,10 +17,6 @@ def test_string_list_default_empty():
     assert SanitizerConfig().get_string_list('op') == []
 
 
-def test_string_list_default_none():
-    assert SanitizerConfig().get_string_list('op', default=None) is None
-
-
 def test_string_list_default_something():
     assert SanitizerConfig().get_string_list('op', default=['a', 'b']) == ['a', 'b']
 
@@ -78,36 +74,61 @@ def test_create_split_regex_empty_delimiter():
         regex = SanitizerConfig({'delimiters': ''}).get_delimiter()
 
 
-@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*'))
-def test_create_kind_filter_no_params(inp):
-    filt = SanitizerConfig().get_filter_kind()
+@pytest.mark.parametrize('inp', ('name', 'name:de', 'na\\me', '.*', None))
+def test_create_name_filter_no_param_no_default(inp):
+    filt = SanitizerConfig({'filter-kind': 'place'}).get_filter('name')
 
     assert filt(inp)
 
 
 @pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
+def test_create_kind_filter_default_positive(kind):
+    filt = SanitizerConfig({'name': 'abc'}).get_filter('filter-kind', ['.*de'])
+
+    assert filt(kind)
+
+@pytest.mark.parametrize('kind', ('de', 'name:de', 'ende', None))
+def test_create_kind_filter_default_negetive(kind):
+    filt = SanitizerConfig().get_filter('filter-kind', ['.*fr'])
+
+    assert not filt(kind)
+
+
+@pytest.mark.parametrize("kind", ("lang", "lang:de", "langxx"))
 def test_create_kind_filter_custom_regex_positive(kind):
-    filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
+    filt = SanitizerConfig(
+        {
+            "name": "abc",
+            "filter-kind": "lang.*",
+        }
+    ).get_filter("filter-kind", [".*fr"])
 
     assert filt(kind)
 
 
-@pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
+@pytest.mark.parametrize("kind", ("de ", "123", "", "bedece", None))
 def test_create_kind_filter_custom_regex_negative(kind):
-    filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()
+    filt = SanitizerConfig({"filter-kind": ".*de"}).get_filter("filter-kind")
 
     assert not filt(kind)
 
 
-@pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
+@pytest.mark.parametrize("kind", ("name", "fr", "name:fr", "frfr", "34"))
 def test_create_kind_filter_many_positive(kind):
-    filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
+    filt = SanitizerConfig(
+        {
+            "name": "abc",
+            "filter-kind": [".*fr", "name", r"\d+"],
+        }
+    ).get_filter("filter-kind")
 
     assert filt(kind)
 
 
-@pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
+@pytest.mark.parametrize("kind", ("name:de", "fridge", "a34", ".*", "\\", None))
 def test_create_kind_filter_many_negative(kind):
-    filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()
+    filt = SanitizerConfig({"filter-kind": [".*fr", "name", r"\d+"]}
+    ).get_filter("filter-kind", [r"[\s\S]*"])
+
 
     assert not filt(kind)