-
-
Notifications
You must be signed in to change notification settings - Fork 717
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3610 from lonvia/search-preprocessing
Add configurable query preprocessing
- Loading branch information
Showing
13 changed files
with
302 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
query-preprocessing: | ||
- step: normalize | ||
normalization: | ||
- ":: lower ()" | ||
- ":: Hans-Hant" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# SPDX-License-Identifier: GPL-3.0-or-later | ||
# | ||
# This file is part of Nominatim. (https://nominatim.org) | ||
# | ||
# Copyright (C) 2024 by the Nominatim developer community. | ||
# For a full list of authors see the git log. | ||
""" | ||
Common data types and protocols for preprocessing. | ||
""" | ||
from typing import List, Callable | ||
|
||
from ..typing import Protocol | ||
from ..search import query as qmod | ||
from .config import QueryConfig | ||
|
||
QueryProcessingFunc = Callable[[List[qmod.Phrase]], List[qmod.Phrase]] | ||
|
||
|
||
class QueryHandler(Protocol):
    """ Protocol for query preprocessing modules.

        An implementation provides a `create()` factory which returns
        the function that does the actual preprocessing.
    """
    def create(self, config: QueryConfig) -> QueryProcessingFunc:
        """
        Create a function for preprocessing the phrases of a query.

        Arguments:
            config: A dictionary with the additional configuration options
                    specified in the tokenizer configuration.

        Return:
            A function that takes the list of phrases of a query and
            returns the list of phrases as modified by the preprocessor.
        """
        pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# SPDX-License-Identifier: GPL-3.0-or-later | ||
# | ||
# This file is part of Nominatim. (https://nominatim.org) | ||
# | ||
# Copyright (C) 2024 by the Nominatim developer community. | ||
# For a full list of authors see the git log. | ||
""" | ||
Configuration for query preprocessors. | ||
""" | ||
from typing import Any, TYPE_CHECKING | ||
from collections import UserDict | ||
|
||
# Work around collections.UserDict not being subscriptable at runtime
# in Python < 3.9: only the type checker gets to see the generic form.
# See https://github.com/python/typing/issues/60#issuecomment-869757075
if TYPE_CHECKING:
    _BaseUserDict = UserDict[str, Any]
else:
    _BaseUserDict = UserDict
|
||
|
||
class QueryConfig(_BaseUserDict):
    """ Dictionary with the configuration options for a query
        preprocessor.

        The configuration is meant to be read-only for the preprocessors.
        This is not enforced: the class behaves like a normal dictionary.
        In addition to the usual dictionary functions, the class provides
        helpers for standard options that are shared between preprocessors.
    """

    def set_normalizer(self, normalizer: Any) -> 'QueryConfig':
        """ Set the normalizer to be used.

            The normalizer is saved under the key `_normalizer`. The
            `normalize` preprocessor calls its `transliterate()` method.

            Returns the configuration itself, so that calls can be chained.
        """
        self['_normalizer'] = normalizer

        return self
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# SPDX-License-Identifier: GPL-3.0-or-later | ||
# | ||
# This file is part of Nominatim. (https://nominatim.org) | ||
# | ||
# Copyright (C) 2024 by the Nominatim developer community. | ||
# For a full list of authors see the git log. | ||
""" | ||
Normalize query text using the same ICU normalization rules that are | ||
applied during import. If a phrase becomes empty because the normalization | ||
removes all terms, then the phrase is deleted. | ||
This preprocessor does not take any options of its own. Instead it will | ||
use the configuration from the `normalization` section. | ||
""" | ||
from typing import cast | ||
|
||
from .config import QueryConfig | ||
from .base import QueryProcessingFunc | ||
from ..search.query import Phrase | ||
|
||
|
||
def create(config: QueryConfig) -> QueryProcessingFunc:
    """ Create a preprocessing function that normalizes the text of
        each phrase.

        The normalizer is taken from the `_normalizer` entry of the
        configuration. When none is set, the created function returns
        the phrases unchanged. Otherwise the text of every phrase is run
        through `normalizer.transliterate()` and phrases whose text ends
        up empty are dropped from the result.
    """
    normalizer = config.get('_normalizer')

    if not normalizer:
        # Nothing to normalize with: hand the phrases back untouched.
        return lambda p: p

    # Rebuild each phrase with normalized text, then keep only the
    # phrases that still have text.
    return lambda phrases: [
        phrase for phrase in (
            Phrase(p.ptype, cast(str, normalizer.transliterate(p.text)))
            for p in phrases)
        if phrase.text]
Oops, something went wrong.