From eb2a437a1a21fc80cd2f3168fb2f53a5e1176b67 Mon Sep 17 00:00:00 2001
From: Juanjo Diaz
Date: Wed, 15 May 2024 21:43:55 +0200
Subject: [PATCH 1/4] feat: simplify is_known function as an alias of DictionaryLookupStrategy

---
 simplemma/lemmatizer.py  | 34 ++++++++--------------------------
 tests/test_lemmatizer.py | 18 ++----------------
 2 files changed, 10 insertions(+), 42 deletions(-)

diff --git a/simplemma/lemmatizer.py b/simplemma/lemmatizer.py
index 18e55d1..4bae121 100644
--- a/simplemma/lemmatizer.py
+++ b/simplemma/lemmatizer.py
@@ -80,30 +80,6 @@ def __init__(
         self._fallback_lemmatization_strategy = fallback_lemmatization_strategy
         self._cached_lemmatize = lru_cache(maxsize=cache_max_size)(self._lemmatize)
 
-    def is_known(
-        self,
-        token: str,
-        lang: Union[str, Tuple[str, ...]],
-    ) -> bool:
-        """Check if a token is known in the specified language(s).
-
-        Args:
-            token: The token to check.
-            lang: The language or languages to check in.
-
-        Returns:
-            bool: True if the token is known, False otherwise.
-        """
-
-        _control_input_type(token)
-        lang = validate_lang_input(lang)
-
-        dictionary_lookup = DictionaryLookupStrategy()
-        return any(
-            dictionary_lookup.get_lemma(token, lang_code) is not None
-            for lang_code in lang
-        )
-
     def lemmatize(
         self,
         token: str,
@@ -191,8 +167,14 @@ def is_known(
     Returns:
         bool: True if the token is known, False otherwise.
     """
-    lemmatizer = _legacy_lemmatizer if not greedy else _legacy_greedy_lemmatizer
-    return lemmatizer.is_known(token, lang)
+
+    _control_input_type(token)
+    lang = validate_lang_input(lang)
+
+    dictionary_lookup = DictionaryLookupStrategy()
+    return any(
+        dictionary_lookup.get_lemma(token, lang_code) is not None for lang_code in lang
+    )
 
 
 def lemmatize(
diff --git a/tests/test_lemmatizer.py b/tests/test_lemmatizer.py
index fb27002..88a0899 100644
--- a/tests/test_lemmatizer.py
+++ b/tests/test_lemmatizer.py
@@ -452,27 +452,13 @@ def test_subwords() -> None:
 
 
 def test_is_known() -> None:
-    # logic
-    with pytest.raises(TypeError):
-        assert Lemmatizer().is_known(None, lang="en") is None  # type: ignore[arg-type]
     with pytest.raises(TypeError):
         assert is_known(None, lang="en") is None  # type: ignore[arg-type]
-    with pytest.raises(ValueError):
-        assert Lemmatizer().is_known("", lang="en") is None
     with pytest.raises(ValueError):
         assert is_known("", lang="en") is None
 
-    assert (
-        Lemmatizer().is_known("FanCY", lang="en")
-        == is_known("FanCY", lang="en")
-        == True
-    )
-    # known words
-    assert (
-        Lemmatizer().is_known("Fancy-String", lang="en")
-        == is_known("Fancy-String", lang="en")
-        == False
-    )
+    assert is_known("FanCY", lang="en") == True
+    assert is_known("Fancy-String", lang="en") == False
 
 
 def test_get_lemmas_in_text() -> None:

From 92c477a5395c8b7e782170ae272435f792b78f58 Mon Sep 17 00:00:00 2001
From: Juanjo Diaz
Date: Wed, 15 May 2024 21:45:30 +0200
Subject: [PATCH 2/4] fix: use _legacy_dictionary_factory in is_known check

---
 simplemma/lemmatizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simplemma/lemmatizer.py b/simplemma/lemmatizer.py
index 4bae121..91ad0b4 100644
--- a/simplemma/lemmatizer.py
+++ b/simplemma/lemmatizer.py
@@ -171,7 +171,7 @@ def is_known(
     _control_input_type(token)
     lang = validate_lang_input(lang)
 
-    dictionary_lookup = DictionaryLookupStrategy()
+    dictionary_lookup = DictionaryLookupStrategy(_legacy_dictionary_factory)
     return any(
         dictionary_lookup.get_lemma(token, lang_code) is not None for lang_code in lang
     )

From 126a3a4cea31f0b3c438b5ca32e882c2b2f478cf Mon Sep 17 00:00:00 2001
From: Juanjo Diaz
Date: Wed, 15 May 2024 21:47:05 +0200
Subject: [PATCH 3/4] fix: removed unused greedy option from is_known

---
 simplemma/lemmatizer.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/simplemma/lemmatizer.py b/simplemma/lemmatizer.py
index 91ad0b4..9355768 100644
--- a/simplemma/lemmatizer.py
+++ b/simplemma/lemmatizer.py
@@ -155,9 +155,7 @@ def get_lemmas_in_text(
     )
 
 
-def is_known(
-    token: str, lang: Union[str, Tuple[str, ...]], greedy: bool = False
-) -> bool:
+def is_known(token: str, lang: Union[str, Tuple[str, ...]]) -> bool:
     """Check if a token is known in the specified language(s).
 
     Args:

From a0bef86241ce1fb1e8d6b09bf33849f2430bbb46 Mon Sep 17 00:00:00 2001
From: Juanjo Diaz
Date: Thu, 16 May 2024 20:43:05 +0200
Subject: [PATCH 4/4] test: add multilingual test for is_known function

---
 tests/test_lemmatizer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_lemmatizer.py b/tests/test_lemmatizer.py
index 88a0899..caa26c1 100644
--- a/tests/test_lemmatizer.py
+++ b/tests/test_lemmatizer.py
@@ -460,6 +460,9 @@ def test_is_known() -> None:
     assert is_known("FanCY", lang="en") == True
     assert is_known("Fancy-String", lang="en") == False
 
+    assert is_known("espejos", lang=("es", "de")) == True
+    assert is_known("espejos", lang=("de", "es")) == True
+
 
 def test_get_lemmas_in_text() -> None:
     # text lemmatization