From e5f7aa141201211183e0b2f2eb5d25d4ccb194ba Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 26 Aug 2022 16:39:18 +0300 Subject: [PATCH] Use Lingua instead of pycld3 for language detection. Fixes #593 --- annif/transform/__init__.py | 2 +- annif/transform/langfilter.py | 14 ++++++++++---- pyproject.toml | 4 ++-- tests/test_transform_langfilter.py | 1 - 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/annif/transform/__init__.py b/annif/transform/__init__.py index 61b9a6c59..995fce172 100644 --- a/annif/transform/__init__.py +++ b/annif/transform/__init__.py @@ -48,4 +48,4 @@ def get_transform(transform_specs, project): _transforms.update({langfilter.LangFilter.name: langfilter.LangFilter}) except ImportError: annif.logger.debug( - "pycld3 not available, not enabling filter_language transform") + "Lingua not available, not enabling filter_language transform") diff --git a/annif/transform/langfilter.py b/annif/transform/langfilter.py index f63f6f97a..68f6ef9ec 100644 --- a/annif/transform/langfilter.py +++ b/annif/transform/langfilter.py @@ -2,7 +2,7 @@ different from the language of the project.""" import annif -import cld3 +import lingua from . import transform logger = annif.logger @@ -16,14 +16,20 @@ def __init__(self, project, text_min_length=500, sentence_min_length=50): super().__init__(project) self.text_min_length = int(text_min_length) self.sentence_min_length = int(sentence_min_length) + self.detector = ( + lingua.LanguageDetectorBuilder + .from_all_languages() + .with_low_accuracy_mode() + .build() + ) def _detect_language(self, text): """Tries to detect the language of a text input. Outputs a BCP-47-style language code (e.g. 'en').""" - lan_info = cld3.get_language(text) - if lan_info is not None and lan_info.is_reliable: - return lan_info.language + lan_info = self.detector.detect_language_of(text) + if lan_info is not None: + return lan_info.iso_code_639_1.name.lower() else: return None diff --git a/pyproject.toml b/pyproject.toml index 52e1d4a5d..a93c7e2e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ tensorflow-cpu = {version = "2.9.1", optional = true} lmdb = {version = "1.3.0", optional = true} omikuji = {version = "0.5.*", optional = true} yake = {version = "0.4.5", optional = true} -pycld3 = {version = "*", optional = true} +lingua-language-detector = {version = "1.1.1", optional = true} spacy = {version = "3.3.*", optional = true} [tool.poetry.dev-dependencies] @@ -79,7 +79,7 @@ voikko = ["voikko"] nn = ["tensorflow-cpu", "lmdb"] omikuji = ["omikuji"] yake = ["yake"] -pycld3 = ["pycld3"] +lingua = ["lingua-language-detector"] spacy = ["spacy"] [tool.poetry.scripts] diff --git a/tests/test_transform_langfilter.py b/tests/test_transform_langfilter.py index b3a2f2d90..3dade8c0b 100644 --- a/tests/test_transform_langfilter.py +++ b/tests/test_transform_langfilter.py @@ -32,7 +32,6 @@ def test_lang_filter(project): Kansalliskirjasto on kaikille avoin kulttuuriperintöorganisaatio, joka palvelee valtakunnallisesti kansalaisia, tiedeyhteisöjä ja muita yhteiskunnan toimijoita. - Abc defghij klmnopqr stuwxyz abc defghij klmnopqr stuwxyz. Turvaamme Suomessa julkaistun tai Suomea koskevan julkaistun kulttuuriperinnön saatavuuden sekä välittämme ja tuotamme tietosisältöjä tutkimukselle, opiskelulle, kansalaisille ja