From f6504b2ca2359cff9dbdb959068ba1e37b26b43f Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 26 Aug 2022 16:39:18 +0300 Subject: [PATCH] Use Lingua instead of pycld3 for language detection. Fixes #593 --- annif/transform/__init__.py | 2 +- annif/transform/langfilter.py | 14 ++++++++++---- setup.py | 2 +- tests/test_transform_langfilter.py | 1 - 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/annif/transform/__init__.py b/annif/transform/__init__.py index 61b9a6c59..995fce172 100644 --- a/annif/transform/__init__.py +++ b/annif/transform/__init__.py @@ -48,4 +48,4 @@ def get_transform(transform_specs, project): _transforms.update({langfilter.LangFilter.name: langfilter.LangFilter}) except ImportError: annif.logger.debug( - "pycld3 not available, not enabling filter_language transform") + "Lingua not available, not enabling filter_language transform") diff --git a/annif/transform/langfilter.py b/annif/transform/langfilter.py index f63f6f97a..68f6ef9ec 100644 --- a/annif/transform/langfilter.py +++ b/annif/transform/langfilter.py @@ -2,7 +2,7 @@ different from the language of the project.""" import annif -import cld3 +import lingua from . import transform logger = annif.logger @@ -16,14 +16,20 @@ def __init__(self, project, text_min_length=500, sentence_min_length=50): super().__init__(project) self.text_min_length = int(text_min_length) self.sentence_min_length = int(sentence_min_length) + self.detector = ( + lingua.LanguageDetectorBuilder + .from_all_languages() + .with_low_accuracy_mode() + .build() + ) def _detect_language(self, text): """Tries to detect the language of a text input. Outputs a BCP-47-style language code (e.g. 'en').""" - lan_info = cld3.get_language(text) - if lan_info is not None and lan_info.is_reliable: - return lan_info.language + lan_info = self.detector.detect_language_of(text) + if lan_info is not None: + return lan_info.iso_code_639_1.name.lower() else: return None diff --git a/setup.py b/setup.py index 3d279f3b1..5c1139a7a 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ def read(fname): 'nn': ['tensorflow-cpu==2.9.1', 'lmdb==1.3.0'], 'omikuji': ['omikuji==0.5.*'], 'yake': ['yake==0.4.5'], - 'pycld3': ['pycld3'], + 'lingua': ['lingua-language-detector==1.1.1'], 'spacy': ['spacy==3.3.*'], 'dev': [ 'codecov', diff --git a/tests/test_transform_langfilter.py b/tests/test_transform_langfilter.py index b3a2f2d90..3dade8c0b 100644 --- a/tests/test_transform_langfilter.py +++ b/tests/test_transform_langfilter.py @@ -32,7 +32,6 @@ def test_lang_filter(project): Kansalliskirjasto on kaikille avoin kulttuuriperintöorganisaatio, joka palvelee valtakunnallisesti kansalaisia, tiedeyhteisöjä ja muita yhteiskunnan toimijoita. - Abc defghij klmnopqr stuwxyz abc defghij klmnopqr stuwxyz. Turvaamme Suomessa julkaistun tai Suomea koskevan julkaistun kulttuuriperinnön saatavuuden sekä välittämme ja tuotamme tietosisältöjä tutkimukselle, opiskelulle, kansalaisille ja