Skip to content

Commit

Permalink
Use Lingua instead of pycld3 for language detection. Fixes #593
Browse files: browse the repository at this point in the history
  • Loading branch information
osma committed Aug 26, 2022
1 parent dea7d51 commit f6504b2
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 7 deletions.
2 changes: 1 addition & 1 deletion annif/transform/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ def get_transform(transform_specs, project):
_transforms.update({langfilter.LangFilter.name: langfilter.LangFilter})
except ImportError:
annif.logger.debug(
"pycld3 not available, not enabling filter_language transform")
"Lingua not available, not enabling filter_language transform")
14 changes: 10 additions & 4 deletions annif/transform/langfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
different from the language of the project."""

import annif
import cld3
import lingua
from . import transform

logger = annif.logger
Expand All @@ -16,14 +16,20 @@ def __init__(self, project, text_min_length=500, sentence_min_length=50):
super().__init__(project)
self.text_min_length = int(text_min_length)
self.sentence_min_length = int(sentence_min_length)
self.detector = (
lingua.LanguageDetectorBuilder
.from_all_languages()
.with_low_accuracy_mode()
.build()
)

def _detect_language(self, text):
"""Tries to detect the language of a text input. Outputs a BCP-47-style
language code (e.g. 'en')."""

lan_info = cld3.get_language(text)
if lan_info is not None and lan_info.is_reliable:
return lan_info.language
lan_info = self.detector.detect_language_of(text)
if lan_info is not None:
return lan_info.iso_code_639_1.name.lower()
else:
return None

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def read(fname):
'nn': ['tensorflow-cpu==2.9.1', 'lmdb==1.3.0'],
'omikuji': ['omikuji==0.5.*'],
'yake': ['yake==0.4.5'],
'pycld3': ['pycld3'],
'lingua': ['lingua-language-detector==1.1.1'],
'spacy': ['spacy==3.3.*'],
'dev': [
'codecov',
Expand Down
1 change: 0 additions & 1 deletion tests/test_transform_langfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ def test_lang_filter(project):
Kansalliskirjasto on kaikille avoin kulttuuriperintöorganisaatio, joka
palvelee valtakunnallisesti kansalaisia, tiedeyhteisöjä ja muita
yhteiskunnan toimijoita.
Abc defghij klmnopqr stuwxyz abc defghij klmnopqr stuwxyz.
Turvaamme Suomessa julkaistun tai Suomea koskevan julkaistun
kulttuuriperinnön saatavuuden sekä välittämme ja tuotamme
tietosisältöjä tutkimukselle, opiskelulle, kansalaisille ja
Expand Down

0 comments on commit f6504b2

Please sign in to comment.