Skip to content

Commit

Permalink
Use Lingua instead of pycld3 for language detection. Fixes #593
Browse files Browse the repository at this point in the history
  • Loading branch information
osma committed Sep 23, 2022
1 parent ec10014 commit e5f7aa1
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 8 deletions.
2 changes: 1 addition & 1 deletion annif/transform/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ def get_transform(transform_specs, project):
_transforms.update({langfilter.LangFilter.name: langfilter.LangFilter})
except ImportError:
annif.logger.debug(
"pycld3 not available, not enabling filter_language transform")
"Lingua not available, not enabling filter_language transform")
14 changes: 10 additions & 4 deletions annif/transform/langfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
different from the language of the project."""

import annif
import cld3
import lingua
from . import transform

logger = annif.logger
Expand All @@ -16,14 +16,20 @@ def __init__(self, project, text_min_length=500, sentence_min_length=50):
super().__init__(project)
self.text_min_length = int(text_min_length)
self.sentence_min_length = int(sentence_min_length)
self.detector = (
lingua.LanguageDetectorBuilder
.from_all_languages()
.with_low_accuracy_mode()
.build()
)

def _detect_language(self, text):
"""Tries to detect the language of a text input. Outputs a BCP-47-style
language code (e.g. 'en'), or None if no language could be detected."""

lan_info = cld3.get_language(text)
if lan_info is not None and lan_info.is_reliable:
return lan_info.language
lan_info = self.detector.detect_language_of(text)
if lan_info is not None:
return lan_info.iso_code_639_1.name.lower()
else:
return None

Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ tensorflow-cpu = {version = "2.9.1", optional = true}
lmdb = {version = "1.3.0", optional = true}
omikuji = {version = "0.5.*", optional = true}
yake = {version = "0.4.5", optional = true}
pycld3 = {version = "*", optional = true}
lingua-language-detector = {version = "1.1.1", optional = true}
spacy = {version = "3.3.*", optional = true}

[tool.poetry.dev-dependencies]
Expand All @@ -79,7 +79,7 @@ voikko = ["voikko"]
nn = ["tensorflow-cpu", "lmdb"]
omikuji = ["omikuji"]
yake = ["yake"]
pycld3 = ["pycld3"]
lingua = ["lingua-language-detector"]
spacy = ["spacy"]

[tool.poetry.scripts]
Expand Down
1 change: 0 additions & 1 deletion tests/test_transform_langfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ def test_lang_filter(project):
Kansalliskirjasto on kaikille avoin kulttuuriperintöorganisaatio, joka
palvelee valtakunnallisesti kansalaisia, tiedeyhteisöjä ja muita
yhteiskunnan toimijoita.
Abc defghij klmnopqr stuwxyz abc defghij klmnopqr stuwxyz.
Turvaamme Suomessa julkaistun tai Suomea koskevan julkaistun
kulttuuriperinnön saatavuuden sekä välittämme ja tuotamme
tietosisältöjä tutkimukselle, opiskelulle, kansalaisille ja
Expand Down

0 comments on commit e5f7aa1

Please sign in to comment.