Skip to content

Commit

Permalink
feat: Replace langdetect with lingua
Browse files Browse the repository at this point in the history
TASK: IL-416
  • Loading branch information
MerlinKallenbornAA committed Jun 7, 2024
1 parent 7a4f19f commit 80959de
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 16 deletions.
90 changes: 81 additions & 9 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ nbconvert = "^7.16.4"
datasets = "^2.19.2"
jupyter = "^1.0.0"
requests = "^2.32.3"
langdetect = "^1.0.9"
pycountry = "24.6.1"
opentelemetry-api = "^1.22.0"
opentelemetry-sdk = "^1.22.0"
Expand All @@ -34,6 +33,7 @@ opentelemetry-exporter-otlp-proto-http = "1.23.0"
# summary grader
rouge-score = "^0.1.2"
sacrebleu = "^2.4.2"
lingua-language-detector = "^2.0.2"

[tool.poetry.group.dev.dependencies]
# lint & format
Expand Down
38 changes: 33 additions & 5 deletions src/intelligence_layer/core/detect_language.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from dataclasses import dataclass
from typing import Mapping, Optional, Sequence, TypeVar

from langdetect import detect_langs # type: ignore
from lingua import ConfidenceValue, IsoCode639_1
from lingua import Language as LinguaLanguage
from lingua import LanguageDetectorBuilder
from pycountry import languages
from pydantic import BaseModel

Expand Down Expand Up @@ -34,6 +36,11 @@ def language_config(self, configs: Mapping["Language", Config]) -> Config:
)
return config

def to_lingua_language(self) -> LinguaLanguage:
    """Convert this language into the matching `lingua` language.

    The ISO 639-1 code stored in `iso_639_1` is upper-cased and looked up
    by name on `IsoCode639_1`; the resulting code is then handed to
    `LinguaLanguage.from_iso_code_639_1`.

    Returns:
        The `lingua` language for this language's ISO 639-1 code.

    Raises:
        AttributeError: If `IsoCode639_1` has no attribute named after the
            upper-cased code.
    """
    member_name = self.iso_639_1.upper()
    return LinguaLanguage.from_iso_code_639_1(getattr(IsoCode639_1, member_name))


class DetectLanguageInput(BaseModel):
"""The input for a `DetectLanguage` task.
Expand Down Expand Up @@ -90,28 +97,49 @@ class DetectLanguage(Task[DetectLanguageInput, DetectLanguageOutput]):
>>> output = task.run(input, InMemoryTracer())
"""

# Languages the lingua detector is built from: `__init__` unpacks this list
# into `LanguageDetectorBuilder.from_languages`.
AVAILABLE_LANGUAGES = [
LinguaLanguage.GERMAN,
LinguaLanguage.ENGLISH,
LinguaLanguage.ITALIAN,
LinguaLanguage.FRENCH,
LinguaLanguage.SPANISH,
]

def __init__(self, threshold: float = 0.5):
    """Initialise the task and build its lingua language detector.

    Args:
        threshold: Stored as `self._threshold`. Presumably the minimum
            probability a language needs to count as the best fit; its
            use is outside this view, so confirm against `_get_best_fit`.
    """
    super().__init__()
    self._threshold = threshold

    # The detector is built once per task instance and restricted to
    # the languages listed in `AVAILABLE_LANGUAGES`.
    builder = LanguageDetectorBuilder.from_languages(*self.AVAILABLE_LANGUAGES)
    self._detector = builder.build()

def do_run(
self, input: DetectLanguageInput, task_span: TaskSpan
) -> DetectLanguageOutput:
annotated_languages = self._detect_languages(input, task_span)
best_fit = self._get_best_fit(annotated_languages, input.possible_languages)
return DetectLanguageOutput(best_fit=best_fit)

return DetectLanguageOutput(best_fit=best_fit if best_fit is not None else None)

def _detect_languages(
self, input: DetectLanguageInput, task_span: TaskSpan
) -> Sequence[AnnotatedLanguage]:
languages = detect_langs(input.text)
determined_languages = self._detector.compute_language_confidence_values(
input.text
)

annotated_languages = [
AnnotatedLanguage(lang=Language(lang.lang), prob=lang.prob)
for lang in languages
AnnotatedLanguage(
lang=Language(iso_639_1=self._to_iso_639_1_code(lang)), prob=lang.value
)
for lang in determined_languages
]
task_span.log("Raw language probabilities", annotated_languages)
return annotated_languages

def _to_iso_639_1_code(self, lingua_with_confidence: ConfidenceValue) -> str:
    """Return the lower-cased ISO 639-1 code name of a lingua confidence value.

    Args:
        lingua_with_confidence: A lingua confidence value whose `language`
            exposes an `iso_code_639_1` with a `name`.

    Returns:
        The `name` of that ISO 639-1 code, converted to `str` and
        lower-cased.
    """
    iso_code = lingua_with_confidence.language.iso_code_639_1
    code_name = str(iso_code.name)
    return code_name.lower()

def _get_best_fit(
self,
languages_result: Sequence[AnnotatedLanguage],
Expand Down
12 changes: 11 additions & 1 deletion tests/core/test_detect_language.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pytest
from lingua import Language as LinguaLanguage

from intelligence_layer.core import (
DetectLanguage,
Expand Down Expand Up @@ -47,7 +48,7 @@ def test_detect_language_returns_correct_language(
assert output.best_fit == expected_language


def test_detect_language_returns_non_if_no_language_can_be_detected() -> None:
def test_detect_language_returns_none_if_no_language_can_be_detected() -> None:
text = "Je m’appelle Jessica. Je suis une fille, je suis française et j’ai treize ans." # codespell:ignore
task = DetectLanguage()
input = DetectLanguageInput(
Expand All @@ -58,3 +59,12 @@ def test_detect_language_returns_non_if_no_language_can_be_detected() -> None:
output = task.run(input, tracer)

assert output.best_fit is None


def test_conversion_to_lingua_works() -> None:
    """`Language("de")` must convert to lingua's `GERMAN` language."""
    german = Language("de")

    assert german.to_lingua_language() == LinguaLanguage.GERMAN

0 comments on commit 80959de

Please sign in to comment.