Skip to content

Commit

Permalink
replace cld based language detector
Browse files Browse the repository at this point in the history
  • Loading branch information
miku committed May 14, 2024
1 parent 52a2e10 commit 585037b
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 14 deletions.
24 changes: 15 additions & 9 deletions siskin/conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,17 @@

from siskin.utils import URLCache

# Optional dependency: language detection needs the third-party "lingua"
# package. When it is not installed, `language_detector` is None and callers
# are expected to check for that before using it.
try:
    from lingua import LanguageDetectorBuilder

    # One shared, module-level detector built over all languages lingua
    # supports, with its language models preloaded at build time.
    language_detector = (
        LanguageDetectorBuilder.from_all_languages()
        .with_preloaded_language_models()
        .build()
    )
except ImportError:
    language_detector = None

# Maps the double and single quote characters to their HTML entities; the
# second table is the inverse mapping (entity -> character) for unescaping.
html_escape_table = {'"': "&quot;", "'": "&apos;"}
html_unescape_table = {v: k for k, v in html_escape_table.items()}

Expand Down Expand Up @@ -396,15 +407,10 @@ def find_osf_language(doc, with_default="eng"):
"""
if not attrs.get("description"):
return with_default
try:
import cld3
from iso639 import languages

result = cld3.get_language(attrs["description"])
if not result.is_reliable:
return with_default
return languages.get(alpha2=result.language).part2b
except (ImportError, KeyError):
if language_detector is not None:
lang = language_detector.detect_language_of(attrs["description"])
return lang.iso_code_639_3.name.lower()
else:
return with_default

def fetch_authors(doc, force=False, best_effort=False, max_retries=5):
Expand Down
10 changes: 5 additions & 5 deletions siskin/test_conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,11 +257,11 @@ def test_osf_to_intermediate():
"pernyataan Allah m",
"authors": [{"rft.aufirst": "Ceria", "rft.aulast": "Ceria"}],
"doi": "10.31219/osf.io/egcsk",
"finc.format": "Article",
"finc.id": "ai-191-egcsk",
"finc.mega_collection": ["sid-191-col-osf", "Osf"],
"finc.source_id": "191",
"languages": ["eng"],
"finc.format": "Preprint",
"finc.id": "ai-179-egcsk",
"finc.mega_collection": ["sid-179-col-osf", "Osf"],
"finc.source_id": "179",
"languages": ["ind"],
"rft.atitle": "Konsep Allah Dalam Teologi Proses",
"rft.date": "2021-07-19",
"rft.genre": "article",
Expand Down

0 comments on commit 585037b

Please sign in to comment.