diff --git a/siskin/conversions.py b/siskin/conversions.py
index 052f1942..b97403d6 100644
--- a/siskin/conversions.py
+++ b/siskin/conversions.py
@@ -50,6 +50,19 @@ from siskin.utils import URLCache
 
 
 
+# lingua is an optional dependency; when it is missing, find_osf_language
+# falls back to its default language.
+try:
+    from lingua import LanguageDetectorBuilder
+
+    language_detector = (
+        LanguageDetectorBuilder.from_all_languages()
+        .with_preloaded_language_models()
+        .build()
+    )
+except ImportError:
+    language_detector = None
+
 html_escape_table = {'"': "&quot;", "'": "&apos;"}
 html_unescape_table = {v: k for k, v in html_escape_table.items()}
 
@@ -396,15 +409,13 @@ def find_osf_language(doc, with_default="eng"):
     """
     if not attrs.get("description"):
         return with_default
-    try:
-        import cld3
-        from iso639 import languages
-
-        result = cld3.get_language(attrs["description"])
-        if not result.is_reliable:
-            return with_default
-        return languages.get(alpha2=result.language).part2b
-    except (ImportError, KeyError):
+    lang = None
+    if language_detector is not None:
+        # detect_language_of returns None when no language can be reliably
+        # determined; treat that like a missing detector and use the default.
+        lang = language_detector.detect_language_of(attrs["description"])
+    if lang is not None:
+        return lang.iso_code_639_3.name.lower()
+    else:
         return with_default
 
 def fetch_authors(doc, force=False, best_effort=False, max_retries=5):
diff --git a/siskin/test_conversions.py b/siskin/test_conversions.py
index 7404bf56..1b05777f 100644
--- a/siskin/test_conversions.py
+++ b/siskin/test_conversions.py
@@ -257,11 +257,11 @@ def test_osf_to_intermediate():
         "pernyataan Allah m",
         "authors": [{"rft.aufirst": "Ceria", "rft.aulast": "Ceria"}],
         "doi": "10.31219/osf.io/egcsk",
-        "finc.format": "Article",
-        "finc.id": "ai-191-egcsk",
-        "finc.mega_collection": ["sid-191-col-osf", "Osf"],
-        "finc.source_id": "191",
-        "languages": ["eng"],
+        "finc.format": "Preprint",
+        "finc.id": "ai-179-egcsk",
+        "finc.mega_collection": ["sid-179-col-osf", "Osf"],
+        "finc.source_id": "179",
+        "languages": ["ind"],
         "rft.atitle": "Konsep Allah Dalam Teologi Proses",
         "rft.date": "2021-07-19",
         "rft.genre": "article",