From 624760d23cc11d76e836cb2f0c22b9b10ab42abd Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sat, 12 Oct 2024 16:44:10 +0300 Subject: [PATCH 01/30] Simplified language metadata JSON by removing unnecessary nesting and keys. - Removed 'description', 'entry', and 'languages' keys. - Flattened structure to include only 'language', 'iso', and 'qid' at the top level. --- .../resources/language_metadata.json | 98 ++++++------------- 1 file changed, 31 insertions(+), 67 deletions(-) diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json index e6d7de8a6..b5400c697 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -1,70 +1,34 @@ { - "used by": "Scribe-Data/src/scribe_data/utils.py", - "description": { - "entry": { - "language": "the supported language. All lowercase", - "iso": "the ISO 639 code for 'language'. See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes. All lowercase", - "qid": "the unique identifier of 'language' on Wikidata. 'Q' followed by one or more digits. See https://www.wikidata.org/wiki/Q43649390", - "remove-words": "words that should not be included as autosuggestions for the given language.", - "ignore-words": "words that should be removed from the autosuggestion generation process." - } + "english": { + "iso": "en", + "qid": "Q1860" }, - "languages": [ - { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [] - }, - { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"] - }, - { - "language": "german", - "iso": "de", - "qid": "Q188", - "remove-words": ["of", "the", "The", "and", "NeinJa", "et", "redirect"], - "ignore-words": ["Gemeinde", "Familienname"] - }, - { - "language": "italian", - "iso": "it", - "qid": "Q652", - "remove-words": ["of", "the", "The", "and", "text", "from"], - "ignore-words": ["The", "ATP"] - }, - { - "language": "portuguese", - "iso": "pt", - "qid": "Q5146", - "remove-words": ["of", "the", "The", "and", "jbutadptflora"], - "ignore-words": [] - }, - { - "language": "russian", - "iso": "ru", - "qid": "Q7737", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [] - }, - { - "language": "spanish", - "iso": "es", - "qid": "Q1321", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [] - }, - { - "language": "swedish", - "iso": "sv", - "qid": "Q9027", - "remove-words": ["of", "the", "The", "and", "Checklist", "Catalogue"], - "ignore-words": ["databasdump"] - } - ] + "french": { + "iso": "fr", + "qid": "Q150" + }, + "german": { + "iso": "de", + "qid": "Q188" + }, + "italian": { + "iso": "it", + "qid": "Q652" + }, + "portuguese": { + "iso": "pt", + "qid": "Q5146" + }, + "russian": { + "iso": "ru", + "qid": "Q7737" + }, + "spanish": { + "iso": "es", + "qid": "Q1321" + }, + "swedish": { + "iso": "sv", + "qid": "Q9027" + } } From 05ba79d41a08148c5e29d32b335b9524fab84d27 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sat, 12 Oct 2024 16:50:34 +0300 Subject: [PATCH 02/30] Refactored _load_json function to handle simplified JSON structure. - Removed 'root' parameter since the JSON is now flat. - Updated function to return the entire contents of the JSON directly. 
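For illustration, a minimal sketch of the refactored loader and its new call site (it mirrors the diff below; the final lookup assumes the flattened metadata layout from the previous commit and is not part of the diff itself):

```python
# Hedged sketch: mirrors the refactored _load_json below. The lookup at the
# end assumes the flattened language_metadata.json layout from PATCH 01.
import json
from importlib import resources


def _load_json(package_path: str, file_name: str):
    with resources.files(package_path).joinpath(file_name).open(
        encoding="utf-8"
    ) as in_stream:
        return json.load(in_stream)  # the whole document; no 'root' indexing


languages = _load_json("scribe_data.resources", "language_metadata.json")
print(languages["english"]["qid"])  # -> "Q1860" under the flattened structure
```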
--- src/scribe_data/utils.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index dbd477946..4c3a78e3c 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -36,7 +36,7 @@ DEFAULT_SQLITE_EXPORT_DIR = "scribe_data_sqlite_export" -def _load_json(package_path: str, file_name: str, root: str) -> Any: +def _load_json(package_path: str, file_name: str) -> Any: """ Loads a JSON resource from a package into a python entity. @@ -48,25 +48,19 @@ def _load_json(package_path: str, file_name: str, root: str) -> Any: file_name : str The name of the file (resource) that contains the JSON data. - root : str - The root node of the JSON document. - Returns ------- - A python entity starting at 'root'. + A python entity representing the JSON content. """ - with resources.files(package_path).joinpath(file_name).open( encoding="utf-8" ) as in_stream: contents = json.load(in_stream) - return contents[root] + return contents # No need for 'root' _languages = _load_json( - package_path="scribe_data.resources", - file_name="language_metadata.json", - root="languages", + package_path="scribe_data.resources", file_name="language_metadata.json" ) From 7be7005789bd92791dc5d0952d3919d2b590f1db Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sat, 12 Oct 2024 18:25:15 +0300 Subject: [PATCH 03/30] =?UTF-8?q?Refactor=20language=20metadata=20structur?= =?UTF-8?q?e:=20Include=20all=20languages=20with=20Norwegian=20having=20su?= =?UTF-8?q?b-languags=20-=20Removed=20unnecessary=20top-level=20keys=20-?= =?UTF-8?q?=20Organized=20Norwegian=20with=20its=20sub-languages=20(Nynors?= =?UTF-8?q?k=20and=20Bokm=C3=A5l)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../resources/language_metadata.json | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json index b5400c697..dd85cdc91 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -1,8 +1,40 @@ { + "arabic": { + "iso": "ar", + "qid": "Q13955" + }, + "basque": { + "iso": "eu", + "qid": "Q8752" + }, + "bengali": { + "iso": "bn", + "qid": "Q9610" + }, + "czech": { + "iso": "cs", + "qid": "Q9056" + }, + "danish": { + "iso": "da", + "qid": "Q9035" + }, "english": { "iso": "en", "qid": "Q1860" }, + "esperanto": { + "iso": "eo", + "qid": "Q143" + }, + "estonian": { + "iso": "et", + "qid": "Q9072" + }, + "finnish": { + "iso": "fi", + "qid": "Q1412" + }, "french": { "iso": "fr", "qid": "Q150" @@ -11,24 +43,116 @@ "iso": "de", "qid": "Q188" }, + "greek": { + "iso": "el", + "qid": "Q36510" + }, + "hausa": { + "iso": "ha", + "qid": "Q56475" + }, + "hebrew": { + "iso": "he", + "qid": "Q9288" + }, + "hindustani": { + "iso": "hi", + "qid": "Q11051" + }, + "indonesian": { + "iso": "id", + "qid": "Q9240" + }, "italian": { "iso": "it", "qid": "Q652" }, + "japanese": { + "iso": "ja", + "qid": "Q5287" + }, + "kurmanji": { + "iso": "kmr", + "qid": "Q36163" + }, + "latin": { + "iso": "la", + "qid": "Q397" + }, + "malay": { + "iso": "ms", + "qid": "Q9237" + }, + "malayalam": { + "iso": "ml", + "qid": "Q36236" + }, + "mandarin": { + "iso": "zh", + "qid": "Q727694" + }, + "norwegian": { + "sub_languages": { + "nynorsk": { + "iso": "nn", + "qid": "Q25164" + }, + "bokmål": { + "iso": "nb", + "qid": "Q9043" + } + } + }, + "pidgin": { + "iso": "pi", + "qid": "Q33655" + }, + 
"polish": { + "iso": "pl", + "qid": "Q809" + }, "portuguese": { "iso": "pt", "qid": "Q5146" }, + "punjabi": { + "iso": "pa", + "qid": "Q58635" + }, "russian": { "iso": "ru", "qid": "Q7737" }, + "slovak": { + "iso": "sk", + "qid": "Q9058" + }, "spanish": { "iso": "es", "qid": "Q1321" }, + "swahili": { + "iso": "sw", + "qid": "Q7838" + }, "swedish": { "iso": "sv", "qid": "Q9027" + }, + "tajik": { + "iso": "tg", + "qid": "Q9260" + }, + "tamil": { + "iso": "ta", + "qid": "Q5885" + }, + "ukrainian": { + "iso": "ua", + "qid": "Q8798" + }, + "yoruba": { + "iso": "yo", + "qid": "Q34311" } } From e1ce1d8a6d2ea72003bb61f4aac3678aec648270 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sat, 12 Oct 2024 20:43:17 +0300 Subject: [PATCH 04/30] Refactor _find function to handle languages with sub-languages - Enhanced the function to check for both regular languages and their sub-languages. - Added error handling for cases where a language has only sub-languages, providing informative messages. - Updated the function's docstring to reflect changes in behavior and usage. --- src/scribe_data/utils.py | 48 ++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 4c3a78e3c..45434b783 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -66,28 +66,20 @@ def _load_json(package_path: str, file_name: str) -> Any: def _find(source_key: str, source_value: str, target_key: str, error_msg: str) -> Any: """ - Each 'language', (english, german,..., etc) is a dictionary of key/value pairs: + Finds a target value based on a source key/value pair from the language metadata. - entry = { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": [...], - "ignore-words": [...] - } - - Given a key/value pair, the 'source' and the 'target' key get the 'target' value. + This version handles both regular languages and those with sub-languages (e.g., Norwegian). Parameters ---------- source_value : str - The source value to find equivalents for (e.g. 'english'). + The source value to find equivalents for (e.g., 'english', 'nynorsk'). source_key : str - The source key to reference (e.g. 'language'). + The source key to reference (e.g., 'language'). target_key : str - The key to target (e.g. 'iso'). + The key to target (e.g., 'qid'). error_msg : str The message displayed when a value cannot be found. @@ -98,18 +90,30 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - Raises ------ - ValueError : when a source_value is not supported. + ValueError : when a source_value is not supported or the language only has sub-languages. """ norm_source_value = source_value.lower() - if target_value := [ - entry[target_key] - for entry in _languages - if entry[source_key] == norm_source_value - ]: - assert len(target_value) == 1, f"More than one entry for '{norm_source_value}'" - return target_value[0] - + # Check if we're searching by language name + if source_key == "language": + # First, check the main language entries (e.g., mandarin, french, etc.) + for language, entry in _languages.items(): + # If the language name matches the top-level key, return the target value + if language.lower() == norm_source_value: + if "sub_languages" in entry: + sub_languages = ", ".join(entry["sub_languages"].keys()) + raise ValueError( + f"'{language}' has sub-languages, but is not queryable directly. 
Available sub-languages: {sub_languages}" + ) + return entry.get(target_key) + + # If there are sub-languages, check them too + if "sub_languages" in entry: + for sub_language, sub_entry in entry["sub_languages"].items(): + if sub_language.lower() == norm_source_value: + return sub_entry.get(target_key) + + # If no match was found, raise an error raise ValueError(error_msg) From 046c78d94cf85acea433e6fd4e19093a03593cf1 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sat, 12 Oct 2024 20:46:10 +0300 Subject: [PATCH 05/30] Update get_scribe_languages to handle sub-languages in JSON structure - Adjusted the function to return both main languages and their sub-languages. - Ensured that languages like Norwegian are represented by their sub-languages only. - Enhanced compatibility with the new JSON format. --- src/scribe_data/utils.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 45434b783..bb9c7a399 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -120,8 +120,22 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - def get_scribe_languages() -> list[str]: """ Returns the list of currently implemented Scribe languages. + This version handles both regular languages and those with sub-languages (e.g., Norwegian). """ - return sorted(entry["language"].capitalize() for entry in _languages) + languages = [] + + for language, entry in _languages.items(): + # Add the main language (if it's directly queryable) + if "sub_languages" not in entry: + languages.append(language.capitalize()) + + # If there are sub-languages, add them instead + if "sub_languages" in entry: + languages.extend( + sub_language.capitalize() for sub_language in entry["sub_languages"] + ) + + return sorted(languages) def get_language_qid(language: str) -> str: From 8f737cd0a21e37e2eff6766c8be6f016bf6de647 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sun, 13 Oct 2024 18:00:29 +0300 Subject: [PATCH 06/30] Remove get_language_words_to_remove and get_language_words_to_ignore due to new language_metadata.json structure --- src/scribe_data/utils.py | 44 ---------------------------------------- 1 file changed, 44 deletions(-) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 494a2d1bf..03e356870 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -206,50 +206,6 @@ def get_language_from_iso(iso: str) -> str: return language_name -def get_language_words_to_remove(language: str) -> list[str]: - """ - Returns the words that should be removed during the data cleaning process for the given language. - - Parameters - ---------- - language : str - The language the words should be returned for. - - Returns - ------- - list[str] - The words that that be removed during the data cleaning process for the given language. - """ - return _find( - "language", - language, - "remove-words", - f"{language.capitalize()} is currently not a supported language.", - ) - - -def get_language_words_to_ignore(language: str) -> list[str]: - """ - Returns the words that should not be included as autosuggestions for the given language. - - Parameters - ---------- - language : str - The language the words should be returned for. - - Returns - ------- - list[str] - The words that should not be included as autosuggestions for the given language. 
- """ - return _find( - "language", - language, - "ignore-words", - f"{language.capitalize()} is currently not a supported language.", - ) - - def load_queried_data( file_path: str, language: str, data_type: str ) -> tuple[Any, bool, str]: From 9f75f5426cfa87bc51976ce28c95a6a065f4bc5e Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Sun, 13 Oct 2024 23:59:31 +0300 Subject: [PATCH 07/30] Refactor language_map and language_to_qid generation to handle new JSON structure - Updated the logic for building language_map and language_to_qid to handle languages with sub-languages. - Both main languages and sub-languages are now processed in a single pass, ensuring that: - language_map includes all metadata for main and sub-languages. - language_to_qid correctly maps both main and sub-languages to their QIDs. --- src/scribe_data/cli/cli_utils.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py index ecf8b6213..f5b72f663 100644 --- a/src/scribe_data/cli/cli_utils.py +++ b/src/scribe_data/cli/cli_utils.py @@ -42,14 +42,23 @@ with DATA_TYPE_METADATA_FILE.open("r", encoding="utf-8") as file: data_type_metadata = json.load(file) -language_map = { - lang["language"].lower(): lang for lang in language_metadata["languages"] -} - -# Create language_to_qid dictionary. -language_to_qid = { - lang["language"].lower(): lang["qid"] for lang in language_metadata["languages"] -} +language_map = {} +language_to_qid = {} + +# Process each language and its potential sub-languages in one pass +for lang_key, lang_data in language_metadata.items(): + lang_key_lower = lang_key.lower() + + # Handle sub-languages if they exist + if "sub_languages" in lang_data: + for sub_lang_key, sub_lang_data in lang_data["sub_languages"].items(): + sub_lang_key_lower = sub_lang_key.lower() + language_map[sub_lang_key_lower] = sub_lang_data + language_to_qid[sub_lang_key_lower] = sub_lang_data["qid"] + else: + # Handle the main language directly + language_map[lang_key_lower] = lang_data + language_to_qid[lang_key_lower] = lang_data["qid"] def correct_data_type(data_type: str) -> str: From 6186be979c28b52acc9cc36bc0b8bf2536dbc31c Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 00:40:16 +0300 Subject: [PATCH 08/30] Fix: Update language extraction to match new JSON structure by removing the 'languages' key reference --- src/scribe_data/cli/interactive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index 4e95f34b0..cefaa6bbe 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -52,7 +52,7 @@ class ScribeDataConfig: def __init__(self): self.languages = [ - lang["language"].capitalize() for lang in language_metadata["languages"] + [lang_key.capitalize() for lang_key in language_metadata.keys()] ] self.data_types = list(data_type_metadata.keys()) self.selected_languages: List[str] = [] From 1c959ec5d89f4d24e1f9f33f70b9e9a3289e86a8 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 00:48:56 +0300 Subject: [PATCH 09/30] Refactor language extraction to use direct keys from language_metadata. Removed dependency on the 'languages' key in JSON structure. 
---
 src/scribe_data/wikidata/query_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py
index 70c0fbf00..ffdc3bfba 100644
--- a/src/scribe_data/wikidata/query_data.py
+++ b/src/scribe_data/wikidata/query_data.py
@@ -115,7 +115,7 @@ def query_data(
         SCRIBE_DATA_SRC_PATH / "language_data_extraction"
     )
     languages = [lang.capitalize() for lang in languages]
-    current_languages = list(language_metadata["languages"])
+    current_languages = list(language_metadata.keys())
     current_data_type = ["nouns", "verbs", "prepositions"]

     # Assign current_languages and current_data_type if no arguments have been passed.

From 458328ef5086d8b190e66ae2e3aae5c5e37cdf19 Mon Sep 17 00:00:00 2001
From: Omar Agiez
Date: Mon, 14 Oct 2024 14:13:54 +0300
Subject: [PATCH 10/30] Added format_sublanguage_name function to format
 sub-language names as 'mainlang/sublang'

- Implemented the function to check if a language is a sub-language and format
  its name as 'mainlang/sublang' for easier searching in
  language_data_extraction.
- Returns the original language name if it's not a sub-language.
- Added a detailed docstring for clarity and usage examples.
---
 src/scribe_data/utils.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index 03e356870..33fc3763e 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -487,3 +487,39 @@ def order_annotations(annotation: str) -> str:
     annotation_split = sorted(list(set(filter(None, annotation.split("/")))))

     return "/".join(annotation_split)
+
+
+def format_sublanguage_name(lang, language_metadata):
+    """
+    Formats the name of a sub-language by appending its main language
+    in the format 'mainlang/sublang'. If the language is not a sub-language,
+    the original language name is returned as-is.
+
+    Args:
+        lang (str): The name of the language or sub-language to format.
+        language_metadata (dict): The metadata containing information about
+            main languages and their sub-languages.
+
+    Returns:
+        str: The formatted language name if it's a sub-language
+            (e.g., 'norwegian/nynorsk'), otherwise the original name.
+
+    Example:
+        format_sublanguage_name("nynorsk", language_metadata)
+        'norwegian/nynorsk'
+
+        format_sublanguage_name("english", language_metadata)
+        'english'
+    """
+    # Iterate through the main languages in the metadata
+    for main_lang, lang_data in language_metadata.items():
+        # Check if the main language has sub-languages
+        if "sub_languages" in lang_data:
+            # Check if the provided language is a sub-language
+            for sub_lang in lang_data["sub_languages"]:
+                if lang.lower() == sub_lang.lower():
+                    # Return the formatted name mainlang/sublang
+                    return f"{main_lang}/{sub_lang}"
+
+    # If it's not a sub-language, return the original name
+    return lang

From e0177607afb489a34f882ba7db78649c5899cacf Mon Sep 17 00:00:00 2001
From: Omar Agiez
Date: Mon, 14 Oct 2024 14:22:11 +0300
Subject: [PATCH 11/30] Refactor: Apply format_sublanguage_name to handle
 sub-languages

- Wrapped the 'lang' variable with format_sublanguage_name to ensure
  sub-languages are formatted as 'mainlang/sublang' during data extraction.
- This ensures proper directory creation and querying for sub-languages,
  aligning with the new language metadata structure.
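A short sketch of the intended behaviour at the call site, using the examples from the docstring added in the previous commit:

```python
# Sketch only: assumes format_sublanguage_name and language_metadata are in
# scope (see the diffs in this series). Sub-languages resolve to
# 'mainlang/sublang'; regular languages pass through unchanged.
lang = format_sublanguage_name("nynorsk", language_metadata)
print(lang)  # -> 'norwegian/nynorsk'

lang = format_sublanguage_name("english", language_metadata)
print(lang)  # -> 'english'
```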
--- src/scribe_data/wikidata/query_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index ffdc3bfba..9c8e04d1e 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -33,6 +33,7 @@ from scribe_data.cli.cli_utils import ( language_metadata, ) +from scribe_data.utils import format_sublanguage_name from scribe_data.wikidata.wikidata_utils import sparql @@ -159,7 +160,7 @@ def query_data( disable=interactive, colour="MAGENTA", ): - lang = q.parent.parent.name + lang = format_sublanguage_name(q.parent.parent.name, language_metadata) target_type = q.parent.name updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir From 470541444c09dea57cb18dd1dcff894e505d89e3 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 14:31:45 +0300 Subject: [PATCH 12/30] Removed dependency on the 'languages' key based on the old json structure in cli/total.py file --- src/scribe_data/cli/total.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index e94d33d40..735d74051 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -71,8 +71,8 @@ def get_datatype_list(language): data_types : list[str] or None A list of the corresponding data types. """ - languages = list(language_metadata["languages"]) - language_list = [lang["language"] for lang in languages] + languages = list(language_metadata.keys()) + language_list = [lang for lang in languages] if language.lower() in language_list: language_data = language_map.get(language.lower()) From ab7b6cf5be0b5ba0db2c965aee8f6b56acddcbb9 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 15:12:19 +0300 Subject: [PATCH 13/30] Add function to list all languages from language metadata loaded json - Created list_all_languages function to extract both main languages and sub-languages - The function checks for sub-languages and compiles a complete list for easier access. - Updated example usage to demonstrate the new functionality. --- src/scribe_data/utils.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 33fc3763e..1df502ad6 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -523,3 +523,20 @@ def format_sublanguage_name(lang, language_metadata): # If it's not a sub-language, return the original name return lang + + +def list_all_languages(language_metadata): + """List all languages from the provided metadata dictionary, including sub-languages.""" + current_languages = [] + + # Iterate through the language metadata + for lang_key, lang_data in language_metadata.items(): + # Check if there are sub-languages + if "sub_languages" in lang_data: + # Add the sub-languages to current_languages + current_languages.extend(lang_data["sub_languages"].keys()) + else: + # If no sub-languages, add the main language + current_languages.append(lang_key) + + return current_languages From 8d8f8f59ea8e1bda8783d552381c4c578b05f38d Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 15:14:37 +0300 Subject: [PATCH 14/30] Refactor to use list_all_languages function for language extraction - Replaced old extraction method with a centralized function. 
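Illustrative behaviour of the centralized helper (the metadata below is abbreviated and hypothetical; QIDs are those in the metadata file at this point in the series):

```python
from scribe_data.utils import list_all_languages

# Abbreviated metadata: one plain language and one with sub-languages.
language_metadata = {
    "english": {"iso": "en", "qid": "Q1860"},
    "norwegian": {
        "sub_languages": {
            "nynorsk": {"iso": "nn", "qid": "Q25164"},
            "bokmål": {"iso": "nb", "qid": "Q9043"},
        }
    },
}

# Sub-languages replace their parent in the returned list.
print(list_all_languages(language_metadata))
# -> ['english', 'nynorsk', 'bokmål']
```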
---
 src/scribe_data/load/data_to_sqlite.py | 4 ++--
 src/scribe_data/wikidata/query_data.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/scribe_data/load/data_to_sqlite.py b/src/scribe_data/load/data_to_sqlite.py
index 79d19e39b..aec1f9560 100644
--- a/src/scribe_data/load/data_to_sqlite.py
+++ b/src/scribe_data/load/data_to_sqlite.py
@@ -35,6 +35,7 @@
     DEFAULT_SQLITE_EXPORT_DIR,
     get_language_iso,
 )
+from scribe_data.utils import list_all_languages


 def data_to_sqlite(
@@ -52,8 +53,7 @@ def data_to_sqlite(
         current_language_data = json.load(f_languages)
         data_types = json.load(f_data_types).keys()

-    current_languages = [d["language"] for d in current_language_data["languages"]]
-
+    current_languages = list_all_languages(current_language_data)
     if not languages:
         languages = current_languages

diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py
index 9c8e04d1e..c075663a6 100644
--- a/src/scribe_data/wikidata/query_data.py
+++ b/src/scribe_data/wikidata/query_data.py
@@ -33,7 +33,7 @@
 from scribe_data.cli.cli_utils import (
     language_metadata,
 )
-from scribe_data.utils import format_sublanguage_name
+from scribe_data.utils import format_sublanguage_name, list_all_languages
 from scribe_data.wikidata.wikidata_utils import sparql


@@ -116,7 +116,7 @@ def query_data(
         SCRIBE_DATA_SRC_PATH / "language_data_extraction"
     )
     languages = [lang.capitalize() for lang in languages]
-    current_languages = list(language_metadata.keys())
+    current_languages = list_all_languages(language_metadata)
     current_data_type = ["nouns", "verbs", "prepositions"]

     # Assign current_languages and current_data_type if no arguments have been passed.

From d9a649b2681378475b19ab745031f607d6ca5616 Mon Sep 17 00:00:00 2001
From: Omar Agiez
Date: Mon, 14 Oct 2024 16:39:14 +0300
Subject: [PATCH 15/30] Enhance language handling by importing utility
 functions

- Imported list_all_languages and format_sublanguage_name from
  scribe_data.utils.
- Updated get_datatype_list and print_total_lexemes to improve language name
  retrieval and formatting.
---
 src/scribe_data/cli/total.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py
index 735d74051..990aef733 100644
--- a/src/scribe_data/cli/total.py
+++ b/src/scribe_data/cli/total.py
@@ -30,6 +30,7 @@
     language_to_qid,
 )
 from scribe_data.wikidata.wikidata_utils import sparql
+from scribe_data.utils import list_all_languages, format_sublanguage_name


 def get_qid_by_input(input_str):
@@ -71,12 +72,14 @@ def get_datatype_list(language):
     data_types : list[str] or None
         A list of the corresponding data types.
""" - languages = list(language_metadata.keys()) + languages = list_all_languages(language_metadata) language_list = [lang for lang in languages] if language.lower() in language_list: language_data = language_map.get(language.lower()) - language_capitalized = language.capitalize() + language_capitalized = format_sublanguage_name( + language, language_metadata + ).capitalize() language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_capitalized if not language_data: @@ -131,9 +134,11 @@ def print_total_lexemes(language: str = None): print("=" * 64) if language is None: # all languages - languages = list(language_metadata["languages"]) - languages.sort(key=lambda x: x["language"]) - language_list = [lang["language"] for lang in languages] + languages = list_all_languages( + language_metadata + ) # this returns a list of language names + language_list = languages # sorts the list in place + language_list.sort() for lang in language_list: data_types = get_datatype_list(lang) From 30f97e96883460261dd83e9fdfb4d6b6da8ba121 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 19:35:34 +0300 Subject: [PATCH 16/30] Update get_language_iso function: - Refactored to use the user-defined _find function. - Removed the ry-except block as error handling is already implemented in _find. - Removed the InvalidLanguageValue module as it was imported but unused. --- src/scribe_data/utils.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 1df502ad6..9898f2449 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -27,7 +27,7 @@ from typing import Any, Optional from iso639 import Lang -from iso639.exceptions import DeprecatedLanguageValue, InvalidLanguageValue +from iso639.exceptions import DeprecatedLanguageValue PROJECT_ROOT = "Scribe-Data" DEFAULT_JSON_EXPORT_DIR = "scribe_data_json_export" @@ -174,12 +174,13 @@ def get_language_iso(language: str) -> str: str The ISO code for the language. """ - try: - iso_code = str(Lang(language.capitalize()).pt1) - except InvalidLanguageValue: - raise ValueError( - f"{language.capitalize()} is currently not a supported language for ISO conversion." - ) from None + + iso_code = _find( + "language", + language, + "iso", + f"{language.upper()} is currently not a supported language for ISO conversion.", + ) return iso_code From ceec18768f2897c45e166cdc68fb462958944fd4 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 19:55:09 +0300 Subject: [PATCH 17/30] Handle sub-languages in language table generation - Utilized already built helper functions to support sub-languages when retrieving ISO and QID values. - Updated table printing to correctly format and display both main languages and sub-languages. --- src/scribe_data/cli/list.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index 5d16b4413..6f8f2358e 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -26,18 +26,19 @@ language_map, LANGUAGE_DATA_EXTRACTION_DIR, ) +from scribe_data.utils import list_all_languages, get_language_iso, get_language_qid def list_languages() -> None: """ Generates a table of languages, their ISO-2 codes and their Wikidata QIDs. 
""" - languages = list(language_metadata["languages"]) - languages.sort(key=lambda x: x["language"]) + languages = list_all_languages(language_metadata) + languages.sort() - language_col_width = max(len(lang["language"]) for lang in languages) + 2 - iso_col_width = max(len(lang["iso"]) for lang in languages) + 2 - qid_col_width = max(len(lang["qid"]) for lang in languages) + 2 + language_col_width = max(len(lang) for lang in languages) + 2 + iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2 + qid_col_width = max(len(get_language_qid(lang)) for lang in languages) + 2 table_line_length = language_col_width + iso_col_width + qid_col_width @@ -49,7 +50,7 @@ def list_languages() -> None: for lang in languages: print( - f"{lang['language'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}" + f"{lang.capitalize():<{language_col_width}} {get_language_iso(lang):<{iso_col_width}} {get_language_qid(lang):<{qid_col_width}}" ) print("-" * table_line_length) From 540e9d2c4e322a943c5c8b111453080415acfda7 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 21:27:17 +0300 Subject: [PATCH 18/30] adding new languages and their dialects to the language_metadata.json file --- .../resources/language_metadata.json | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json index dd85cdc91..d7d8100cd 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -56,8 +56,16 @@ "qid": "Q9288" }, "hindustani": { - "iso": "hi", - "qid": "Q11051" + "sub_languages": { + "hindi": { + "iso": "hi", + "qid": "Q11051" + }, + "urdu": { + "iso": "ur", + "qid": "Q11051" + } + } }, "indonesian": { "iso": "id", @@ -104,8 +112,12 @@ } }, "pidgin": { - "iso": "pi", - "qid": "Q33655" + "sub_languages": { + "nigerian": { + "iso": "pi", + "qid": "Q33655" + } + } }, "polish": { "iso": "pl", @@ -116,8 +128,16 @@ "qid": "Q5146" }, "punjabi": { - "iso": "pa", - "qid": "Q58635" + "sub_languages": { + "gurmukhi": { + "iso": "pan", + "qid": "Q58635" + }, + "shahmukhi": { + "iso": "pnp", + "qid": "Q58635" + } + } }, "russian": { "iso": "ru", From f389ab5b833b5255c9bd3e6c2e92aca64f10ec5b Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 21:52:40 +0300 Subject: [PATCH 19/30] Modified the loop that searches languages in the list_data_types function to reflect the new JSON structure, ensuring only data types are printed and no sub-languages unlike before. --- src/scribe_data/cli/list.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index 6f8f2358e..6b9ec295c 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -26,7 +26,12 @@ language_map, LANGUAGE_DATA_EXTRACTION_DIR, ) -from scribe_data.utils import list_all_languages, get_language_iso, get_language_qid +from scribe_data.utils import ( + list_all_languages, + get_language_iso, + get_language_qid, + format_sublanguage_name, +) def list_languages() -> None: @@ -66,6 +71,7 @@ def list_data_types(language: str = None) -> None: language : str The language to potentially list data types for. 
""" + languages = list_all_languages(language_metadata) if language: language_data = language_map.get(language.lower()) language_capitalized = language.capitalize() @@ -84,8 +90,11 @@ def list_data_types(language: str = None) -> None: else: data_types = set() - for lang in language_metadata["languages"]: - language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize() + for lang in languages: + language_dir = ( + LANGUAGE_DATA_EXTRACTION_DIR + / format_sublanguage_name(lang, language_metadata).capitalize() + ) if language_dir.is_dir(): data_types.update(f.name for f in language_dir.iterdir() if f.is_dir()) From 09944edab9f064ad39a414b2775cc78c62578e49 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 22:24:19 +0300 Subject: [PATCH 20/30] Capitalize the languages returned by the function 'format_sublanguage_name' to align with the directory structure in the language_data_extraction directory. --- src/scribe_data/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 9898f2449..b4da68647 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -520,10 +520,10 @@ def format_sublanguage_name(lang, language_metadata): for sub_lang in lang_data["sub_languages"]: if lang.lower() == sub_lang.lower(): # Return the formatted name mainlang/sublang - return f"{main_lang}/{sub_lang}" + return f"{main_lang.capitalize()}/{sub_lang.capitalize()}" # If it's not a sub-language, return the original name - return lang + return lang.capitalize() def list_all_languages(language_metadata): From f602f170335ee6833a6c322206885ecf22c081ad Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Mon, 14 Oct 2024 22:29:02 +0300 Subject: [PATCH 21/30] Implemented minor fixes by utilizing the format_sublanguage_name function to handle sub_language folders. --- src/scribe_data/cli/list.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index 6b9ec295c..447d59060 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -73,6 +73,7 @@ def list_data_types(language: str = None) -> None: """ languages = list_all_languages(language_metadata) if language: + language = format_sublanguage_name(language, language_metadata) language_data = language_map.get(language.lower()) language_capitalized = language.capitalize() language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_capitalized @@ -132,9 +133,11 @@ def list_languages_for_data_type(data_type: str) -> None: The data type to check for. """ data_type = correct_data_type(data_type=data_type) + all_languages = list_all_languages(language_metadata) available_languages = [] - for lang in language_metadata["languages"]: - language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize() + for lang in all_languages: + lang = format_sublanguage_name(lang, language_metadata) + language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang.capitalize() if language_dir.is_dir(): dt_path = language_dir / data_type if dt_path.exists(): From ba0ed9a7c8ba2c042b9b98a4e574858c015de63c Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Tue, 15 Oct 2024 19:26:18 +0300 Subject: [PATCH 22/30] Updated the instance variable self.languages in ScribeDataConfig to use list_all_languages, assigning a complete list of all languages. 
--- src/scribe_data/cli/interactive.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index cefaa6bbe..6ba7a1f55 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -35,7 +35,7 @@ from scribe_data.cli.cli_utils import data_type_metadata, language_metadata from scribe_data.cli.get import get_data from scribe_data.cli.version import get_version_message -from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR +from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR, list_all_languages # MARK: Config Setup @@ -51,9 +51,7 @@ class ScribeDataConfig: def __init__(self): - self.languages = [ - [lang_key.capitalize() for lang_key in language_metadata.keys()] - ] + self.languages = list_all_languages(language_metadata) self.data_types = list(data_type_metadata.keys()) self.selected_languages: List[str] = [] self.selected_data_types: List[str] = [] From c77cb1fdf1fbe38aa1381f3071ef308d47875581 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Wed, 16 Oct 2024 17:22:25 +0300 Subject: [PATCH 23/30] adding mandarin as a sub language under chinese and updating some qids --- .../resources/language_metadata.json | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json index d7d8100cd..00a8d405c 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -95,9 +95,13 @@ "iso": "ml", "qid": "Q36236" }, - "mandarin": { - "iso": "zh", - "qid": "Q727694" + "chinese": { + "sub_languages": { + "mandarin": { + "iso": "zh", + "qid": "Q727694" + } + } }, "norwegian": { "sub_languages": { @@ -107,7 +111,7 @@ }, "bokmål": { "iso": "nb", - "qid": "Q9043" + "qid": "Q25167" } } }, @@ -129,12 +133,12 @@ }, "punjabi": { "sub_languages": { - "gurmukhi": { - "iso": "pan", + "shahmukhi": { + "iso": "pnb", "qid": "Q58635" }, - "shahmukhi": { - "iso": "pnp", + "gurmukhi": { + "iso": "pa", "qid": "Q58635" } } From 87ec3b03747e921e0b2d7c6c5801ae82d5baa06d Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Wed, 16 Oct 2024 17:46:53 +0300 Subject: [PATCH 24/30] Update test_list_languages to match updated output format --- tests/cli/test_list.py | 55 +++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py index 1ec2ec1e4..3933082f6 100644 --- a/tests/cli/test_list.py +++ b/tests/cli/test_list.py @@ -39,17 +39,49 @@ def test_list_languages(self, mock_print): list_languages() expected_calls = [ call(), - call("Language ISO QID "), - call("-----------------------"), - call("English en Q1860 "), - call("French fr Q150 "), - call("German de Q188 "), - call("Italian it Q652 "), - call("Portuguese pt Q5146 "), - call("Russian ru Q7737 "), - call("Spanish es Q1321 "), - call("Swedish sv Q9027 "), - call("-----------------------"), + call("Language ISO QID "), + call("--------------------------"), + call("Arabic ar Q13955 "), + call("Basque eu Q8752 "), + call("Bengali bn Q9610 "), + call("Bokmål nb Q25167 "), + call("Czech cs Q9056 "), + call("Danish da Q9035 "), + call("English en Q1860 "), + call("Esperanto eo Q143 "), + call("Estonian et Q9072 "), + call("Finnish fi Q1412 "), + call("French fr Q150 "), + call("German de Q188 "), + call("Greek el Q36510 "), + call("Gurmukhi pa Q58635 "), + call("Hausa ha Q56475 "), + call("Hebrew 
he Q9288 "),
+            call("Hindi hi Q11051 "),
+            call("Indonesian id Q9240 "),
+            call("Italian it Q652 "),
+            call("Japanese ja Q5287 "),
+            call("Kurmanji kmr Q36163 "),
+            call("Latin la Q397 "),
+            call("Malay ms Q9237 "),
+            call("Malayalam ml Q36236 "),
+            call("Mandarin zh Q727694 "),
+            call("Nigerian pi Q33655 "),
+            call("Nynorsk nn Q25164 "),
+            call("Polish pl Q809 "),
+            call("Portuguese pt Q5146 "),
+            call("Russian ru Q7737 "),
+            call("Shahmukhi pnb Q58635 "),
+            call("Slovak sk Q9058 "),
+            call("Spanish es Q1321 "),
+            call("Swahili sw Q7838 "),
+            call("Swedish sv Q9027 "),
+            call("Tajik tg Q9260 "),
+            call("Tamil ta Q5885 "),
+            call("Ukrainian ua Q8798 "),
+            call("Urdu ur Q11051 "),
+            call("Yoruba yo Q34311 "),
+            call("--------------------------"),
             call(),
         ]
         mock_print.assert_has_calls(expected_calls)
@@ -80,6 +112,7 @@ def test_list_data_types_specific_language(self, mock_print):
             call("Available data types: English"),
             call("-----------------------------"),
             call("adjectives"),
+            call("adverbs"),
             call("emoji-keywords"),
             call("nouns"),
             call("verbs"),

From 881c0553ece0246a7910cf2285f1d80b1013b1a4 Mon Sep 17 00:00:00 2001
From: Omar Agiez
Date: Wed, 16 Oct 2024 20:28:44 +0300
Subject: [PATCH 25/30] removing .capitalize method since it's already
 implemented inside languages listing functions
---
 src/scribe_data/cli/list.py |  6 ++---
 tests/cli/test_list.py      | 52 ++++++++++++++++++++++++++++++------
 2 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
index 447d59060..ee3311ede 100644
--- a/src/scribe_data/cli/list.py
+++ b/src/scribe_data/cli/list.py
@@ -137,11 +137,11 @@ def list_languages_for_data_type(data_type: str) -> None:
     available_languages = []
     for lang in all_languages:
         lang = format_sublanguage_name(lang, language_metadata)
-        language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang.capitalize()
+        language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang
         if language_dir.is_dir():
             dt_path = language_dir / data_type
             if dt_path.exists():
-                available_languages.append(lang["language"])
+                available_languages.append(lang)

     available_languages.sort()
     table_header = f"Available languages: {data_type}"
@@ -154,7 +154,7 @@ def list_languages_for_data_type(data_type: str) -> None:
     print("-" * table_line_length)

     for lang in available_languages:
-        print(f"{lang.capitalize()}")
+        print(f"{lang}")

     print("-" * table_line_length)
     print()

diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py
index 3933082f6..cad0fa549 100644
--- a/tests/cli/test_list.py
+++ b/tests/cli/test_list.py
@@ -97,6 +97,8 @@ def test_list_data_types_all_languages(self, mock_print):
             call("adverbs"),
             call("emoji-keywords"),
             call("nouns"),
+            call("personal-pronouns"),
+            call("postpositions"),
             call("prepositions"),
             call("verbs"),
             call("-----------------------------------"),
@@ -175,16 +177,48 @@ def test_list_languages_for_data_type_valid(self, mock_print):
         list_languages_for_data_type("nouns")
         expected_calls = [
             call(),
-            call("Available languages: nouns"),
+            call("Language ISO QID "),
             call("--------------------------"),
-            call("English"),
-            call("French"),
-            call("German"),
-            call("Italian"),
-            call("Portuguese"),
-            call("Russian"),
-            call("Spanish"),
-            call("Swedish"),
+            call("Arabic ar Q13955 "),
+            call("Basque eu Q8752 "),
+            call("Bengali bn Q9610 "),
+            call("Bokmål nb Q25167 "),
+            call("Czech cs Q9056 "),
+            call("Danish da Q9035 "),
+            call("English en Q1860 "),
+            call("Esperanto eo Q143 "),
+            call("Estonian et Q9072 "),
+            call("Finnish fi Q1412 "),
+            call("French fr Q150 
"), + call("German de Q188 "), + call("Greek el Q36510 "), + call("Gurmukhi pa Q58635 "), + call("Hausa ha Q56475 "), + call("Hebrew he Q9288 "), + call("Hindi hi Q11051 "), + call("Indonesian id Q9240 "), + call("Italian it Q652 "), + call("Japanese ja Q5287 "), + call("Kurmanji kmr Q36163 "), + call("Latin la Q397 "), + call("Malay ms Q9237 "), + call("Malayalam ml Q36236 "), + call("Mandarin zh Q727694 "), + call("Nigerian pi Q33655 "), + call("Nynorsk nn Q25164 "), + call("Polish pl Q809 "), + call("Portuguese pt Q5146 "), + call("Russian ru Q7737 "), + call("Shahmukhi pnb Q58635 "), + call("Slovak sk Q9058 "), + call("Spanish es Q1321 "), + call("Swahili sw Q7838 "), + call("Swedish sv Q9027 "), + call("Tajik tg Q9260 "), + call("Tamil ta Q5885 "), + call("Ukrainian ua Q8798 "), + call("Urdu ur Q11051 "), + call("Yoruba yo Q34311 "), call("--------------------------"), call(), ] From fed80b391b073fa8adc7657020236ab118cdc84a Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Wed, 16 Oct 2024 21:35:09 +0300 Subject: [PATCH 26/30] Updating test cases in test_list.py file to match newly added languages --- tests/cli/test_list.py | 82 +++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py index cad0fa549..bc31f38f2 100644 --- a/tests/cli/test_list.py +++ b/tests/cli/test_list.py @@ -177,48 +177,48 @@ def test_list_languages_for_data_type_valid(self, mock_print): list_languages_for_data_type("nouns") expected_calls = [ call(), - call("Language ISO QID "), + call("Available languages: nouns"), call("--------------------------"), - call("Arabic ar Q13955 "), - call("Basque eu Q8752 "), - call("Bengali bn Q9610 "), - call("Bokmål nb Q25167 "), - call("Czech cs Q9056 "), - call("Danish da Q9035 "), - call("English en Q1860 "), - call("Esperanto eo Q143 "), - call("Estonian et Q9072 "), - call("Finnish fi Q1412 "), - call("French fr Q150 "), - call("German de Q188 "), - call("Greek el Q36510 "), - call("Gurmukhi pa Q58635 "), - call("Hausa ha Q56475 "), - call("Hebrew he Q9288 "), - call("Hindi hi Q11051 "), - call("Indonesian id Q9240 "), - call("Italian it Q652 "), - call("Japanese ja Q5287 "), - call("Kurmanji kmr Q36163 "), - call("Latin la Q397 "), - call("Malay ms Q9237 "), - call("Malayalam ml Q36236 "), - call("Mandarin zh Q727694 "), - call("Nigerian pi Q33655 "), - call("Nynorsk nn Q25164 "), - call("Polish pl Q809 "), - call("Portuguese pt Q5146 "), - call("Russian ru Q7737 "), - call("Shahmukhi pnb Q58635 "), - call("Slovak sk Q9058 "), - call("Spanish es Q1321 "), - call("Swahili sw Q7838 "), - call("Swedish sv Q9027 "), - call("Tajik tg Q9260 "), - call("Tamil ta Q5885 "), - call("Ukrainian ua Q8798 "), - call("Urdu ur Q11051 "), - call("Yoruba yo Q34311 "), + call("Arabic"), + call("Basque"), + call("Bengali"), + call("Chinese/Mandarin"), + call("Czech"), + call("Danish"), + call("English"), + call("Esperanto"), + call("Estonian"), + call("Finnish"), + call("French"), + call("German"), + call("Greek"), + call("Hausa"), + call("Hebrew"), + call("Hindustani/Hindi"), + call("Hindustani/Urdu"), + call("Indonesian"), + call("Italian"), + call("Japanese"), + call("Kurmanji"), + call("Latin"), + call("Malay"), + call("Malayalam"), + call("Norwegian/Bokmål"), + call("Norwegian/Nynorsk"), + call("Pidgin/Nigerian"), + call("Polish"), + call("Portuguese"), + call("Punjabi/Gurmukhi"), + call("Punjabi/Shahmukhi"), + call("Russian"), + call("Slovak"), + call("Spanish"), + call("Swahili"), + 
call("Swedish"), + call("Tajik"), + call("Tamil"), + call("Ukrainian"), + call("Yoruba"), call("--------------------------"), call(), ] From e6140e5052d2994bd6ff5da78a11e63448d144c7 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Thu, 17 Oct 2024 00:31:59 +0300 Subject: [PATCH 27/30] Update test cases to include sub-languages - Updated all test cases to account for sub-languages. - Removed tests for est_get_language_words_to_remove and est_get_language_words_to_ignore, as these functions were deleted from utils.py and the languages metadata files --- tests/load/test_update_utils.py | 123 ++++++++++---------------------- 1 file changed, 36 insertions(+), 87 deletions(-) diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py index 638ee09dd..489abc4b8 100644 --- a/tests/load/test_update_utils.py +++ b/tests/load/test_update_utils.py @@ -38,14 +38,46 @@ def test_get_scribe_languages(): test_case.assertCountEqual( utils.get_scribe_languages(), [ + "Arabic", + "Basque", + "Bengali", + "Bokmål", + "Czech", + "Danish", "English", + "Esperanto", + "Estonian", + "Finnish", "French", "German", + "Greek", + "Gurmukhi", + "Hausa", + "Hebrew", + "Hindi", + "Indonesian", "Italian", + "Japanese", + "Kurmanji", + "Latin", + "Malay", + "Malayalam", + "Mandarin", + "Nigerian", + "Nynorsk", + "Polish", "Portuguese", "Russian", + "Shahmukhi", + "Slovak", "Spanish", + "Swahili", "Swedish", + "Tajik", + "Tamil", + "Ukrainian", + "Urdu", + "Yoruba", ], ) @@ -61,6 +93,7 @@ def test_get_scribe_languages(): ("russian", "Q7737"), ("spanish", "Q1321"), ("swedish", "Q9027"), + ("bokmål", "Q25167"), ], ) def test_get_language_qid_positive(language, qid_code): @@ -88,6 +121,7 @@ def test_get_language_qid_negative(): ("russian", "ru"), ("spanish", "es"), ("SwedisH", "sv"), + ("bokmål", "nb"), ], ) def test_get_language_iso_positive(language, iso_code): @@ -100,7 +134,7 @@ def test_get_language_iso_negative(): assert ( str(excp.value) - == "Gibberish is currently not a supported language for ISO conversion." + == "GIBBERISH is currently not a supported language for ISO conversion." ) @@ -115,6 +149,7 @@ def test_get_language_iso_negative(): ("ru", "Russian"), ("es", "Spanish"), ("sv", "Swedish"), + ("nb", "Bokmål"), ], ) def test_get_language_from_iso_positive(iso_code, language): @@ -128,92 +163,6 @@ def test_get_language_from_iso_negative(): assert str(excp.value) == "IXI is currently not a supported ISO language." -@pytest.mark.parametrize( - "language, remove_words", - [ - ( - "english", - [ - "of", - "the", - "The", - "and", - ], - ), - ( - "french", - [ - "of", - "the", - "The", - "and", - ], - ), - ("german", ["of", "the", "The", "and", "NeinJa", "et", "redirect"]), - ("italian", ["of", "the", "The", "and", "text", "from"]), - ("portuguese", ["of", "the", "The", "and", "jbutadptflora"]), - ( - "russian", - [ - "of", - "the", - "The", - "and", - ], - ), - ("spanish", ["of", "the", "The", "and"]), - ("swedish", ["of", "the", "The", "and", "Checklist", "Catalogue"]), - ], -) -def test_get_language_words_to_remove(language, remove_words): - test_case = unittest.TestCase() - - # ignore order, only content matters - test_case.assertCountEqual( - utils.get_language_words_to_remove(language), remove_words - ) - - -def test_get_language_words_to_remove_negative(): - with pytest.raises(ValueError) as excp: - _ = utils.get_language_words_to_remove("python") - - assert str(excp.value) == "Python is currently not a supported language." 
- - -@pytest.mark.parametrize( - "language, ignore_words", - [ - ( - "french", - [ - "XXe", - ], - ), - ("german", ["Gemeinde", "Familienname"]), - ("italian", ["The", "ATP"]), - ("portuguese", []), - ("russian", []), - ("spanish", []), - ("swedish", ["databasdump"]), - ], -) -def test_get_language_words_to_ignore(language, ignore_words): - test_case = unittest.TestCase() - - # ignore order, only content matters - test_case.assertCountEqual( - utils.get_language_words_to_ignore(language), ignore_words - ) - - -def test_get_language_words_to_ignore_negative(): - with pytest.raises(ValueError) as excp: - _ = utils.get_language_words_to_ignore("JAVA") - - assert str(excp.value) == "Java is currently not a supported language." - - def test_get_ios_data_path(): assert ( utils.get_ios_data_path("suomi") From 22791cec7696ff87b086d772f1b4d6ed07eff3ad Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Thu, 17 Oct 2024 01:37:28 +0300 Subject: [PATCH 28/30] Updated the get_language_from_iso function to depend on the JSON file. Made the language_metadata parameter optional in two functions. Added a ValueError exception when a language is not found. --- src/scribe_data/utils.py | 47 +++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index b4da68647..df22a9a9a 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -26,8 +26,6 @@ from pathlib import Path from typing import Any, Optional -from iso639 import Lang -from iso639.exceptions import DeprecatedLanguageValue PROJECT_ROOT = "Scribe-Data" DEFAULT_JSON_EXPORT_DIR = "scribe_data_json_export" @@ -198,13 +196,20 @@ def get_language_from_iso(iso: str) -> str: str The name for the language which has an ISO value of iso. """ - try: - language_name = str(Lang(iso.lower()).name) - except DeprecatedLanguageValue as e: - raise ValueError( - f"{iso.upper()} is currently not a supported ISO language." - ) from e - return language_name + # Iterate over the languages and their properties + for language, properties in _languages.items(): + # Check if the current language's ISO matches the provided ISO + if properties.get("iso") == iso: + return language.capitalize() + + # If there are sub-languages, check those as well + if "sub_languages" in properties: + for sub_lang, sub_properties in properties["sub_languages"].items(): + if sub_properties.get("iso") == iso: + return sub_lang.capitalize() + + # If no match is found, raise a ValueError + raise ValueError(f"{iso.upper()} is currently not a supported ISO language.") def load_queried_data( @@ -490,10 +495,10 @@ def order_annotations(annotation: str) -> str: return "/".join(annotation_split) -def format_sublanguage_name(lang, language_metadata): +def format_sublanguage_name(lang, language_metadata=_languages): """ Formats the name of a sub-language by appending its main language - in the format 'mainlang/sublang'. If the language is not a sub-language, + in the format 'Mainlang/Sublang'. If the language is not a sub-language, the original language name is returned as-is. Args: @@ -503,30 +508,36 @@ def format_sublanguage_name(lang, language_metadata): Returns: str: The formatted language name if it's a sub-language - (e.g., 'norwegian/nynorsk'), otherwise the original name. + (e.g., 'Norwegian/Nynorsk'), otherwise the original name. + + Raises: + ValueError: If the provided language or sub-language is not found. 
Example: format_sublanguage_name("nynorsk", language_metadata) - 'norwegian/nynorsk' + 'Norwegian/Nynorsk' format_sublanguage_name("english", language_metadata) - 'english' + 'English' """ # Iterate through the main languages in the metadata for main_lang, lang_data in language_metadata.items(): + # If it's not a sub-language, return the original name + if main_lang == lang.lower(): + return lang.capitalize() # Check if the main language has sub-languages if "sub_languages" in lang_data: # Check if the provided language is a sub-language for sub_lang in lang_data["sub_languages"]: if lang.lower() == sub_lang.lower(): - # Return the formatted name mainlang/sublang + # Return the formatted name Mainlang/Sublang return f"{main_lang.capitalize()}/{sub_lang.capitalize()}" - # If it's not a sub-language, return the original name - return lang.capitalize() + # Raise ValueError if no match is found + raise ValueError(f"{lang.upper()} is not a valid language or sub-language.") -def list_all_languages(language_metadata): +def list_all_languages(language_metadata=_languages): """List all languages from the provided metadata dictionary, including sub-languages.""" current_languages = [] From 1416134a84c99227998212fb13bc5fa83d29c66b Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Thu, 17 Oct 2024 01:39:25 +0300 Subject: [PATCH 29/30] Add unit tests for language formatting and listing: - Positive and negative tests for format_sublanguage_name - Test to validate the output of list_all_languages --- tests/load/test_update_utils.py | 66 +++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py index 489abc4b8..df37317a3 100644 --- a/tests/load/test_update_utils.py +++ b/tests/load/test_update_utils.py @@ -163,6 +163,72 @@ def test_get_language_from_iso_negative(): assert str(excp.value) == "IXI is currently not a supported ISO language." +@pytest.mark.parametrize( + "lang, expected_output", + [ + ("nynorsk", "Norwegian/Nynorsk"), + ("bokmål", "Norwegian/Bokmål"), + ("english", "English"), + ], +) +def test_format_sublanguage_name_positive(lang, expected_output): + assert utils.format_sublanguage_name(lang) == expected_output + + +def test_format_sublanguage_name_negative(): + with pytest.raises(ValueError) as excp: + _ = utils.format_sublanguage_name("soccer") + + assert str(excp.value) == "SOCCER is not a valid language or sub-language." 
+ + +def test_list_all_languages(): + expected_languages = [ + "arabic", + "basque", + "bengali", + "czech", + "danish", + "english", + "esperanto", + "estonian", + "finnish", + "french", + "german", + "greek", + "hausa", + "hebrew", + "hindi", + "urdu", + "indonesian", + "italian", + "japanese", + "kurmanji", + "latin", + "malay", + "malayalam", + "mandarin", + "nynorsk", + "bokmål", + "nigerian", + "polish", + "portuguese", + "shahmukhi", + "gurmukhi", + "russian", + "slovak", + "spanish", + "swahili", + "swedish", + "tajik", + "tamil", + "ukrainian", + "yoruba", + ] + + assert utils.list_all_languages() == expected_languages + + def test_get_ios_data_path(): assert ( utils.get_ios_data_path("suomi") From 661b131cff45f947d3d33eac705363bd8c0944f9 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Fri, 18 Oct 2024 03:05:02 +0200 Subject: [PATCH 30/30] Edits to language metadata and supporting functions + pr checklist --- .github/PULL_REQUEST_TEMPLATE.md | 1 + CONTRIBUTING.md | 11 ++ src/scribe_data/cli/cli_utils.py | 81 +++++----- src/scribe_data/cli/list.py | 9 +- src/scribe_data/cli/total.py | 13 +- .../resources/language_metadata.json | 32 ++-- src/scribe_data/utils.py | 150 +++++++++--------- tests/cli/test_utils.py | 10 +- tests/load/test_update_utils.py | 62 +------- 9 files changed, 158 insertions(+), 211 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index bab97a1a8..17c07e1c1 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -7,6 +7,7 @@ Thank you for your pull request! 🚀 - [] This pull request is on a [separate branch](https://docs.github.com/en/get-started/quickstart/github-flow) and not the main branch +- [] I have tested my code with the `pytest` command as directed in the [testing section of the contributing guide](https://github.com/scribe-org/Scribe-Data/blob/main/CONTRIBUTING.md#testing) --- diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 376a954a7..2e44c618e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,6 +15,7 @@ If you have questions or would like to communicate with the team, please [join u - [First steps as a contributor](#first-steps) - [Learning the tech stack](#learning-the-tech) - [Development environment](#dev-env) +- [Testing](#testing) - [Issues and projects](#issues-projects) - [Bug reports](#bug-reports) - [Feature requests](#feature-requests) @@ -171,6 +172,16 @@ pip install -e . > [!NOTE] > Feel free to contact the team in the [Data room on Matrix](https://matrix.to/#/#ScribeData:matrix.org) if you're having problems getting your environment setup! + + +## Testing [`⇧`](#contents) + +In addition to the [pre-commit](https://pre-commit.com/) hooks that are set up during the [development environment section](#dev-env), Scribe-Data also includes a testing suite that should be ran before all pull requests and subsequent commits. 
 
 ## Issues and projects [`⇧`](#contents)
 
diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py
index be2fa0f79..e39e1621d 100644
--- a/src/scribe_data/cli/cli_utils.py
+++ b/src/scribe_data/cli/cli_utils.py
@@ -27,6 +27,8 @@
 
 from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR
 
+# MARK: CLI Variables
+
 LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction"
 
 LANGUAGE_METADATA_FILE = (
@@ -56,20 +58,21 @@
 language_map = {}
 language_to_qid = {}
 
-# Process each language and its potential sub-languages in one pass
-for lang_key, lang_data in language_metadata.items():
-    lang_key_lower = lang_key.lower()
+# Process each language and its potential sub-languages in one pass.
+for lang, lang_data in language_metadata.items():
+    lang_lower = lang.lower()
 
-    # Handle sub-languages if they exist
+    # Handle sub-languages if they exist.
     if "sub_languages" in lang_data:
-        for sub_lang_key, sub_lang_data in lang_data["sub_languages"].items():
-            sub_lang_key_lower = sub_lang_key.lower()
-            language_map[sub_lang_key_lower] = sub_lang_data
-            language_to_qid[sub_lang_key_lower] = sub_lang_data["qid"]
+        for sub_lang, sub_lang_data in lang_data["sub_languages"].items():
+            sub_lang_lower = sub_lang.lower()
+            language_map[sub_lang_lower] = sub_lang_data
+            language_to_qid[sub_lang_lower] = sub_lang_data["qid"]
+
     else:
-        # Handle the main language directly
-        language_map[lang_key_lower] = lang_data
-        language_to_qid[lang_key_lower] = lang_data["qid"]
+        # Handle the main language directly.
+        language_map[lang_lower] = lang_data
+        language_to_qid[lang_lower] = lang_data["qid"]
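+
+# Editor's sketch (illustrative comments, not part of the original patch):
+# after this loop both maps are flat, with sub-languages sitting alongside
+# main languages, e.g.:
+#     language_to_qid["nynorsk"]  # 'Q25164'
+#     language_to_qid["bokmål"]   # 'Q25167'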
{sub_value}") + + else: + print(f" {item}") + + else: + print(f"{key:<{max_key_length}} : {value}") elif isinstance(data, list): for item in data: @@ -211,12 +210,12 @@ def validate_single_item(item, valid_options, item_type): ): closest_match = difflib.get_close_matches(item, valid_options, n=1) closest_match_str = ( - f" The closest matching {item_type} is {closest_match[0]}." + f" The closest matching {item_type} is '{closest_match[0]}'." if closest_match else "" ) - return f"Invalid {item_type} {item}.{closest_match_str}" + return f"Invalid {item_type} '{item}'.{closest_match_str}" return None diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index ee3311ede..762d3bfca 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -21,16 +21,16 @@ """ from scribe_data.cli.cli_utils import ( + LANGUAGE_DATA_EXTRACTION_DIR, correct_data_type, - language_metadata, language_map, - LANGUAGE_DATA_EXTRACTION_DIR, + language_metadata, ) from scribe_data.utils import ( - list_all_languages, + format_sublanguage_name, get_language_iso, get_language_qid, - format_sublanguage_name, + list_all_languages, ) @@ -39,7 +39,6 @@ def list_languages() -> None: Generates a table of languages, their ISO-2 codes and their Wikidata QIDs. """ languages = list_all_languages(language_metadata) - languages.sort() language_col_width = max(len(lang) for lang in languages) + 2 iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2 diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 5530ef5db..885d9b3e9 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -29,8 +29,8 @@ language_metadata, language_to_qid, ) +from scribe_data.utils import format_sublanguage_name, list_all_languages from scribe_data.wikidata.wikidata_utils import sparql -from scribe_data.utils import list_all_languages, format_sublanguage_name def get_qid_by_input(input_str): @@ -73,9 +73,8 @@ def get_datatype_list(language): A list of the corresponding data types. 
""" languages = list_all_languages(language_metadata) - language_list = [lang for lang in languages] - if language.lower() in language_list: + if language.lower() in languages: language_data = language_map.get(language.lower()) language_capitalized = format_sublanguage_name( language, language_metadata @@ -134,13 +133,9 @@ def print_total_lexemes(language: str = None): print("=" * 64) if language is None: # all languages - languages = list_all_languages( - language_metadata - ) # this returns a list of language names - language_list = languages # sorts the list in place - language_list.sort() + languages = list_all_languages(language_metadata) - for lang in language_list: + for lang in languages: data_types = get_datatype_list(lang) first_row = True diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json index 00a8d405c..7ab2145bf 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -11,6 +11,14 @@ "iso": "bn", "qid": "Q9610" }, + "chinese": { + "sub_languages": { + "mandarin": { + "iso": "zh", + "qid": "Q727694" + } + } + }, "czech": { "iso": "cs", "qid": "Q9056" @@ -95,23 +103,15 @@ "iso": "ml", "qid": "Q36236" }, - "chinese": { - "sub_languages": { - "mandarin": { - "iso": "zh", - "qid": "Q727694" - } - } - }, "norwegian": { "sub_languages": { - "nynorsk": { - "iso": "nn", - "qid": "Q25164" - }, "bokmål": { "iso": "nb", "qid": "Q25167" + }, + "nynorsk": { + "iso": "nn", + "qid": "Q25164" } } }, @@ -133,13 +133,13 @@ }, "punjabi": { "sub_languages": { - "shahmukhi": { - "iso": "pnb", - "qid": "Q58635" - }, "gurmukhi": { "iso": "pa", "qid": "Q58635" + }, + "shahmukhi": { + "iso": "pnb", + "qid": "Q58635" } } }, diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index df22a9a9a..3c2007640 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -26,7 +26,6 @@ from pathlib import Path from typing import Any, Optional - PROJECT_ROOT = "Scribe-Data" DEFAULT_JSON_EXPORT_DIR = "scribe_data_json_export" DEFAULT_CSV_EXPORT_DIR = "scribe_data_csv_export" @@ -53,8 +52,7 @@ def _load_json(package_path: str, file_name: str) -> Any: with resources.files(package_path).joinpath(file_name).open( encoding="utf-8" ) as in_stream: - contents = json.load(in_stream) - return contents # No need for 'root' + return json.load(in_stream) _languages = _load_json( @@ -90,13 +88,13 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - ------ ValueError : when a source_value is not supported or the language only has sub-languages. """ - norm_source_value = source_value.lower() - - # Check if we're searching by language name + # Check if we're searching by language name. if source_key == "language": - # First, check the main language entries (e.g., mandarin, french, etc.) + norm_source_value = source_value.lower() + + # First, check the main language entries (e.g., mandarin, french, etc.). for language, entry in _languages.items(): - # If the language name matches the top-level key, return the target value + # If the language name matches the top-level key, return the target value. if language.lower() == norm_source_value: if "sub_languages" in entry: sub_languages = ", ".join(entry["sub_languages"].keys()) @@ -105,37 +103,16 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - ) return entry.get(target_key) - # If there are sub-languages, check them too + # If there are sub-languages, check them too. 
if "sub_languages" in entry: for sub_language, sub_entry in entry["sub_languages"].items(): if sub_language.lower() == norm_source_value: return sub_entry.get(target_key) - # If no match was found, raise an error + # If no match was found, raise an error. raise ValueError(error_msg) -def get_scribe_languages() -> list[str]: - """ - Returns the list of currently implemented Scribe languages. - This version handles both regular languages and those with sub-languages (e.g., Norwegian). - """ - languages = [] - - for language, entry in _languages.items(): - # Add the main language (if it's directly queryable) - if "sub_languages" not in entry: - languages.append(language.capitalize()) - - # If there are sub-languages, add them instead - if "sub_languages" in entry: - languages.extend( - sub_language.capitalize() for sub_language in entry["sub_languages"] - ) - - return sorted(languages) - - def get_language_qid(language: str) -> str: """ Returns the QID of the given language. @@ -173,13 +150,12 @@ def get_language_iso(language: str) -> str: The ISO code for the language. """ - iso_code = _find( + return _find( "language", language, "iso", f"{language.upper()} is currently not a supported language for ISO conversion.", ) - return iso_code def get_language_from_iso(iso: str) -> str: @@ -433,20 +409,25 @@ def map_genders(wikidata_gender: str) -> str: ---------- wikidata_gender : str The gender of the noun that was queried from WikiData. + + Returns + ------- + The gender value corrected in case the Wikidata ID was queried. """ gender_map = { - "masculine": "M", - "Q499327": "M", - "feminine": "F", - "Q1775415": "F", - "common gender": "C", - "Q1305037": "C", - "neuter": "N", - "Q1775461": "N", + "masculine": "masculine", + "Q499327": "masculine", + "feminine": "feminine", + "Q1775415": "feminine", + "common": "common", + "common gender": "common", + "Q1305037": "common", + "neuter": "neuter", + "Q1775461": "neuter", } return gender_map.get( - wikidata_gender, "" + wikidata_gender.lower(), "" ) # nouns could have a gender that is not a valid attribute @@ -458,20 +439,24 @@ def map_cases(wikidata_case: str) -> str: ---------- wikidata_case : str The case of the noun that was queried from WikiData. + + Returns + ------- + The case value corrected in case the Wikidata ID was queried. """ case_map = { - "accusative": "Acc", - "Q146078": "Acc", - "dative": "Dat", - "Q145599": "Dat", - "genitive": "Gen", - "Q146233": "Gen", - "instrumental": "Ins", - "Q192997": "Ins", - "prepositional": "Pre", - "Q2114906": "Pre", - "locative": "Loc", - "Q202142": "Loc", + "accusative": "accusative", + "Q146078": "accusative", + "dative": "dative", + "Q145599": "dative", + "genitive": "genitive", + "Q146233": "genitive", + "instrumental": "instrumental", + "Q192997": "instrumental", + "prepositional": "prepositional", + "Q2114906": "prepositional", + "locative": "locative", + "Q202142": "locative", } case = wikidata_case.split(" case")[0] return case_map.get(case, "") @@ -498,57 +483,66 @@ def order_annotations(annotation: str) -> str: def format_sublanguage_name(lang, language_metadata=_languages): """ Formats the name of a sub-language by appending its main language - in the format 'Mainlang/Sublang'. If the language is not a sub-language, + in the format 'MAIN_LANG/SUB_LANG'. If the language is not a sub-language, the original language name is returned as-is. - Args: - lang (str): The name of the language or sub-language to format. 
@@ -498,57 +483,66 @@ def order_annotations(annotation: str) -> str:
 def format_sublanguage_name(lang, language_metadata=_languages):
     """
     Formats the name of a sub-language by appending its main language
-    in the format 'Mainlang/Sublang'. If the language is not a sub-language,
+    in the format 'MAIN_LANG/SUB_LANG'. If the language is not a sub-language,
     the original language name is returned as-is.
 
-    Args:
-        lang (str): The name of the language or sub-language to format.
-        language_metadata (dict): The metadata containing information about
-            main languages and their sub-languages.
+    Parameters
+    ----------
+    lang : str
+        The name of the language or sub-language to format.
+
+    language_metadata : dict
+        The metadata containing information about main languages and their sub-languages.
 
-    Returns:
-        str: The formatted language name if it's a sub-language
-        (e.g., 'Norwegian/Nynorsk'), otherwise the original name.
+    Returns
+    -------
+    str
+        The formatted language name if it's a sub-language (e.g., 'Norwegian/Nynorsk').
+        Otherwise the original name.
 
-    Raises:
+    Raises
+    ------
     ValueError: If the provided language or sub-language is not found.
 
-    Example:
-        format_sublanguage_name("nynorsk", language_metadata)
+    Example
+    -------
+    > format_sublanguage_name("nynorsk", language_metadata)
     'Norwegian/Nynorsk'
 
-        format_sublanguage_name("english", language_metadata)
+    > format_sublanguage_name("english", language_metadata)
     'English'
     """
-    # Iterate through the main languages in the metadata
     for main_lang, lang_data in language_metadata.items():
-        # If it's not a sub-language, return the original name
+        # If it's not a sub-language, return the original name.
        if main_lang == lang.lower():
            return lang.capitalize()
-        # Check if the main language has sub-languages
+
+        # Check if the main language has sub-languages.
         if "sub_languages" in lang_data:
-            # Check if the provided language is a sub-language
+            # Check if the provided language is a sub-language.
             for sub_lang in lang_data["sub_languages"]:
                 if lang.lower() == sub_lang.lower():
-                    # Return the formatted name Mainlang/Sublang
+                    # Return the formatted name MAIN_LANG/SUB_LANG.
                     return f"{main_lang.capitalize()}/{sub_lang.capitalize()}"
 
-    # Raise ValueError if no match is found
+    # Raise ValueError if no match is found.
     raise ValueError(f"{lang.upper()} is not a valid language or sub-language.")
 
 
 def list_all_languages(language_metadata=_languages):
-    """List all languages from the provided metadata dictionary, including sub-languages."""
+    """
+    Returns a sorted list of all languages from the provided metadata dictionary, including sub-languages.
+    """
     current_languages = []
 
-    # Iterate through the language metadata
+    # Iterate through the language metadata.
     for lang_key, lang_data in language_metadata.items():
-        # Check if there are sub-languages
+        # Check if there are sub-languages.
         if "sub_languages" in lang_data:
-            # Add the sub-languages to current_languages
+            # Add the sub-languages to current_languages.
             current_languages.extend(lang_data["sub_languages"].keys())
         else:
-            # If no sub-languages, add the main language
+            # If no sub-languages, add the main language.
             current_languages.append(lang_key)
 
-    return current_languages
+    return sorted(current_languages)
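
> Editor's note (illustrative, not part of the patch): `list_all_languages()`
> returns only leaf names, so sub-languages stand in for their parents:
> "nynorsk" and "bokmål" appear in the sorted output below while "norwegian"
> itself does not.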
"shahmukhi", - "gurmukhi", "russian", + "shahmukhi", "slovak", "spanish", "swahili", @@ -223,6 +170,7 @@ def test_list_all_languages(): "tajik", "tamil", "ukrainian", + "urdu", "yoruba", ]