diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index bab97a1a8..17c07e1c1 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -7,6 +7,7 @@ Thank you for your pull request! 🚀 - [] This pull request is on a [separate branch](https://docs.github.com/en/get-started/quickstart/github-flow) and not the main branch +- [] I have tested my code with the `pytest` command as directed in the [testing section of the contributing guide](https://github.com/scribe-org/Scribe-Data/blob/main/CONTRIBUTING.md#testing) --- diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 376a954a7..2e44c618e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,6 +15,7 @@ If you have questions or would like to communicate with the team, please [join u - [First steps as a contributor](#first-steps) - [Learning the tech stack](#learning-the-tech) - [Development environment](#dev-env) +- [Testing](#testing) - [Issues and projects](#issues-projects) - [Bug reports](#bug-reports) - [Feature requests](#feature-requests) @@ -171,6 +172,16 @@ pip install -e . > [!NOTE] > Feel free to contact the team in the [Data room on Matrix](https://matrix.to/#/#ScribeData:matrix.org) if you're having problems getting your environment setup! + + +## Testing [`⇧`](#contents) + +In addition to the [pre-commit](https://pre-commit.com/) hooks that are set up during the [development environment section](#dev-env), Scribe-Data also includes a testing suite that should be run before all pull requests and subsequent commits. 
Please run the following in the project root: + +```bash +pytest +``` + ## Issues and projects [`⇧`](#contents) diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py index be2fa0f79..e39e1621d 100644 --- a/src/scribe_data/cli/cli_utils.py +++ b/src/scribe_data/cli/cli_utils.py @@ -27,6 +27,8 @@ from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR +# MARK: CLI Variables + LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction" LANGUAGE_METADATA_FILE = ( @@ -56,20 +58,21 @@ language_map = {} language_to_qid = {} -# Process each language and its potential sub-languages in one pass -for lang_key, lang_data in language_metadata.items(): - lang_key_lower = lang_key.lower() +# Process each language and its potential sub-languages in one pass. +for lang, lang_data in language_metadata.items(): + lang_lower = lang.lower() - # Handle sub-languages if they exist + # Handle sub-languages if they exist. if "sub_languages" in lang_data: - for sub_lang_key, sub_lang_data in lang_data["sub_languages"].items(): - sub_lang_key_lower = sub_lang_key.lower() - language_map[sub_lang_key_lower] = sub_lang_data - language_to_qid[sub_lang_key_lower] = sub_lang_data["qid"] + for sub_lang, sub_lang_data in lang_data["sub_languages"].items(): + sub_lang_lower = sub_lang.lower() + language_map[sub_lang_lower] = sub_lang_data + language_to_qid[sub_lang_lower] = sub_lang_data["qid"] + else: - # Handle the main language directly - language_map[lang_key_lower] = lang_data - language_to_qid[lang_key_lower] = lang_data["qid"] + # Handle the main language directly. 
+ language_map[lang_lower] = lang_data + language_to_qid[lang_lower] = lang_data["qid"] # MARK: Correct Inputs @@ -112,41 +115,37 @@ def print_formatted_data(data: Union[dict, list], data_type: str) -> None: if isinstance(data, dict): max_key_length = max((len(key) for key in data.keys()), default=0) - if data_type == "autosuggestions": - for key, value in data.items(): + for key, value in data.items(): + if data_type == "autosuggestions": print(f"{key:<{max_key_length}} : {', '.join(value)}") - elif data_type == "emoji_keywords": - for key, value in data.items(): + elif data_type == "emoji_keywords": emojis = [item["emoji"] for item in value] print(f"{key:<{max_key_length}} : {' '.join(emojis)}") - elif data_type in {"prepositions"}: - for key, value in data.items(): + elif data_type in {"prepositions"}: print(f"{key:<{max_key_length}} : {value}") - else: - for key, value in data.items(): - if isinstance(value, dict): - print(f"{key:<{max_key_length}} : ") - max_sub_key_length = max( - (len(sub_key) for sub_key in value.keys()), default=0 - ) - for sub_key, sub_value in value.items(): - print(f" {sub_key:<{max_sub_key_length}} : {sub_value}") - - elif isinstance(value, list): - print(f"{key:<{max_key_length}} : ") - for item in value: - if isinstance(item, dict): - for sub_key, sub_value in item.items(): - print(f" {sub_key:<{max_key_length}} : {sub_value}") - - else: - print(f" {item}") - - else: - print(f"{key:<{max_key_length}} : {value}") + elif isinstance(value, dict): + print(f"{key:<{max_key_length}} : ") + max_sub_key_length = max( + (len(sub_key) for sub_key in value.keys()), default=0 + ) + for sub_key, sub_value in value.items(): + print(f" {sub_key:<{max_sub_key_length}} : {sub_value}") + + elif isinstance(value, list): + print(f"{key:<{max_key_length}} : ") + for item in value: + if isinstance(item, dict): + for sub_key, sub_value in item.items(): + print(f" {sub_key:<{max_key_length}} : {sub_value}") + + else: + print(f" {item}") + + else: + 
print(f"{key:<{max_key_length}} : {value}") elif isinstance(data, list): for item in data: @@ -211,12 +210,12 @@ def validate_single_item(item, valid_options, item_type): ): closest_match = difflib.get_close_matches(item, valid_options, n=1) closest_match_str = ( - f" The closest matching {item_type} is {closest_match[0]}." + f" The closest matching {item_type} is '{closest_match[0]}'." if closest_match else "" ) - return f"Invalid {item_type} {item}.{closest_match_str}" + return f"Invalid {item_type} '{item}'.{closest_match_str}" return None diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index ee3311ede..762d3bfca 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -21,16 +21,16 @@ """ from scribe_data.cli.cli_utils import ( + LANGUAGE_DATA_EXTRACTION_DIR, correct_data_type, - language_metadata, language_map, - LANGUAGE_DATA_EXTRACTION_DIR, + language_metadata, ) from scribe_data.utils import ( - list_all_languages, + format_sublanguage_name, get_language_iso, get_language_qid, - format_sublanguage_name, + list_all_languages, ) @@ -39,7 +39,6 @@ def list_languages() -> None: Generates a table of languages, their ISO-2 codes and their Wikidata QIDs. """ languages = list_all_languages(language_metadata) - languages.sort() language_col_width = max(len(lang) for lang in languages) + 2 iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2 diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 5530ef5db..885d9b3e9 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -29,8 +29,8 @@ language_metadata, language_to_qid, ) +from scribe_data.utils import format_sublanguage_name, list_all_languages from scribe_data.wikidata.wikidata_utils import sparql -from scribe_data.utils import list_all_languages, format_sublanguage_name def get_qid_by_input(input_str): @@ -73,9 +73,8 @@ def get_datatype_list(language): A list of the corresponding data types. 
""" languages = list_all_languages(language_metadata) - language_list = [lang for lang in languages] - if language.lower() in language_list: + if language.lower() in languages: language_data = language_map.get(language.lower()) language_capitalized = format_sublanguage_name( language, language_metadata @@ -134,13 +133,9 @@ def print_total_lexemes(language: str = None): print("=" * 64) if language is None: # all languages - languages = list_all_languages( - language_metadata - ) # this returns a list of language names - language_list = languages # sorts the list in place - language_list.sort() + languages = list_all_languages(language_metadata) - for lang in language_list: + for lang in languages: data_types = get_datatype_list(lang) first_row = True diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json index 00a8d405c..7ab2145bf 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -11,6 +11,14 @@ "iso": "bn", "qid": "Q9610" }, + "chinese": { + "sub_languages": { + "mandarin": { + "iso": "zh", + "qid": "Q727694" + } + } + }, "czech": { "iso": "cs", "qid": "Q9056" @@ -95,23 +103,15 @@ "iso": "ml", "qid": "Q36236" }, - "chinese": { - "sub_languages": { - "mandarin": { - "iso": "zh", - "qid": "Q727694" - } - } - }, "norwegian": { "sub_languages": { - "nynorsk": { - "iso": "nn", - "qid": "Q25164" - }, "bokmål": { "iso": "nb", "qid": "Q25167" + }, + "nynorsk": { + "iso": "nn", + "qid": "Q25164" } } }, @@ -133,13 +133,13 @@ }, "punjabi": { "sub_languages": { - "shahmukhi": { - "iso": "pnb", - "qid": "Q58635" - }, "gurmukhi": { "iso": "pa", "qid": "Q58635" + }, + "shahmukhi": { + "iso": "pnb", + "qid": "Q58635" } } }, diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index df22a9a9a..3c2007640 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -26,7 +26,6 @@ from pathlib import Path from typing import Any, 
Optional - PROJECT_ROOT = "Scribe-Data" DEFAULT_JSON_EXPORT_DIR = "scribe_data_json_export" DEFAULT_CSV_EXPORT_DIR = "scribe_data_csv_export" @@ -53,8 +52,7 @@ def _load_json(package_path: str, file_name: str) -> Any: with resources.files(package_path).joinpath(file_name).open( encoding="utf-8" ) as in_stream: - contents = json.load(in_stream) - return contents # No need for 'root' + return json.load(in_stream) _languages = _load_json( @@ -90,13 +88,13 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - ------ ValueError : when a source_value is not supported or the language only has sub-languages. """ - norm_source_value = source_value.lower() - - # Check if we're searching by language name + # Check if we're searching by language name. if source_key == "language": - # First, check the main language entries (e.g., mandarin, french, etc.) + norm_source_value = source_value.lower() + + # First, check the main language entries (e.g., mandarin, french, etc.). for language, entry in _languages.items(): - # If the language name matches the top-level key, return the target value + # If the language name matches the top-level key, return the target value. if language.lower() == norm_source_value: if "sub_languages" in entry: sub_languages = ", ".join(entry["sub_languages"].keys()) @@ -105,37 +103,16 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - ) return entry.get(target_key) - # If there are sub-languages, check them too + # If there are sub-languages, check them too. if "sub_languages" in entry: for sub_language, sub_entry in entry["sub_languages"].items(): if sub_language.lower() == norm_source_value: return sub_entry.get(target_key) - # If no match was found, raise an error + # If no match was found, raise an error. raise ValueError(error_msg) -def get_scribe_languages() -> list[str]: - """ - Returns the list of currently implemented Scribe languages. 
- This version handles both regular languages and those with sub-languages (e.g., Norwegian). - """ - languages = [] - - for language, entry in _languages.items(): - # Add the main language (if it's directly queryable) - if "sub_languages" not in entry: - languages.append(language.capitalize()) - - # If there are sub-languages, add them instead - if "sub_languages" in entry: - languages.extend( - sub_language.capitalize() for sub_language in entry["sub_languages"] - ) - - return sorted(languages) - - def get_language_qid(language: str) -> str: """ Returns the QID of the given language. @@ -173,13 +150,12 @@ def get_language_iso(language: str) -> str: The ISO code for the language. """ - iso_code = _find( + return _find( "language", language, "iso", f"{language.upper()} is currently not a supported language for ISO conversion.", ) - return iso_code def get_language_from_iso(iso: str) -> str: @@ -433,20 +409,25 @@ def map_genders(wikidata_gender: str) -> str: ---------- wikidata_gender : str The gender of the noun that was queried from WikiData. + + Returns + ------- + The gender value corrected in case the Wikidata ID was queried. """ gender_map = { - "masculine": "M", - "Q499327": "M", - "feminine": "F", - "Q1775415": "F", - "common gender": "C", - "Q1305037": "C", - "neuter": "N", - "Q1775461": "N", + "masculine": "masculine", + "Q499327": "masculine", + "feminine": "feminine", + "Q1775415": "feminine", + "common": "common", + "common gender": "common", + "Q1305037": "common", + "neuter": "neuter", + "Q1775461": "neuter", } return gender_map.get( - wikidata_gender, "" + wikidata_gender.lower(), "" ) # nouns could have a gender that is not a valid attribute @@ -458,20 +439,24 @@ def map_cases(wikidata_case: str) -> str: ---------- wikidata_case : str The case of the noun that was queried from WikiData. + + Returns + ------- + The case value corrected in case the Wikidata ID was queried. 
""" case_map = { - "accusative": "Acc", - "Q146078": "Acc", - "dative": "Dat", - "Q145599": "Dat", - "genitive": "Gen", - "Q146233": "Gen", - "instrumental": "Ins", - "Q192997": "Ins", - "prepositional": "Pre", - "Q2114906": "Pre", - "locative": "Loc", - "Q202142": "Loc", + "accusative": "accusative", + "Q146078": "accusative", + "dative": "dative", + "Q145599": "dative", + "genitive": "genitive", + "Q146233": "genitive", + "instrumental": "instrumental", + "Q192997": "instrumental", + "prepositional": "prepositional", + "Q2114906": "prepositional", + "locative": "locative", + "Q202142": "locative", } case = wikidata_case.split(" case")[0] return case_map.get(case, "") @@ -498,57 +483,66 @@ def order_annotations(annotation: str) -> str: def format_sublanguage_name(lang, language_metadata=_languages): """ Formats the name of a sub-language by appending its main language - in the format 'Mainlang/Sublang'. If the language is not a sub-language, + in the format 'MAIN_LANG/SUB_LANG'. If the language is not a sub-language, the original language name is returned as-is. - Args: - lang (str): The name of the language or sub-language to format. - language_metadata (dict): The metadata containing information about - main languages and their sub-languages. + Parameters + ---------- + lang : str + The name of the language or sub-language to format. - Returns: - str: The formatted language name if it's a sub-language - (e.g., 'Norwegian/Nynorsk'), otherwise the original name. + language_metadata : dict + The metadata containing information about main languages and their sub-languages. - Raises: + Returns + ------- + str + The formatted language name if it's a sub-language (e.g., 'Norwegian/Nynorsk'). + Otherwise the original name. + + Raises + ------ ValueError: If the provided language or sub-language is not found. 
- Example: - format_sublanguage_name("nynorsk", language_metadata) + Example + ------- + > format_sublanguage_name("nynorsk", language_metadata) 'Norwegian/Nynorsk' - format_sublanguage_name("english", language_metadata) + > format_sublanguage_name("english", language_metadata) 'English' """ - # Iterate through the main languages in the metadata for main_lang, lang_data in language_metadata.items(): - # If it's not a sub-language, return the original name + # If it's not a sub-language, return the original name. if main_lang == lang.lower(): return lang.capitalize() - # Check if the main language has sub-languages + + # Check if the main language has sub-languages. if "sub_languages" in lang_data: - # Check if the provided language is a sub-language + # Check if the provided language is a sub-language. for sub_lang in lang_data["sub_languages"]: if lang.lower() == sub_lang.lower(): - # Return the formatted name Mainlang/Sublang + # Return the formatted name MAIN_LANG/SUB_LANG. return f"{main_lang.capitalize()}/{sub_lang.capitalize()}" - # Raise ValueError if no match is found + # Raise ValueError if no match is found. raise ValueError(f"{lang.upper()} is not a valid language or sub-language.") def list_all_languages(language_metadata=_languages): - """List all languages from the provided metadata dictionary, including sub-languages.""" + """ + Returns a sorted list of all languages from the provided metadata dictionary, including sub-languages. + """ current_languages = [] - # Iterate through the language metadata + # Iterate through the language metadata. for lang_key, lang_data in language_metadata.items(): - # Check if there are sub-languages + # Check if there are sub-languages. if "sub_languages" in lang_data: - # Add the sub-languages to current_languages + # Add the sub-languages to current_languages. 
current_languages.extend(lang_data["sub_languages"].keys()) else: - # If no sub-languages, add the main language + # If no sub-languages, add the main language. current_languages.append(lang_key) - return current_languages + return sorted(current_languages) diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py index a827666a2..333c3b7d7 100644 --- a/tests/cli/test_utils.py +++ b/tests/cli/test_utils.py @@ -187,7 +187,7 @@ def test_validate_language_and_data_type_invalid_language(self, mock_get_qid): language=language_qid, data_type=data_type_qid ) - self.assertEqual(str(context.exception), "Invalid language InvalidLanguage.") + self.assertEqual(str(context.exception), "Invalid language 'InvalidLanguage'.") @patch("scribe_data.cli.total.get_qid_by_input") def test_validate_language_and_data_type_invalid_data_type(self, mock_get_qid): @@ -201,7 +201,7 @@ def test_validate_language_and_data_type_invalid_data_type(self, mock_get_qid): language=language_qid, data_type=data_type_qid ) - self.assertEqual(str(context.exception), "Invalid data-type InvalidDataType.") + self.assertEqual(str(context.exception), "Invalid data-type 'InvalidDataType'.") @patch("scribe_data.cli.total.get_qid_by_input") def test_validate_language_and_data_type_both_invalid(self, mock_get_qid): @@ -217,7 +217,7 @@ def test_validate_language_and_data_type_both_invalid(self, mock_get_qid): self.assertEqual( str(context.exception), - "Invalid language InvalidLanguage.\nInvalid data-type InvalidDataType.", + "Invalid language 'InvalidLanguage'.\nInvalid data-type 'InvalidDataType'.", ) def test_validate_language_and_data_type_with_list(self): @@ -248,5 +248,5 @@ def test_validate_language_and_data_type_mixed_validity_in_lists(self): data_types = ["nouns", "InvalidDataType"] with self.assertRaises(ValueError) as context: validate_language_and_data_type(languages, data_types) - self.assertIn("Invalid language InvalidLanguage", str(context.exception)) - self.assertIn("Invalid data-type 
InvalidDataType", str(context.exception)) + self.assertIn("Invalid language 'InvalidLanguage'", str(context.exception)) + self.assertIn("Invalid data-type 'InvalidDataType'", str(context.exception)) diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py index df37317a3..43eaa2038 100644 --- a/tests/load/test_update_utils.py +++ b/tests/load/test_update_utils.py @@ -21,7 +21,6 @@ """ import sys -import unittest from pathlib import Path import pytest @@ -31,57 +30,6 @@ from scribe_data import utils -def test_get_scribe_languages(): - test_case = unittest.TestCase() - - # test for content, not order - test_case.assertCountEqual( - utils.get_scribe_languages(), - [ - "Arabic", - "Basque", - "Bengali", - "Bokmål", - "Czech", - "Danish", - "English", - "Esperanto", - "Estonian", - "Finnish", - "French", - "German", - "Greek", - "Gurmukhi", - "Hausa", - "Hebrew", - "Hindi", - "Indonesian", - "Italian", - "Japanese", - "Kurmanji", - "Latin", - "Malay", - "Malayalam", - "Mandarin", - "Nigerian", - "Nynorsk", - "Polish", - "Portuguese", - "Russian", - "Shahmukhi", - "Slovak", - "Spanish", - "Swahili", - "Swedish", - "Tajik", - "Tamil", - "Ukrainian", - "Urdu", - "Yoruba", - ], - ) - - @pytest.mark.parametrize( "language, qid_code", [ @@ -187,6 +135,7 @@ def test_list_all_languages(): "arabic", "basque", "bengali", + "bokmål", "czech", "danish", "english", @@ -196,10 +145,10 @@ def test_list_all_languages(): "french", "german", "greek", + "gurmukhi", "hausa", "hebrew", "hindi", - "urdu", "indonesian", "italian", "japanese", @@ -208,14 +157,12 @@ def test_list_all_languages(): "malay", "malayalam", "mandarin", - "nynorsk", - "bokmål", "nigerian", + "nynorsk", "polish", "portuguese", - "shahmukhi", - "gurmukhi", "russian", + "shahmukhi", "slovak", "spanish", "swahili", @@ -223,6 +170,7 @@ def test_list_all_languages(): "tajik", "tamil", "ukrainian", + "urdu", "yoruba", ]