diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index bab97a1a8..17c07e1c1 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -7,6 +7,7 @@ Thank you for your pull request! 🚀
- [] This pull request is on a [separate branch](https://docs.github.com/en/get-started/quickstart/github-flow) and not the main branch
+- [ ] I have tested my code with the `pytest` command as directed in the [testing section of the contributing guide](https://github.com/scribe-org/Scribe-Data/blob/main/CONTRIBUTING.md#testing)
---
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 376a954a7..2e44c618e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,6 +15,7 @@ If you have questions or would like to communicate with the team, please [join u
- [First steps as a contributor](#first-steps)
- [Learning the tech stack](#learning-the-tech)
- [Development environment](#dev-env)
+- [Testing](#testing)
- [Issues and projects](#issues-projects)
- [Bug reports](#bug-reports)
- [Feature requests](#feature-requests)
@@ -171,6 +172,16 @@ pip install -e .
> [!NOTE]
> Feel free to contact the team in the [Data room on Matrix](https://matrix.to/#/#ScribeData:matrix.org) if you're having problems getting your environment setup!
+
+
+## Testing [`⇧`](#contents)
+
+In addition to the [pre-commit](https://pre-commit.com/) hooks that are set up in the [development environment section](#dev-env), Scribe-Data also includes a testing suite that should be run before all pull requests and subsequent commits. Please run the following in the project root:
+
+```bash
+pytest
+```
+
## Issues and projects [`⇧`](#contents)
diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py
index be2fa0f79..e39e1621d 100644
--- a/src/scribe_data/cli/cli_utils.py
+++ b/src/scribe_data/cli/cli_utils.py
@@ -27,6 +27,8 @@
from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR
+# MARK: CLI Variables
+
LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction"
LANGUAGE_METADATA_FILE = (
@@ -56,20 +58,21 @@
language_map = {}
language_to_qid = {}
-# Process each language and its potential sub-languages in one pass
-for lang_key, lang_data in language_metadata.items():
- lang_key_lower = lang_key.lower()
+# Process each language and its potential sub-languages in one pass.
+for lang, lang_data in language_metadata.items():
+ lang_lower = lang.lower()
- # Handle sub-languages if they exist
+ # Handle sub-languages if they exist.
if "sub_languages" in lang_data:
- for sub_lang_key, sub_lang_data in lang_data["sub_languages"].items():
- sub_lang_key_lower = sub_lang_key.lower()
- language_map[sub_lang_key_lower] = sub_lang_data
- language_to_qid[sub_lang_key_lower] = sub_lang_data["qid"]
+ for sub_lang, sub_lang_data in lang_data["sub_languages"].items():
+ sub_lang_lower = sub_lang.lower()
+ language_map[sub_lang_lower] = sub_lang_data
+ language_to_qid[sub_lang_lower] = sub_lang_data["qid"]
+
else:
- # Handle the main language directly
- language_map[lang_key_lower] = lang_data
- language_to_qid[lang_key_lower] = lang_data["qid"]
+ # Handle the main language directly.
+ language_map[lang_lower] = lang_data
+ language_to_qid[lang_lower] = lang_data["qid"]
# MARK: Correct Inputs
@@ -112,41 +115,37 @@ def print_formatted_data(data: Union[dict, list], data_type: str) -> None:
if isinstance(data, dict):
max_key_length = max((len(key) for key in data.keys()), default=0)
- if data_type == "autosuggestions":
- for key, value in data.items():
+ for key, value in data.items():
+ if data_type == "autosuggestions":
print(f"{key:<{max_key_length}} : {', '.join(value)}")
- elif data_type == "emoji_keywords":
- for key, value in data.items():
+ elif data_type == "emoji_keywords":
emojis = [item["emoji"] for item in value]
print(f"{key:<{max_key_length}} : {' '.join(emojis)}")
- elif data_type in {"prepositions"}:
- for key, value in data.items():
+ elif data_type in {"prepositions"}:
print(f"{key:<{max_key_length}} : {value}")
- else:
- for key, value in data.items():
- if isinstance(value, dict):
- print(f"{key:<{max_key_length}} : ")
- max_sub_key_length = max(
- (len(sub_key) for sub_key in value.keys()), default=0
- )
- for sub_key, sub_value in value.items():
- print(f" {sub_key:<{max_sub_key_length}} : {sub_value}")
-
- elif isinstance(value, list):
- print(f"{key:<{max_key_length}} : ")
- for item in value:
- if isinstance(item, dict):
- for sub_key, sub_value in item.items():
- print(f" {sub_key:<{max_key_length}} : {sub_value}")
-
- else:
- print(f" {item}")
-
- else:
- print(f"{key:<{max_key_length}} : {value}")
+ elif isinstance(value, dict):
+ print(f"{key:<{max_key_length}} : ")
+ max_sub_key_length = max(
+ (len(sub_key) for sub_key in value.keys()), default=0
+ )
+ for sub_key, sub_value in value.items():
+ print(f" {sub_key:<{max_sub_key_length}} : {sub_value}")
+
+ elif isinstance(value, list):
+ print(f"{key:<{max_key_length}} : ")
+ for item in value:
+ if isinstance(item, dict):
+ for sub_key, sub_value in item.items():
+ print(f" {sub_key:<{max_key_length}} : {sub_value}")
+
+ else:
+ print(f" {item}")
+
+ else:
+ print(f"{key:<{max_key_length}} : {value}")
elif isinstance(data, list):
for item in data:
@@ -211,12 +210,12 @@ def validate_single_item(item, valid_options, item_type):
):
closest_match = difflib.get_close_matches(item, valid_options, n=1)
closest_match_str = (
- f" The closest matching {item_type} is {closest_match[0]}."
+ f" The closest matching {item_type} is '{closest_match[0]}'."
if closest_match
else ""
)
- return f"Invalid {item_type} {item}.{closest_match_str}"
+ return f"Invalid {item_type} '{item}'.{closest_match_str}"
return None
diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
index ee3311ede..762d3bfca 100644
--- a/src/scribe_data/cli/list.py
+++ b/src/scribe_data/cli/list.py
@@ -21,16 +21,16 @@
"""
from scribe_data.cli.cli_utils import (
+ LANGUAGE_DATA_EXTRACTION_DIR,
correct_data_type,
- language_metadata,
language_map,
- LANGUAGE_DATA_EXTRACTION_DIR,
+ language_metadata,
)
from scribe_data.utils import (
- list_all_languages,
+ format_sublanguage_name,
get_language_iso,
get_language_qid,
- format_sublanguage_name,
+ list_all_languages,
)
@@ -39,7 +39,6 @@ def list_languages() -> None:
Generates a table of languages, their ISO-2 codes and their Wikidata QIDs.
"""
languages = list_all_languages(language_metadata)
- languages.sort()
language_col_width = max(len(lang) for lang in languages) + 2
iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2
diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py
index 5530ef5db..885d9b3e9 100644
--- a/src/scribe_data/cli/total.py
+++ b/src/scribe_data/cli/total.py
@@ -29,8 +29,8 @@
language_metadata,
language_to_qid,
)
+from scribe_data.utils import format_sublanguage_name, list_all_languages
from scribe_data.wikidata.wikidata_utils import sparql
-from scribe_data.utils import list_all_languages, format_sublanguage_name
def get_qid_by_input(input_str):
@@ -73,9 +73,8 @@ def get_datatype_list(language):
A list of the corresponding data types.
"""
languages = list_all_languages(language_metadata)
- language_list = [lang for lang in languages]
- if language.lower() in language_list:
+ if language.lower() in languages:
language_data = language_map.get(language.lower())
language_capitalized = format_sublanguage_name(
language, language_metadata
@@ -134,13 +133,9 @@ def print_total_lexemes(language: str = None):
print("=" * 64)
if language is None: # all languages
- languages = list_all_languages(
- language_metadata
- ) # this returns a list of language names
- language_list = languages # sorts the list in place
- language_list.sort()
+ languages = list_all_languages(language_metadata)
- for lang in language_list:
+ for lang in languages:
data_types = get_datatype_list(lang)
first_row = True
diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json
index 00a8d405c..7ab2145bf 100755
--- a/src/scribe_data/resources/language_metadata.json
+++ b/src/scribe_data/resources/language_metadata.json
@@ -11,6 +11,14 @@
"iso": "bn",
"qid": "Q9610"
},
+ "chinese": {
+ "sub_languages": {
+ "mandarin": {
+ "iso": "zh",
+ "qid": "Q727694"
+ }
+ }
+ },
"czech": {
"iso": "cs",
"qid": "Q9056"
@@ -95,23 +103,15 @@
"iso": "ml",
"qid": "Q36236"
},
- "chinese": {
- "sub_languages": {
- "mandarin": {
- "iso": "zh",
- "qid": "Q727694"
- }
- }
- },
"norwegian": {
"sub_languages": {
- "nynorsk": {
- "iso": "nn",
- "qid": "Q25164"
- },
"bokmål": {
"iso": "nb",
"qid": "Q25167"
+ },
+ "nynorsk": {
+ "iso": "nn",
+ "qid": "Q25164"
}
}
},
@@ -133,13 +133,13 @@
},
"punjabi": {
"sub_languages": {
- "shahmukhi": {
- "iso": "pnb",
- "qid": "Q58635"
- },
"gurmukhi": {
"iso": "pa",
"qid": "Q58635"
+ },
+ "shahmukhi": {
+ "iso": "pnb",
+ "qid": "Q58635"
}
}
},
diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index df22a9a9a..3c2007640 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -26,7 +26,6 @@
from pathlib import Path
from typing import Any, Optional
-
PROJECT_ROOT = "Scribe-Data"
DEFAULT_JSON_EXPORT_DIR = "scribe_data_json_export"
DEFAULT_CSV_EXPORT_DIR = "scribe_data_csv_export"
@@ -53,8 +52,7 @@ def _load_json(package_path: str, file_name: str) -> Any:
with resources.files(package_path).joinpath(file_name).open(
encoding="utf-8"
) as in_stream:
- contents = json.load(in_stream)
- return contents # No need for 'root'
+ return json.load(in_stream)
_languages = _load_json(
@@ -90,13 +88,13 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) -
------
ValueError : when a source_value is not supported or the language only has sub-languages.
"""
- norm_source_value = source_value.lower()
-
- # Check if we're searching by language name
+ # Check if we're searching by language name.
if source_key == "language":
- # First, check the main language entries (e.g., mandarin, french, etc.)
+ norm_source_value = source_value.lower()
+
+ # First, check the main language entries (e.g., mandarin, french, etc.).
for language, entry in _languages.items():
- # If the language name matches the top-level key, return the target value
+ # If the language name matches the top-level key, return the target value.
if language.lower() == norm_source_value:
if "sub_languages" in entry:
sub_languages = ", ".join(entry["sub_languages"].keys())
@@ -105,37 +103,16 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) -
)
return entry.get(target_key)
- # If there are sub-languages, check them too
+ # If there are sub-languages, check them too.
if "sub_languages" in entry:
for sub_language, sub_entry in entry["sub_languages"].items():
if sub_language.lower() == norm_source_value:
return sub_entry.get(target_key)
- # If no match was found, raise an error
+ # If no match was found, raise an error.
raise ValueError(error_msg)
-def get_scribe_languages() -> list[str]:
- """
- Returns the list of currently implemented Scribe languages.
- This version handles both regular languages and those with sub-languages (e.g., Norwegian).
- """
- languages = []
-
- for language, entry in _languages.items():
- # Add the main language (if it's directly queryable)
- if "sub_languages" not in entry:
- languages.append(language.capitalize())
-
- # If there are sub-languages, add them instead
- if "sub_languages" in entry:
- languages.extend(
- sub_language.capitalize() for sub_language in entry["sub_languages"]
- )
-
- return sorted(languages)
-
-
def get_language_qid(language: str) -> str:
"""
Returns the QID of the given language.
@@ -173,13 +150,12 @@ def get_language_iso(language: str) -> str:
The ISO code for the language.
"""
- iso_code = _find(
+ return _find(
"language",
language,
"iso",
f"{language.upper()} is currently not a supported language for ISO conversion.",
)
- return iso_code
def get_language_from_iso(iso: str) -> str:
@@ -433,20 +409,25 @@ def map_genders(wikidata_gender: str) -> str:
----------
wikidata_gender : str
The gender of the noun that was queried from WikiData.
+
+ Returns
+ -------
+ The gender value, with Wikidata QIDs mapped to their corresponding gender names.
"""
gender_map = {
- "masculine": "M",
- "Q499327": "M",
- "feminine": "F",
- "Q1775415": "F",
- "common gender": "C",
- "Q1305037": "C",
- "neuter": "N",
- "Q1775461": "N",
+ "masculine": "masculine",
+ "Q499327": "masculine",
+ "feminine": "feminine",
+ "Q1775415": "feminine",
+ "common": "common",
+ "common gender": "common",
+ "Q1305037": "common",
+ "neuter": "neuter",
+ "Q1775461": "neuter",
}
return gender_map.get(
- wikidata_gender, ""
+ wikidata_gender.lower(), ""
) # nouns could have a gender that is not a valid attribute
@@ -458,20 +439,24 @@ def map_cases(wikidata_case: str) -> str:
----------
wikidata_case : str
The case of the noun that was queried from WikiData.
+
+ Returns
+ -------
+ The case value, with Wikidata QIDs mapped to their corresponding case names.
"""
case_map = {
- "accusative": "Acc",
- "Q146078": "Acc",
- "dative": "Dat",
- "Q145599": "Dat",
- "genitive": "Gen",
- "Q146233": "Gen",
- "instrumental": "Ins",
- "Q192997": "Ins",
- "prepositional": "Pre",
- "Q2114906": "Pre",
- "locative": "Loc",
- "Q202142": "Loc",
+ "accusative": "accusative",
+ "Q146078": "accusative",
+ "dative": "dative",
+ "Q145599": "dative",
+ "genitive": "genitive",
+ "Q146233": "genitive",
+ "instrumental": "instrumental",
+ "Q192997": "instrumental",
+ "prepositional": "prepositional",
+ "Q2114906": "prepositional",
+ "locative": "locative",
+ "Q202142": "locative",
}
case = wikidata_case.split(" case")[0]
return case_map.get(case, "")
@@ -498,57 +483,66 @@ def order_annotations(annotation: str) -> str:
def format_sublanguage_name(lang, language_metadata=_languages):
"""
Formats the name of a sub-language by appending its main language
- in the format 'Mainlang/Sublang'. If the language is not a sub-language,
+ in the format 'MAIN_LANG/SUB_LANG'. If the language is not a sub-language,
the original language name is returned as-is.
- Args:
- lang (str): The name of the language or sub-language to format.
- language_metadata (dict): The metadata containing information about
- main languages and their sub-languages.
+ Parameters
+ ----------
+ lang : str
+ The name of the language or sub-language to format.
- Returns:
- str: The formatted language name if it's a sub-language
- (e.g., 'Norwegian/Nynorsk'), otherwise the original name.
+ language_metadata : dict
+ The metadata containing information about main languages and their sub-languages.
- Raises:
+ Returns
+ -------
+ str
+ The formatted language name if it's a sub-language (e.g., 'Norwegian/Nynorsk').
+ Otherwise the original name.
+
+ Raises
+ ------
ValueError: If the provided language or sub-language is not found.
- Example:
- format_sublanguage_name("nynorsk", language_metadata)
+ Example
+ -------
+ > format_sublanguage_name("nynorsk", language_metadata)
'Norwegian/Nynorsk'
- format_sublanguage_name("english", language_metadata)
+ > format_sublanguage_name("english", language_metadata)
'English'
"""
- # Iterate through the main languages in the metadata
for main_lang, lang_data in language_metadata.items():
- # If it's not a sub-language, return the original name
+ # If it's not a sub-language, return the original name.
if main_lang == lang.lower():
return lang.capitalize()
- # Check if the main language has sub-languages
+
+ # Check if the main language has sub-languages.
if "sub_languages" in lang_data:
- # Check if the provided language is a sub-language
+ # Check if the provided language is a sub-language.
for sub_lang in lang_data["sub_languages"]:
if lang.lower() == sub_lang.lower():
- # Return the formatted name Mainlang/Sublang
+ # Return the formatted name MAIN_LANG/SUB_LANG.
return f"{main_lang.capitalize()}/{sub_lang.capitalize()}"
- # Raise ValueError if no match is found
+ # Raise ValueError if no match is found.
raise ValueError(f"{lang.upper()} is not a valid language or sub-language.")
def list_all_languages(language_metadata=_languages):
- """List all languages from the provided metadata dictionary, including sub-languages."""
+ """
+ Returns a sorted list of all languages from the provided metadata dictionary, including sub-languages.
+ """
current_languages = []
- # Iterate through the language metadata
+ # Iterate through the language metadata.
for lang_key, lang_data in language_metadata.items():
- # Check if there are sub-languages
+ # Check if there are sub-languages.
if "sub_languages" in lang_data:
- # Add the sub-languages to current_languages
+ # Add the sub-languages to current_languages.
current_languages.extend(lang_data["sub_languages"].keys())
else:
- # If no sub-languages, add the main language
+ # If no sub-languages, add the main language.
current_languages.append(lang_key)
- return current_languages
+ return sorted(current_languages)
diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py
index a827666a2..333c3b7d7 100644
--- a/tests/cli/test_utils.py
+++ b/tests/cli/test_utils.py
@@ -187,7 +187,7 @@ def test_validate_language_and_data_type_invalid_language(self, mock_get_qid):
language=language_qid, data_type=data_type_qid
)
- self.assertEqual(str(context.exception), "Invalid language InvalidLanguage.")
+ self.assertEqual(str(context.exception), "Invalid language 'InvalidLanguage'.")
@patch("scribe_data.cli.total.get_qid_by_input")
def test_validate_language_and_data_type_invalid_data_type(self, mock_get_qid):
@@ -201,7 +201,7 @@ def test_validate_language_and_data_type_invalid_data_type(self, mock_get_qid):
language=language_qid, data_type=data_type_qid
)
- self.assertEqual(str(context.exception), "Invalid data-type InvalidDataType.")
+ self.assertEqual(str(context.exception), "Invalid data-type 'InvalidDataType'.")
@patch("scribe_data.cli.total.get_qid_by_input")
def test_validate_language_and_data_type_both_invalid(self, mock_get_qid):
@@ -217,7 +217,7 @@ def test_validate_language_and_data_type_both_invalid(self, mock_get_qid):
self.assertEqual(
str(context.exception),
- "Invalid language InvalidLanguage.\nInvalid data-type InvalidDataType.",
+ "Invalid language 'InvalidLanguage'.\nInvalid data-type 'InvalidDataType'.",
)
def test_validate_language_and_data_type_with_list(self):
@@ -248,5 +248,5 @@ def test_validate_language_and_data_type_mixed_validity_in_lists(self):
data_types = ["nouns", "InvalidDataType"]
with self.assertRaises(ValueError) as context:
validate_language_and_data_type(languages, data_types)
- self.assertIn("Invalid language InvalidLanguage", str(context.exception))
- self.assertIn("Invalid data-type InvalidDataType", str(context.exception))
+ self.assertIn("Invalid language 'InvalidLanguage'", str(context.exception))
+ self.assertIn("Invalid data-type 'InvalidDataType'", str(context.exception))
diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py
index df37317a3..43eaa2038 100644
--- a/tests/load/test_update_utils.py
+++ b/tests/load/test_update_utils.py
@@ -21,7 +21,6 @@
"""
import sys
-import unittest
from pathlib import Path
import pytest
@@ -31,57 +30,6 @@
from scribe_data import utils
-def test_get_scribe_languages():
- test_case = unittest.TestCase()
-
- # test for content, not order
- test_case.assertCountEqual(
- utils.get_scribe_languages(),
- [
- "Arabic",
- "Basque",
- "Bengali",
- "Bokmål",
- "Czech",
- "Danish",
- "English",
- "Esperanto",
- "Estonian",
- "Finnish",
- "French",
- "German",
- "Greek",
- "Gurmukhi",
- "Hausa",
- "Hebrew",
- "Hindi",
- "Indonesian",
- "Italian",
- "Japanese",
- "Kurmanji",
- "Latin",
- "Malay",
- "Malayalam",
- "Mandarin",
- "Nigerian",
- "Nynorsk",
- "Polish",
- "Portuguese",
- "Russian",
- "Shahmukhi",
- "Slovak",
- "Spanish",
- "Swahili",
- "Swedish",
- "Tajik",
- "Tamil",
- "Ukrainian",
- "Urdu",
- "Yoruba",
- ],
- )
-
-
@pytest.mark.parametrize(
"language, qid_code",
[
@@ -187,6 +135,7 @@ def test_list_all_languages():
"arabic",
"basque",
"bengali",
+ "bokmål",
"czech",
"danish",
"english",
@@ -196,10 +145,10 @@ def test_list_all_languages():
"french",
"german",
"greek",
+ "gurmukhi",
"hausa",
"hebrew",
"hindi",
- "urdu",
"indonesian",
"italian",
"japanese",
@@ -208,14 +157,12 @@ def test_list_all_languages():
"malay",
"malayalam",
"mandarin",
- "nynorsk",
- "bokmål",
"nigerian",
+ "nynorsk",
"polish",
"portuguese",
- "shahmukhi",
- "gurmukhi",
"russian",
+ "shahmukhi",
"slovak",
"spanish",
"swahili",
@@ -223,6 +170,7 @@ def test_list_all_languages():
"tajik",
"tamil",
"ukrainian",
+ "urdu",
"yoruba",
]