diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index bab97a1a8..17c07e1c1 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -7,6 +7,7 @@ Thank you for your pull request! 🚀
- [] This pull request is on a [separate branch](https://docs.github.com/en/get-started/quickstart/github-flow) and not the main branch
+- [ ] I have tested my code with the `pytest` command as described in the [testing section of the contributing guide](https://github.com/scribe-org/Scribe-Data/blob/main/CONTRIBUTING.md#testing)
---
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 376a954a7..2e44c618e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,6 +15,7 @@ If you have questions or would like to communicate with the team, please [join u
- [First steps as a contributor](#first-steps)
- [Learning the tech stack](#learning-the-tech)
- [Development environment](#dev-env)
+- [Testing](#testing)
- [Issues and projects](#issues-projects)
- [Bug reports](#bug-reports)
- [Feature requests](#feature-requests)
@@ -171,6 +172,16 @@ pip install -e .
> [!NOTE]
> Feel free to contact the team in the [Data room on Matrix](https://matrix.to/#/#ScribeData:matrix.org) if you're having problems getting your environment setup!
+
+
+## Testing [`⇧`](#contents)
+
+In addition to the [pre-commit](https://pre-commit.com/) hooks that are set up in the [development environment section](#dev-env), Scribe-Data also includes a testing suite that should be run before all pull requests and subsequent commits. Please run the following in the project root:
+
+```bash
+pytest
+```
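+
+To run only part of the suite while you iterate, `pytest` also accepts a file or directory path (standard `pytest` behavior; `tests/cli` below is one of the project's test directories):
+
+```bash
+pytest tests/cli
+```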
+
## Issues and projects [`⇧`](#contents)
diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py
index 4f59a65ef..e39e1621d 100644
--- a/src/scribe_data/cli/cli_utils.py
+++ b/src/scribe_data/cli/cli_utils.py
@@ -27,6 +27,8 @@
from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR
+# MARK: CLI Variables
+
LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction"
LANGUAGE_METADATA_FILE = (
@@ -53,14 +55,24 @@
print(f"Error reading data type metadata: {e}")
-language_map = {
- lang["language"].lower(): lang for lang in language_metadata["languages"]
-}
+language_map = {}
+language_to_qid = {}
+
+# Process each language and its potential sub-languages in one pass.
+for lang, lang_data in language_metadata.items():
+ lang_lower = lang.lower()
-# Create language_to_qid dictionary.
-language_to_qid = {
- lang["language"].lower(): lang["qid"] for lang in language_metadata["languages"]
-}
+ # Handle sub-languages if they exist.
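+    # Note: the parent language itself is not added to the maps; only its sub-languages are queryable.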
+ if "sub_languages" in lang_data:
+ for sub_lang, sub_lang_data in lang_data["sub_languages"].items():
+ sub_lang_lower = sub_lang.lower()
+ language_map[sub_lang_lower] = sub_lang_data
+ language_to_qid[sub_lang_lower] = sub_lang_data["qid"]
+
+ else:
+ # Handle the main language directly.
+ language_map[lang_lower] = lang_data
+ language_to_qid[lang_lower] = lang_data["qid"]
# MARK: Correct Inputs
@@ -103,41 +115,37 @@ def print_formatted_data(data: Union[dict, list], data_type: str) -> None:
if isinstance(data, dict):
max_key_length = max((len(key) for key in data.keys()), default=0)
- if data_type == "autosuggestions":
- for key, value in data.items():
+ for key, value in data.items():
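+        # Known data types get custom formatting; the later branches handle generic dicts, lists and scalars.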
+ if data_type == "autosuggestions":
print(f"{key:<{max_key_length}} : {', '.join(value)}")
- elif data_type == "emoji_keywords":
- for key, value in data.items():
+ elif data_type == "emoji_keywords":
emojis = [item["emoji"] for item in value]
print(f"{key:<{max_key_length}} : {' '.join(emojis)}")
- elif data_type in {"prepositions"}:
- for key, value in data.items():
+ elif data_type in {"prepositions"}:
print(f"{key:<{max_key_length}} : {value}")
- else:
- for key, value in data.items():
- if isinstance(value, dict):
- print(f"{key:<{max_key_length}} : ")
- max_sub_key_length = max(
- (len(sub_key) for sub_key in value.keys()), default=0
- )
- for sub_key, sub_value in value.items():
- print(f" {sub_key:<{max_sub_key_length}} : {sub_value}")
-
- elif isinstance(value, list):
- print(f"{key:<{max_key_length}} : ")
- for item in value:
- if isinstance(item, dict):
- for sub_key, sub_value in item.items():
- print(f" {sub_key:<{max_key_length}} : {sub_value}")
-
- else:
- print(f" {item}")
-
- else:
- print(f"{key:<{max_key_length}} : {value}")
+ elif isinstance(value, dict):
+ print(f"{key:<{max_key_length}} : ")
+ max_sub_key_length = max(
+ (len(sub_key) for sub_key in value.keys()), default=0
+ )
+ for sub_key, sub_value in value.items():
+ print(f" {sub_key:<{max_sub_key_length}} : {sub_value}")
+
+ elif isinstance(value, list):
+ print(f"{key:<{max_key_length}} : ")
+ for item in value:
+ if isinstance(item, dict):
+ for sub_key, sub_value in item.items():
+ print(f" {sub_key:<{max_key_length}} : {sub_value}")
+
+ else:
+ print(f" {item}")
+
+ else:
+ print(f"{key:<{max_key_length}} : {value}")
elif isinstance(data, list):
for item in data:
@@ -202,12 +210,12 @@ def validate_single_item(item, valid_options, item_type):
):
closest_match = difflib.get_close_matches(item, valid_options, n=1)
closest_match_str = (
- f" The closest matching {item_type} is {closest_match[0]}."
+ f" The closest matching {item_type} is '{closest_match[0]}'."
if closest_match
else ""
)
- return f"Invalid {item_type} {item}.{closest_match_str}"
+ return f"Invalid {item_type} '{item}'.{closest_match_str}"
return None
diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py
index 4e95f34b0..6ba7a1f55 100644
--- a/src/scribe_data/cli/interactive.py
+++ b/src/scribe_data/cli/interactive.py
@@ -35,7 +35,7 @@
from scribe_data.cli.cli_utils import data_type_metadata, language_metadata
from scribe_data.cli.get import get_data
from scribe_data.cli.version import get_version_message
-from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR
+from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR, list_all_languages
# MARK: Config Setup
@@ -51,9 +51,7 @@
class ScribeDataConfig:
def __init__(self):
- self.languages = [
- lang["language"].capitalize() for lang in language_metadata["languages"]
- ]
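+        # All supported languages, including sub-languages, sorted alphabetically.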
+ self.languages = list_all_languages(language_metadata)
self.data_types = list(data_type_metadata.keys())
self.selected_languages: List[str] = []
self.selected_data_types: List[str] = []
diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
index 5d16b4413..762d3bfca 100644
--- a/src/scribe_data/cli/list.py
+++ b/src/scribe_data/cli/list.py
@@ -21,10 +21,16 @@
"""
from scribe_data.cli.cli_utils import (
+ LANGUAGE_DATA_EXTRACTION_DIR,
correct_data_type,
- language_metadata,
language_map,
- LANGUAGE_DATA_EXTRACTION_DIR,
+ language_metadata,
+)
+from scribe_data.utils import (
+ format_sublanguage_name,
+ get_language_iso,
+ get_language_qid,
+ list_all_languages,
)
@@ -32,12 +38,11 @@ def list_languages() -> None:
"""
Generates a table of languages, their ISO-2 codes and their Wikidata QIDs.
"""
- languages = list(language_metadata["languages"])
- languages.sort(key=lambda x: x["language"])
+ languages = list_all_languages(language_metadata)
- language_col_width = max(len(lang["language"]) for lang in languages) + 2
- iso_col_width = max(len(lang["iso"]) for lang in languages) + 2
- qid_col_width = max(len(lang["qid"]) for lang in languages) + 2
+ language_col_width = max(len(lang) for lang in languages) + 2
+ iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2
+ qid_col_width = max(len(get_language_qid(lang)) for lang in languages) + 2
table_line_length = language_col_width + iso_col_width + qid_col_width
@@ -49,7 +54,7 @@ def list_languages() -> None:
for lang in languages:
print(
- f"{lang['language'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}"
+ f"{lang.capitalize():<{language_col_width}} {get_language_iso(lang):<{iso_col_width}} {get_language_qid(lang):<{qid_col_width}}"
)
print("-" * table_line_length)
@@ -65,7 +70,9 @@ def list_data_types(language: str = None) -> None:
language : str
The language to potentially list data types for.
"""
+ languages = list_all_languages(language_metadata)
if language:
+ language = format_sublanguage_name(language, language_metadata)
language_data = language_map.get(language.lower())
language_capitalized = language.capitalize()
language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_capitalized
@@ -83,8 +90,11 @@ def list_data_types(language: str = None) -> None:
else:
data_types = set()
- for lang in language_metadata["languages"]:
- language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize()
+ for lang in languages:
+        # Note: format_sublanguage_name already returns a capitalized name (e.g. Norwegian/Nynorsk).
+        language_dir = LANGUAGE_DATA_EXTRACTION_DIR / format_sublanguage_name(
+            lang, language_metadata
+        )
if language_dir.is_dir():
data_types.update(f.name for f in language_dir.iterdir() if f.is_dir())
@@ -122,13 +132,15 @@ def list_languages_for_data_type(data_type: str) -> None:
The data type to check for.
"""
data_type = correct_data_type(data_type=data_type)
+ all_languages = list_all_languages(language_metadata)
available_languages = []
- for lang in language_metadata["languages"]:
- language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize()
+ for lang in all_languages:
+ lang = format_sublanguage_name(lang, language_metadata)
+ language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang
if language_dir.is_dir():
dt_path = language_dir / data_type
if dt_path.exists():
- available_languages.append(lang["language"])
+ available_languages.append(lang)
available_languages.sort()
table_header = f"Available languages: {data_type}"
@@ -141,7 +153,7 @@ def list_languages_for_data_type(data_type: str) -> None:
print("-" * table_line_length)
for lang in available_languages:
- print(f"{lang.capitalize()}")
+        print(lang)
print("-" * table_line_length)
print()
diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py
index fe1382707..885d9b3e9 100644
--- a/src/scribe_data/cli/total.py
+++ b/src/scribe_data/cli/total.py
@@ -29,6 +29,7 @@
language_metadata,
language_to_qid,
)
+from scribe_data.utils import format_sublanguage_name, list_all_languages
from scribe_data.wikidata.wikidata_utils import sparql
@@ -71,12 +72,13 @@ def get_datatype_list(language):
data_types : list[str] or None
A list of the corresponding data types.
"""
- languages = list(language_metadata["languages"])
- language_list = [lang["language"] for lang in languages]
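+    # Lowercase names for all supported languages, including sub-languages.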
+ languages = list_all_languages(language_metadata)
- if language.lower() in language_list:
+ if language.lower() in languages:
language_data = language_map.get(language.lower())
- language_capitalized = language.capitalize()
+        # format_sublanguage_name already returns a capitalized name.
+        language_capitalized = format_sublanguage_name(language, language_metadata)
language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_capitalized
if not language_data:
@@ -131,11 +133,9 @@ def print_total_lexemes(language: str = None):
print("=" * 64)
if language is None: # all languages
- languages = list(language_metadata["languages"])
- languages.sort(key=lambda x: x["language"])
- language_list = [lang["language"] for lang in languages]
+ languages = list_all_languages(language_metadata)
- for lang in language_list:
+ for lang in languages:
data_types = get_datatype_list(lang)
first_row = True
diff --git a/src/scribe_data/load/data_to_sqlite.py b/src/scribe_data/load/data_to_sqlite.py
index 79d19e39b..aec1f9560 100644
--- a/src/scribe_data/load/data_to_sqlite.py
+++ b/src/scribe_data/load/data_to_sqlite.py
@@ -35,6 +35,7 @@
DEFAULT_SQLITE_EXPORT_DIR,
get_language_iso,
)
+from scribe_data.utils import list_all_languages
def data_to_sqlite(
@@ -52,8 +53,7 @@ def data_to_sqlite(
current_language_data = json.load(f_languages)
data_types = json.load(f_data_types).keys()
- current_languages = [d["language"] for d in current_language_data["languages"]]
-
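+    # All supported languages, including sub-languages.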
+ current_languages = list_all_languages(current_language_data)
if not languages:
languages = current_languages
diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json
index e6d7de8a6..7ab2145bf 100755
--- a/src/scribe_data/resources/language_metadata.json
+++ b/src/scribe_data/resources/language_metadata.json
@@ -1,70 +1,182 @@
{
- "used by": "Scribe-Data/src/scribe_data/utils.py",
- "description": {
- "entry": {
- "language": "the supported language. All lowercase",
- "iso": "the ISO 639 code for 'language'. See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes. All lowercase",
- "qid": "the unique identifier of 'language' on Wikidata. 'Q' followed by one or more digits. See https://www.wikidata.org/wiki/Q43649390",
- "remove-words": "words that should not be included as autosuggestions for the given language.",
- "ignore-words": "words that should be removed from the autosuggestion generation process."
+ "arabic": {
+ "iso": "ar",
+ "qid": "Q13955"
+ },
+ "basque": {
+ "iso": "eu",
+ "qid": "Q8752"
+ },
+ "bengali": {
+ "iso": "bn",
+ "qid": "Q9610"
+ },
+ "chinese": {
+ "sub_languages": {
+ "mandarin": {
+ "iso": "zh",
+ "qid": "Q727694"
+ }
+ }
+ },
+ "czech": {
+ "iso": "cs",
+ "qid": "Q9056"
+ },
+ "danish": {
+ "iso": "da",
+ "qid": "Q9035"
+ },
+ "english": {
+ "iso": "en",
+ "qid": "Q1860"
+ },
+ "esperanto": {
+ "iso": "eo",
+ "qid": "Q143"
+ },
+ "estonian": {
+ "iso": "et",
+ "qid": "Q9072"
+ },
+ "finnish": {
+ "iso": "fi",
+ "qid": "Q1412"
+ },
+ "french": {
+ "iso": "fr",
+ "qid": "Q150"
+ },
+ "german": {
+ "iso": "de",
+ "qid": "Q188"
+ },
+ "greek": {
+ "iso": "el",
+ "qid": "Q36510"
+ },
+ "hausa": {
+ "iso": "ha",
+ "qid": "Q56475"
+ },
+ "hebrew": {
+ "iso": "he",
+ "qid": "Q9288"
+ },
+ "hindustani": {
+ "sub_languages": {
+ "hindi": {
+ "iso": "hi",
+ "qid": "Q11051"
+ },
+ "urdu": {
+ "iso": "ur",
+ "qid": "Q11051"
+ }
+ }
+ },
+ "indonesian": {
+ "iso": "id",
+ "qid": "Q9240"
+ },
+ "italian": {
+ "iso": "it",
+ "qid": "Q652"
+ },
+ "japanese": {
+ "iso": "ja",
+ "qid": "Q5287"
+ },
+ "kurmanji": {
+ "iso": "kmr",
+ "qid": "Q36163"
+ },
+ "latin": {
+ "iso": "la",
+ "qid": "Q397"
+ },
+ "malay": {
+ "iso": "ms",
+ "qid": "Q9237"
+ },
+ "malayalam": {
+ "iso": "ml",
+ "qid": "Q36236"
+ },
+ "norwegian": {
+ "sub_languages": {
+ "bokmål": {
+ "iso": "nb",
+ "qid": "Q25167"
+ },
+ "nynorsk": {
+ "iso": "nn",
+ "qid": "Q25164"
+ }
+ }
+ },
+ "pidgin": {
+ "sub_languages": {
+ "nigerian": {
+ "iso": "pi",
+ "qid": "Q33655"
+ }
}
},
- "languages": [
- {
- "language": "english",
- "iso": "en",
- "qid": "Q1860",
- "remove-words": ["of", "the", "The", "and"],
- "ignore-words": []
- },
- {
- "language": "french",
- "iso": "fr",
- "qid": "Q150",
- "remove-words": ["of", "the", "The", "and"],
- "ignore-words": ["XXe"]
- },
- {
- "language": "german",
- "iso": "de",
- "qid": "Q188",
- "remove-words": ["of", "the", "The", "and", "NeinJa", "et", "redirect"],
- "ignore-words": ["Gemeinde", "Familienname"]
- },
- {
- "language": "italian",
- "iso": "it",
- "qid": "Q652",
- "remove-words": ["of", "the", "The", "and", "text", "from"],
- "ignore-words": ["The", "ATP"]
- },
- {
- "language": "portuguese",
- "iso": "pt",
- "qid": "Q5146",
- "remove-words": ["of", "the", "The", "and", "jbutadptflora"],
- "ignore-words": []
- },
- {
- "language": "russian",
- "iso": "ru",
- "qid": "Q7737",
- "remove-words": ["of", "the", "The", "and"],
- "ignore-words": []
- },
- {
- "language": "spanish",
- "iso": "es",
- "qid": "Q1321",
- "remove-words": ["of", "the", "The", "and"],
- "ignore-words": []
- },
- {
- "language": "swedish",
- "iso": "sv",
- "qid": "Q9027",
- "remove-words": ["of", "the", "The", "and", "Checklist", "Catalogue"],
- "ignore-words": ["databasdump"]
+ "polish": {
+ "iso": "pl",
+ "qid": "Q809"
+ },
+ "portuguese": {
+ "iso": "pt",
+ "qid": "Q5146"
+ },
+ "punjabi": {
+ "sub_languages": {
+ "gurmukhi": {
+ "iso": "pa",
+ "qid": "Q58635"
+ },
+ "shahmukhi": {
+ "iso": "pnb",
+ "qid": "Q58635"
+ }
}
- ]
+ },
+ "russian": {
+ "iso": "ru",
+ "qid": "Q7737"
+ },
+ "slovak": {
+ "iso": "sk",
+ "qid": "Q9058"
+ },
+ "spanish": {
+ "iso": "es",
+ "qid": "Q1321"
+ },
+ "swahili": {
+ "iso": "sw",
+ "qid": "Q7838"
+ },
+ "swedish": {
+ "iso": "sv",
+ "qid": "Q9027"
+ },
+ "tajik": {
+ "iso": "tg",
+ "qid": "Q9260"
+ },
+ "tamil": {
+ "iso": "ta",
+ "qid": "Q5885"
+ },
+ "ukrainian": {
+ "iso": "ua",
+ "qid": "Q8798"
+ },
+ "yoruba": {
+ "iso": "yo",
+ "qid": "Q34311"
+ }
}
diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index 9d94485ab..3c2007640 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -26,9 +26,6 @@
from pathlib import Path
from typing import Any, Optional
-from iso639 import Lang
-from iso639.exceptions import DeprecatedLanguageValue, InvalidLanguageValue
-
PROJECT_ROOT = "Scribe-Data"
DEFAULT_JSON_EXPORT_DIR = "scribe_data_json_export"
DEFAULT_CSV_EXPORT_DIR = "scribe_data_csv_export"
@@ -36,7 +33,7 @@
DEFAULT_SQLITE_EXPORT_DIR = "scribe_data_sqlite_export"
-def _load_json(package_path: str, file_name: str, root: str) -> Any:
+def _load_json(package_path: str, file_name: str) -> Any:
"""
Loads a JSON resource from a package into a python entity.
@@ -48,52 +45,37 @@ def _load_json(package_path: str, file_name: str, root: str) -> Any:
file_name : str
The name of the file (resource) that contains the JSON data.
- root : str
- The root node of the JSON document.
-
Returns
-------
- A python entity starting at 'root'.
+ A python entity representing the JSON content.
"""
-
with resources.files(package_path).joinpath(file_name).open(
encoding="utf-8"
) as in_stream:
- contents = json.load(in_stream)
- return contents[root]
+ return json.load(in_stream)
_languages = _load_json(
- package_path="scribe_data.resources",
- file_name="language_metadata.json",
- root="languages",
+ package_path="scribe_data.resources", file_name="language_metadata.json"
)
def _find(source_key: str, source_value: str, target_key: str, error_msg: str) -> Any:
"""
- Each 'language', (english, german,..., etc) is a dictionary of key/value pairs:
+ Finds a target value based on a source key/value pair from the language metadata.
- entry = {
- "language": "english",
- "iso": "en",
- "qid": "Q1860",
- "remove-words": [...],
- "ignore-words": [...]
- }
-
- Given a key/value pair, the 'source' and the 'target' key get the 'target' value.
+ This version handles both regular languages and those with sub-languages (e.g., Norwegian).
Parameters
----------
source_value : str
- The source value to find equivalents for (e.g. 'english').
+ The source value to find equivalents for (e.g., 'english', 'nynorsk').
source_key : str
- The source key to reference (e.g. 'language').
+ The source key to reference (e.g., 'language').
target_key : str
- The key to target (e.g. 'iso').
+ The key to target (e.g., 'qid').
error_msg : str
The message displayed when a value cannot be found.
@@ -104,28 +86,33 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) -
Raises
------
- ValueError : when a source_value is not supported.
- """
- norm_source_value = source_value.lower()
-
- if target_value := [
- entry[target_key]
- for entry in _languages
- if entry[source_key] == norm_source_value
- ]:
- assert len(target_value) == 1, f"More than one entry for '{norm_source_value}'"
- return target_value[0]
-
+ ValueError : when a source_value is not supported or the language only has sub-languages.
+ """
+ # Check if we're searching by language name.
+ if source_key == "language":
+ norm_source_value = source_value.lower()
+
+    # First, check the main language entries (e.g., french, german).
+ for language, entry in _languages.items():
+ # If the language name matches the top-level key, return the target value.
+ if language.lower() == norm_source_value:
+ if "sub_languages" in entry:
+ sub_languages = ", ".join(entry["sub_languages"].keys())
+ raise ValueError(
+ f"'{language}' has sub-languages, but is not queryable directly. Available sub-languages: {sub_languages}"
+ )
+ return entry.get(target_key)
+
+ # If there are sub-languages, check them too.
+ if "sub_languages" in entry:
+ for sub_language, sub_entry in entry["sub_languages"].items():
+ if sub_language.lower() == norm_source_value:
+ return sub_entry.get(target_key)
+
+ # If no match was found, raise an error.
raise ValueError(error_msg)
-def get_scribe_languages() -> list[str]:
- """
- Returns the list of currently implemented Scribe languages.
- """
- return sorted(entry["language"].capitalize() for entry in _languages)
-
-
def get_language_qid(language: str) -> str:
"""
Returns the QID of the given language.
@@ -162,13 +149,13 @@ def get_language_iso(language: str) -> str:
str
The ISO code for the language.
"""
- try:
- iso_code = str(Lang(language.capitalize()).pt1)
- except InvalidLanguageValue:
- raise ValueError(
- f"{language.capitalize()} is currently not a supported language for ISO conversion."
- ) from None
- return iso_code
+
+ return _find(
+ "language",
+ language,
+ "iso",
+ f"{language.upper()} is currently not a supported language for ISO conversion.",
+ )
def get_language_from_iso(iso: str) -> str:
@@ -185,57 +172,20 @@ def get_language_from_iso(iso: str) -> str:
str
The name for the language which has an ISO value of iso.
"""
- try:
- language_name = str(Lang(iso.lower()).name)
- except DeprecatedLanguageValue as e:
- raise ValueError(
- f"{iso.upper()} is currently not a supported ISO language."
- ) from e
- return language_name
-
-
-def get_language_words_to_remove(language: str) -> list[str]:
- """
- Returns the words that should be removed during the data cleaning process for the given language.
-
- Parameters
- ----------
- language : str
- The language the words should be returned for.
-
- Returns
- -------
- list[str]
- The words that that be removed during the data cleaning process for the given language.
- """
- return _find(
- "language",
- language,
- "remove-words",
- f"{language.capitalize()} is currently not a supported language.",
- )
-
-
-def get_language_words_to_ignore(language: str) -> list[str]:
- """
- Returns the words that should not be included as autosuggestions for the given language.
+    # Iterate over the languages and their properties.
+ for language, properties in _languages.items():
+        # Check if the current language's ISO matches the provided ISO.
+ if properties.get("iso") == iso:
+ return language.capitalize()
- Parameters
- ----------
- language : str
- The language the words should be returned for.
+    # If there are sub-languages, check those as well.
+ if "sub_languages" in properties:
+ for sub_lang, sub_properties in properties["sub_languages"].items():
+ if sub_properties.get("iso") == iso:
+ return sub_lang.capitalize()
- Returns
- -------
- list[str]
- The words that should not be included as autosuggestions for the given language.
- """
- return _find(
- "language",
- language,
- "ignore-words",
- f"{language.capitalize()} is currently not a supported language.",
- )
+    # If no match is found, raise a ValueError.
+ raise ValueError(f"{iso.upper()} is currently not a supported ISO language.")
def load_queried_data(
@@ -459,20 +409,25 @@ def map_genders(wikidata_gender: str) -> str:
----------
wikidata_gender : str
The gender of the noun that was queried from WikiData.
+
+ Returns
+ -------
+        The standardized gender value, whether the gender label or its Wikidata QID was queried.
"""
gender_map = {
- "masculine": "M",
- "Q499327": "M",
- "feminine": "F",
- "Q1775415": "F",
- "common gender": "C",
- "Q1305037": "C",
- "neuter": "N",
- "Q1775461": "N",
+ "masculine": "masculine",
+ "Q499327": "masculine",
+ "feminine": "feminine",
+ "Q1775415": "feminine",
+ "common": "common",
+ "common gender": "common",
+ "Q1305037": "common",
+ "neuter": "neuter",
+ "Q1775461": "neuter",
}
return gender_map.get(
- wikidata_gender, ""
+ wikidata_gender.lower(), ""
) # nouns could have a gender that is not a valid attribute
@@ -484,20 +439,24 @@ def map_cases(wikidata_case: str) -> str:
----------
wikidata_case : str
The case of the noun that was queried from WikiData.
+
+ Returns
+ -------
+        The standardized case value, whether the case label or its Wikidata QID was queried.
"""
case_map = {
- "accusative": "Acc",
- "Q146078": "Acc",
- "dative": "Dat",
- "Q145599": "Dat",
- "genitive": "Gen",
- "Q146233": "Gen",
- "instrumental": "Ins",
- "Q192997": "Ins",
- "prepositional": "Pre",
- "Q2114906": "Pre",
- "locative": "Loc",
- "Q202142": "Loc",
+ "accusative": "accusative",
+ "Q146078": "accusative",
+ "dative": "dative",
+ "Q145599": "dative",
+ "genitive": "genitive",
+ "Q146233": "genitive",
+ "instrumental": "instrumental",
+ "Q192997": "instrumental",
+ "prepositional": "prepositional",
+ "Q2114906": "prepositional",
+ "locative": "locative",
+ "Q202142": "locative",
}
case = wikidata_case.split(" case")[0]
return case_map.get(case, "")
@@ -519,3 +478,71 @@ def order_annotations(annotation: str) -> str:
annotation_split = sorted(list(set(filter(None, annotation.split("/")))))
return "/".join(annotation_split)
+
+
+def format_sublanguage_name(lang: str, language_metadata=_languages) -> str:
+ """
+ Formats the name of a sub-language by appending its main language
+ in the format 'MAIN_LANG/SUB_LANG'. If the language is not a sub-language,
+ the original language name is returned as-is.
+
+ Parameters
+ ----------
+ lang : str
+ The name of the language or sub-language to format.
+
+ language_metadata : dict
+ The metadata containing information about main languages and their sub-languages.
+
+ Returns
+ -------
+ str
+ The formatted language name if it's a sub-language (e.g., 'Norwegian/Nynorsk').
+ Otherwise the original name.
+
+ Raises
+ ------
+ ValueError: If the provided language or sub-language is not found.
+
+ Example
+ -------
+ > format_sublanguage_name("nynorsk", language_metadata)
+ 'Norwegian/Nynorsk'
+
+ > format_sublanguage_name("english", language_metadata)
+ 'English'
+ """
+ for main_lang, lang_data in language_metadata.items():
+        # If lang is a main language, return it capitalized.
+ if main_lang == lang.lower():
+ return lang.capitalize()
+
+ # Check if the main language has sub-languages.
+ if "sub_languages" in lang_data:
+ # Check if the provided language is a sub-language.
+ for sub_lang in lang_data["sub_languages"]:
+ if lang.lower() == sub_lang.lower():
+ # Return the formatted name MAIN_LANG/SUB_LANG.
+ return f"{main_lang.capitalize()}/{sub_lang.capitalize()}"
+
+ # Raise ValueError if no match is found.
+ raise ValueError(f"{lang.upper()} is not a valid language or sub-language.")
+
+
+def list_all_languages(language_metadata=_languages) -> list[str]:
+ """
+ Returns a sorted list of all languages from the provided metadata dictionary, including sub-languages.
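+
+    Example
+    -------
+    >>> list_all_languages()[:3]
+    ['arabic', 'basque', 'bengali']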
+ """
+ current_languages = []
+
+ # Iterate through the language metadata.
+ for lang_key, lang_data in language_metadata.items():
+ # Check if there are sub-languages.
+ if "sub_languages" in lang_data:
+ # Add the sub-languages to current_languages.
+ current_languages.extend(lang_data["sub_languages"].keys())
+ else:
+ # If no sub-languages, add the main language.
+ current_languages.append(lang_key)
+
+ return sorted(current_languages)
diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py
index 4da51b4f6..a9dba0b9f 100644
--- a/src/scribe_data/wikidata/query_data.py
+++ b/src/scribe_data/wikidata/query_data.py
@@ -33,6 +33,7 @@
from scribe_data.cli.cli_utils import (
language_metadata,
)
+from scribe_data.utils import format_sublanguage_name, list_all_languages
from scribe_data.wikidata.wikidata_utils import sparql
@@ -103,7 +104,7 @@ def query_data(
SCRIBE_DATA_SRC_PATH / "language_data_extraction"
)
languages = [lang.capitalize() for lang in languages]
- current_languages = list(language_metadata["languages"])
+ current_languages = list_all_languages(language_metadata)
current_data_type = ["nouns", "verbs", "prepositions"]
# Assign current_languages and current_data_type if no arguments have been passed.
@@ -147,7 +148,7 @@ def query_data(
disable=interactive,
colour="MAGENTA",
):
- lang = q.parent.parent.name
+ lang = format_sublanguage_name(q.parent.parent.name, language_metadata)
target_type = q.parent.name
updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir
diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py
index 03172e077..6fb4bf791 100644
--- a/tests/cli/test_list.py
+++ b/tests/cli/test_list.py
@@ -39,17 +39,49 @@ def test_list_languages(self, mock_print):
list_languages()
expected_calls = [
call(),
- call("Language ISO QID "),
- call("-----------------------"),
- call("English en Q1860 "),
- call("French fr Q150 "),
- call("German de Q188 "),
- call("Italian it Q652 "),
- call("Portuguese pt Q5146 "),
- call("Russian ru Q7737 "),
- call("Spanish es Q1321 "),
- call("Swedish sv Q9027 "),
- call("-----------------------"),
+ call("Language ISO QID "),
+ call("--------------------------"),
+ call("Arabic ar Q13955 "),
+ call("Basque eu Q8752 "),
+ call("Bengali bn Q9610 "),
+ call("Bokmål nb Q25167 "),
+ call("Czech cs Q9056 "),
+ call("Danish da Q9035 "),
+ call("English en Q1860 "),
+ call("Esperanto eo Q143 "),
+ call("Estonian et Q9072 "),
+ call("Finnish fi Q1412 "),
+ call("French fr Q150 "),
+ call("German de Q188 "),
+ call("Greek el Q36510 "),
+ call("Gurmukhi pa Q58635 "),
+ call("Hausa ha Q56475 "),
+ call("Hebrew he Q9288 "),
+ call("Hindi hi Q11051 "),
+ call("Indonesian id Q9240 "),
+ call("Italian it Q652 "),
+ call("Japanese ja Q5287 "),
+ call("Kurmanji kmr Q36163 "),
+ call("Latin la Q397 "),
+ call("Malay ms Q9237 "),
+ call("Malayalam ml Q36236 "),
+ call("Mandarin zh Q727694 "),
+ call("Nigerian pi Q33655 "),
+ call("Nynorsk nn Q25164 "),
+ call("Polish pl Q809 "),
+ call("Portuguese pt Q5146 "),
+ call("Russian ru Q7737 "),
+ call("Shahmukhi pnb Q58635 "),
+ call("Slovak sk Q9058 "),
+ call("Spanish es Q1321 "),
+ call("Swahili sw Q7838 "),
+ call("Swedish sv Q9027 "),
+ call("Tajik tg Q9260 "),
+ call("Tamil ta Q5885 "),
+ call("Ukrainian ua Q8798 "),
+ call("Urdu ur Q11051 "),
+ call("Yoruba yo Q34311 "),
+ call("--------------------------"),
call(),
]
mock_print.assert_has_calls(expected_calls)
@@ -66,6 +98,8 @@ def test_list_data_types_all_languages(self, mock_print):
call("adverbs"),
call("emoji-keywords"),
call("nouns"),
+ call("personal-pronouns"),
+ call("postpositions"),
call("prepositions"),
call("proper-nouns"),
call("verbs"),
@@ -149,14 +183,46 @@ def test_list_languages_for_data_type_valid(self, mock_print):
call(),
call("Available languages: nouns"),
call("--------------------------"),
+ call("Arabic"),
+ call("Basque"),
+ call("Bengali"),
+ call("Chinese/Mandarin"),
+ call("Czech"),
+ call("Danish"),
call("English"),
+ call("Esperanto"),
+ call("Estonian"),
+ call("Finnish"),
call("French"),
call("German"),
+ call("Greek"),
+ call("Hausa"),
+ call("Hebrew"),
+ call("Hindustani/Hindi"),
+ call("Hindustani/Urdu"),
+ call("Indonesian"),
call("Italian"),
+ call("Japanese"),
+ call("Kurmanji"),
+ call("Latin"),
+ call("Malay"),
+ call("Malayalam"),
+ call("Norwegian/Bokmål"),
+ call("Norwegian/Nynorsk"),
+ call("Pidgin/Nigerian"),
+ call("Polish"),
call("Portuguese"),
+ call("Punjabi/Gurmukhi"),
+ call("Punjabi/Shahmukhi"),
call("Russian"),
+ call("Slovak"),
call("Spanish"),
+ call("Swahili"),
call("Swedish"),
+ call("Tajik"),
+ call("Tamil"),
+ call("Ukrainian"),
+ call("Yoruba"),
call("--------------------------"),
call(),
]
diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py
index a827666a2..333c3b7d7 100644
--- a/tests/cli/test_utils.py
+++ b/tests/cli/test_utils.py
@@ -187,7 +187,7 @@ def test_validate_language_and_data_type_invalid_language(self, mock_get_qid):
language=language_qid, data_type=data_type_qid
)
- self.assertEqual(str(context.exception), "Invalid language InvalidLanguage.")
+ self.assertEqual(str(context.exception), "Invalid language 'InvalidLanguage'.")
@patch("scribe_data.cli.total.get_qid_by_input")
def test_validate_language_and_data_type_invalid_data_type(self, mock_get_qid):
@@ -201,7 +201,7 @@ def test_validate_language_and_data_type_invalid_data_type(self, mock_get_qid):
language=language_qid, data_type=data_type_qid
)
- self.assertEqual(str(context.exception), "Invalid data-type InvalidDataType.")
+ self.assertEqual(str(context.exception), "Invalid data-type 'InvalidDataType'.")
@patch("scribe_data.cli.total.get_qid_by_input")
def test_validate_language_and_data_type_both_invalid(self, mock_get_qid):
@@ -217,7 +217,7 @@ def test_validate_language_and_data_type_both_invalid(self, mock_get_qid):
self.assertEqual(
str(context.exception),
- "Invalid language InvalidLanguage.\nInvalid data-type InvalidDataType.",
+ "Invalid language 'InvalidLanguage'.\nInvalid data-type 'InvalidDataType'.",
)
def test_validate_language_and_data_type_with_list(self):
@@ -248,5 +248,5 @@ def test_validate_language_and_data_type_mixed_validity_in_lists(self):
data_types = ["nouns", "InvalidDataType"]
with self.assertRaises(ValueError) as context:
validate_language_and_data_type(languages, data_types)
- self.assertIn("Invalid language InvalidLanguage", str(context.exception))
- self.assertIn("Invalid data-type InvalidDataType", str(context.exception))
+ self.assertIn("Invalid language 'InvalidLanguage'", str(context.exception))
+ self.assertIn("Invalid data-type 'InvalidDataType'", str(context.exception))
diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py
index 638ee09dd..43eaa2038 100644
--- a/tests/load/test_update_utils.py
+++ b/tests/load/test_update_utils.py
@@ -21,7 +21,6 @@
"""
import sys
-import unittest
from pathlib import Path
import pytest
@@ -31,25 +30,6 @@
from scribe_data import utils
-def test_get_scribe_languages():
- test_case = unittest.TestCase()
-
- # test for content, not order
- test_case.assertCountEqual(
- utils.get_scribe_languages(),
- [
- "English",
- "French",
- "German",
- "Italian",
- "Portuguese",
- "Russian",
- "Spanish",
- "Swedish",
- ],
- )
-
-
@pytest.mark.parametrize(
"language, qid_code",
[
@@ -61,6 +41,7 @@ def test_get_scribe_languages():
("russian", "Q7737"),
("spanish", "Q1321"),
("swedish", "Q9027"),
+ ("bokmål", "Q25167"),
],
)
def test_get_language_qid_positive(language, qid_code):
@@ -88,6 +69,7 @@ def test_get_language_qid_negative():
("russian", "ru"),
("spanish", "es"),
("SwedisH", "sv"),
+ ("bokmål", "nb"),
],
)
def test_get_language_iso_positive(language, iso_code):
@@ -100,7 +82,7 @@ def test_get_language_iso_negative():
assert (
str(excp.value)
- == "Gibberish is currently not a supported language for ISO conversion."
+ == "GIBBERISH is currently not a supported language for ISO conversion."
)
@@ -115,6 +97,7 @@ def test_get_language_iso_negative():
("ru", "Russian"),
("es", "Spanish"),
("sv", "Swedish"),
+ ("nb", "Bokmål"),
],
)
def test_get_language_from_iso_positive(iso_code, language):
@@ -129,89 +112,69 @@ def test_get_language_from_iso_negative():
@pytest.mark.parametrize(
- "language, remove_words",
- [
- (
- "english",
- [
- "of",
- "the",
- "The",
- "and",
- ],
- ),
- (
- "french",
- [
- "of",
- "the",
- "The",
- "and",
- ],
- ),
- ("german", ["of", "the", "The", "and", "NeinJa", "et", "redirect"]),
- ("italian", ["of", "the", "The", "and", "text", "from"]),
- ("portuguese", ["of", "the", "The", "and", "jbutadptflora"]),
- (
- "russian",
- [
- "of",
- "the",
- "The",
- "and",
- ],
- ),
- ("spanish", ["of", "the", "The", "and"]),
- ("swedish", ["of", "the", "The", "and", "Checklist", "Catalogue"]),
- ],
-)
-def test_get_language_words_to_remove(language, remove_words):
- test_case = unittest.TestCase()
-
- # ignore order, only content matters
- test_case.assertCountEqual(
- utils.get_language_words_to_remove(language), remove_words
- )
-
-
-def test_get_language_words_to_remove_negative():
- with pytest.raises(ValueError) as excp:
- _ = utils.get_language_words_to_remove("python")
-
- assert str(excp.value) == "Python is currently not a supported language."
-
-
-@pytest.mark.parametrize(
- "language, ignore_words",
+ "lang, expected_output",
[
- (
- "french",
- [
- "XXe",
- ],
- ),
- ("german", ["Gemeinde", "Familienname"]),
- ("italian", ["The", "ATP"]),
- ("portuguese", []),
- ("russian", []),
- ("spanish", []),
- ("swedish", ["databasdump"]),
+ ("nynorsk", "Norwegian/Nynorsk"),
+ ("bokmål", "Norwegian/Bokmål"),
+ ("english", "English"),
],
)
-def test_get_language_words_to_ignore(language, ignore_words):
- test_case = unittest.TestCase()
-
- # ignore order, only content matters
- test_case.assertCountEqual(
- utils.get_language_words_to_ignore(language), ignore_words
- )
+def test_format_sublanguage_name_positive(lang, expected_output):
+ assert utils.format_sublanguage_name(lang) == expected_output
-def test_get_language_words_to_ignore_negative():
+def test_format_sublanguage_name_negative():
with pytest.raises(ValueError) as excp:
- _ = utils.get_language_words_to_ignore("JAVA")
-
- assert str(excp.value) == "Java is currently not a supported language."
+ _ = utils.format_sublanguage_name("soccer")
+
+ assert str(excp.value) == "SOCCER is not a valid language or sub-language."
+
+
+def test_list_all_languages():
+ expected_languages = [
+ "arabic",
+ "basque",
+ "bengali",
+ "bokmål",
+ "czech",
+ "danish",
+ "english",
+ "esperanto",
+ "estonian",
+ "finnish",
+ "french",
+ "german",
+ "greek",
+ "gurmukhi",
+ "hausa",
+ "hebrew",
+ "hindi",
+ "indonesian",
+ "italian",
+ "japanese",
+ "kurmanji",
+ "latin",
+ "malay",
+ "malayalam",
+ "mandarin",
+ "nigerian",
+ "nynorsk",
+ "polish",
+ "portuguese",
+ "russian",
+ "shahmukhi",
+ "slovak",
+ "spanish",
+ "swahili",
+ "swedish",
+ "tajik",
+ "tamil",
+ "ukrainian",
+ "urdu",
+ "yoruba",
+ ]
+
+ assert utils.list_all_languages() == expected_languages
def test_get_ios_data_path():