diff --git a/src/scribe_data/extract_transform/process_unicode.py b/src/scribe_data/extract_transform/process_unicode.py
index 5ed9b9b26..1e8083c3f 100644
--- a/src/scribe_data/extract_transform/process_unicode.py
+++ b/src/scribe_data/extract_transform/process_unicode.py
@@ -21,7 +21,6 @@
 from scribe_data.extract_transform.emoji_utils import get_emoji_codes_to_ignore
 from scribe_data.load.update_utils import (
-    add_num_commas,
     get_language_iso,
     get_path_from_et_dir,
 )
@@ -199,7 +198,7 @@ def gen_emoji_lexicon(
         if emojis_per_keyword and len(emojis) > emojis_per_keyword:
             emojis[:] = emojis[:emojis_per_keyword]
 
-    total_keywords = add_num_commas(num=len(keyword_dict))
+    total_keywords = f"{len(keyword_dict):,}"
 
     if verbose:
         print(
diff --git a/src/scribe_data/extract_transform/process_wiki.py b/src/scribe_data/extract_transform/process_wiki.py
index 319258687..fbbe2de2a 100644
--- a/src/scribe_data/extract_transform/process_wiki.py
+++ b/src/scribe_data/extract_transform/process_wiki.py
@@ -22,7 +22,6 @@
 from tqdm.auto import tqdm
 
 from scribe_data.utils import (  # get_android_data_path, get_desktop_data_path,
-    add_num_commas,
     get_ios_data_path,
     get_language_qid,
     get_language_words_to_ignore,
@@ -142,7 +141,7 @@ def clean(
         )
 
         print(
-            f"Randomly sampling {add_num_commas(len(selected_idxs))} {language.capitalize()} Wikipedia articles..."
+            f"Randomly sampling {len(selected_idxs):,} {language.capitalize()} Wikipedia articles..."
         )
         texts = [texts[i] for i in selected_idxs]
         print("Random sampling finished.")
diff --git a/src/scribe_data/extract_transform/update_data.py b/src/scribe_data/extract_transform/update_data.py
index ca13fdac3..d2f508054 100644
--- a/src/scribe_data/extract_transform/update_data.py
+++ b/src/scribe_data/extract_transform/update_data.py
@@ -32,7 +32,6 @@
 sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC)
 
 from scribe_data.utils import (
-    add_num_commas,
     check_and_return_command_line_args,
     get_ios_data_path,
     get_path_from_et_dir,
@@ -240,34 +239,6 @@
     json.dump(current_data, f, ensure_ascii=False, indent=0)
 
 
-def num_add_commas(num):
-    """
-    Adds commas to a numeric string for readability.
-
-    Parameters
-    ----------
-    num : int
-        An int to have commas added to.
-
-    Returns
-    -------
-    str_with_commas : str
-        The original number with commas to make it more readable.
-    """
-    num_str = str(num)
-
-    str_list = list(num_str)
-    str_list = str_list[::-1]
-
-    str_list_with_commas = [
-        f"{s}," if i % 3 == 0 and i != 0 else s for i, s in enumerate(str_list)
-    ]
-
-    str_list_with_commas = str_list_with_commas[::-1]
-
-    return "".join(str_list_with_commas)
-
-
 # Update data_table.txt
 current_data_df = pd.DataFrame(
     index=sorted(list(current_data.keys())),
@@ -277,9 +248,9 @@ def num_add_commas(num):
     list(current_data_df.index), list(current_data_df.columns)
 ):
     if wt in current_data[lang].keys():
-        current_data_df.loc[lang, wt] = num_add_commas(current_data[lang][wt])
+        current_data_df.loc[lang, wt] = f"{current_data[lang][wt]:,}"
     elif wt == "translations":
-        current_data_df.loc[lang, wt] = num_add_commas(67652)
+        current_data_df.loc[lang, wt] = f"{67652:,}"
 
 current_data_df.index.name = "Languages"
 current_data_df.columns = [c.capitalize() for c in current_data_df.columns]
@@ -342,7 +313,7 @@ def num_add_commas(num):
         elif data_added_dict[l][wt] == 1:  # remove the s for label
             data_added_string += f" {data_added_dict[l][wt]} {wt[:-1]},"
         else:
-            data_added_string += f" {add_num_commas(data_added_dict[l][wt])} {wt},"
+            data_added_string += f" {data_added_dict[l][wt]:,} {wt},"
 
     data_added_string = data_added_string[:-1]  # remove the last comma
 
diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index 3c7de07a8..da7d48f35 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -18,13 +18,14 @@
     get_android_data_path,
     get_desktop_data_path,
     check_command_line_args,
-    add_num_commas
+    check_and_return_command_line_args
 """
 
 import ast
+from typing import Any
 
 
-def get_scribe_languages():
+def get_scribe_languages() -> list[str]:
     """
     Returns the list of currently implemented Scribe languages.
     """
@@ -40,7 +41,7 @@ def get_scribe_languages():
     ]
 
 
-def get_language_qid(language):
+def get_language_qid(language: str) -> str:
     """
     Returns the QID of the given language.
 
@@ -68,13 +69,13 @@ def get_language_qid(language):
 
     if language not in language_qid_dict:
         raise ValueError(
-            f"{language.upper()} is not currently not a supported language for QID conversion."
+            f"{language.upper()} is currently not a supported language for QID conversion."
        )
 
     return language_qid_dict[language]
 
 
-def get_language_iso(language):
+def get_language_iso(language: str) -> str:
     """
     Returns the ISO code of the given language.
 
@@ -102,13 +103,13 @@ def get_language_iso(language):
 
     if language not in language_iso_dict:
         raise ValueError(
-            f"{language.capitalize()} is not currently not a supported language for ISO conversion."
+            f"{language.capitalize()} is currently not a supported language for ISO conversion."
        )
 
     return language_iso_dict[language]
 
 
-def get_language_from_iso(iso):
+def get_language_from_iso(iso: str) -> str:
     """
     Returns the language name for the given ISO.
 
@@ -135,14 +136,12 @@ def get_language_from_iso(iso):
     }
 
     if iso not in iso_language_dict:
-        raise ValueError(
-            f"{iso.upper()} is not currently not a supported ISO for language conversion."
-        )
+        raise ValueError(f"{iso.upper()} is currently not a supported ISO language.")
 
     return iso_language_dict[iso]
 
 
-def get_language_words_to_remove(language):
+def get_language_words_to_remove(language: str) -> list[str]:
     """
     Returns the words that should not be included as autosuggestions for the given language.
 
@@ -156,7 +155,7 @@ def get_language_words_to_remove(language):
         The words that should not be included as autosuggestions for the given language as values of a dictionary.
     """
     language = language.lower()
-    language_iso_dict = {
+    words_to_remove: dict[str, list[str]] = {
         "english": [
             "of",
             "the",
@@ -182,10 +181,15 @@ def get_language_words_to_remove(language):
         "swedish": ["of", "the", "The", "and", "Checklist", "Catalogue"],
     }
 
-    return language_iso_dict[language]
+    if language not in words_to_remove:
+        raise ValueError(
+            f"{language.capitalize()} is currently not a supported language."
+        )
+
+    return words_to_remove[language]
 
 
-def get_language_words_to_ignore(language):
+def get_language_words_to_ignore(language: str) -> list[str]:
     """
     Returns the words that should not be included as autosuggestions for the given language.
 
@@ -199,7 +203,7 @@ def get_language_words_to_ignore(language):
         The words that should not be included as autosuggestions for the given language as values of a dictionary.
     """
     language = language.lower()
-    language_iso_dict = {
+    words_to_ignore: dict[str, list[str]] = {
         "french": [
             "XXe",
         ],
@@ -211,31 +215,36 @@
         "swedish": ["databasdump"],
     }
 
-    return language_iso_dict[language]
+    if language not in words_to_ignore:
+        raise ValueError(
+            f"{language.capitalize()} is currently not a supported language."
+        )
+
+    return words_to_ignore[language]
 
 
-def get_path_from_format_file():
+def get_path_from_format_file() -> str:
     """
     Returns the directory path from a data formatting file to scribe-org.
     """
     return "../../../../../.."
 
 
-def get_path_from_load_dir():
+def get_path_from_load_dir() -> str:
     """
     Returns the directory path from the load directory to scribe-org.
     """
     return "../../../.."
 
 
-def get_path_from_et_dir():
+def get_path_from_et_dir() -> str:
     """
     Returns the directory path from the extract_transform directory to scribe-org.
     """
     return "../../../.."
 
 
-def get_ios_data_path(language: str):
+def get_ios_data_path(language: str) -> str:
     """
     Returns the path to the data json of the iOS app given a language.
 
@@ -251,7 +260,7 @@
     return f"/Scribe-iOS/Keyboards/LanguageKeyboards/{language}"
 
 
-def get_android_data_path(language: str):
+def get_android_data_path(language: str) -> str:
     """
     Returns the path to the data json of the Android app given a language.
 
@@ -267,7 +276,7 @@
     return f"/Scribe-Android/app/src/main/LanguageKeyboards/{language}"
 
 
-def get_desktop_data_path(language: str):
+def get_desktop_data_path(language: str) -> str:
     """
     Returns the path to the data json of the desktop app given a language.
 
@@ -283,7 +292,9 @@
     return f"/Scribe-Desktop/scribe/language_guis/{language}"
 
 
-def check_command_line_args(file_name, passed_values, values_to_check):
+def check_command_line_args(
+    file_name: str, passed_values: Any, values_to_check: list[str]
+) -> list[str]:
     """
     Checks command line arguments passed to Scribe-Data files.
 
@@ -390,32 +401,3 @@ def check_and_return_command_line_args(
             python {all_args[0]} '["comma_separated_sets_in_quotes"]'
             """
         )
-
-
-def add_num_commas(num):
-    """
-    Adds commas to a numeric string for readability.
-
-    Parameters
-    ----------
-    num : int or float
-        A number to have commas added to.
-
-    Returns
-    -------
-    str_with_commas : str
-        The original number with commas to make it more readable.
-    """
-    num_str = str(num)
-    num_str_no_decimal = num_str.split(".")[0]
-    decimal = num_str.split(".")[1] if "." in num_str else None
-
-    str_list = num_str_no_decimal[::-1]
-    str_list_with_commas = [
-        f"{s}," if i % 3 == 0 and i != 0 else s for i, s in enumerate(str_list)
-    ]
-
-    str_list_with_commas = str_list_with_commas[::-1]
-    str_with_commas = "".join(str_list_with_commas)
-
-    return str_with_commas if decimal is None else f"{str_with_commas}.{decimal}"
diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py
index d594aacf2..b8722ac79 100644
--- a/tests/load/test_update_utils.py
+++ b/tests/load/test_update_utils.py
@@ -1,5 +1,274 @@
+import unittest
+import pytest
+
 from scribe_data import utils
 
 
-def test_get_language_qid():
-    assert utils.get_language_qid("french") == "Q150"
+def test_get_scribe_languages():
+    test_case = unittest.TestCase()
+
+    # test for content, not order
+    test_case.assertCountEqual(
+        utils.get_scribe_languages(),
+        [
+            "English",
+            "French",
+            "German",
+            "Italian",
+            "Portuguese",
+            "Russian",
+            "Spanish",
+            "Swedish",
+        ],
+    )
+
+
+@pytest.mark.parametrize(
+    "language, qid_code",
+    [
+        ("English", "Q1860"),
+        ("french", "Q150"),
+        ("GERMAN", "Q188"),
+        ("iTalian", "Q652"),
+        ("poRTUGuese", "Q5146"),
+        ("russian", "Q7737"),
+        ("spanish", "Q1321"),
+        ("swedish", "Q9027"),
+    ],
+)
+def test_get_language_qid_positive(language, qid_code):
+    assert utils.get_language_qid(language) == qid_code
+
+
+def test_get_language_qid_negative():
+    with pytest.raises(ValueError) as excp:
+        _ = utils.get_language_qid("Newspeak")
+
+    assert (
+        str(excp.value)
+        == "NEWSPEAK is currently not a supported language for QID conversion."
+    )
+
+
+@pytest.mark.parametrize(
+    "language, iso_code",
+    [
+        ("English", "en"),
+        ("french", "fr"),
+        ("GERMAN", "de"),
+        ("iTalian", "it"),
+        ("poRTUGuese", "pt"),
+        ("russian", "ru"),
+        ("spanish", "es"),
+        ("SwedisH", "sv"),
+    ],
+)
+def test_get_language_iso_positive(language, iso_code):
+    assert utils.get_language_iso(language) == iso_code
+
+
+def test_get_language_iso_negative():
+    with pytest.raises(ValueError) as excp:
+        _ = utils.get_language_iso("gibberish")
+
+    assert (
+        str(excp.value)
+        == "Gibberish is currently not a supported language for ISO conversion."
+    )
+
+
+@pytest.mark.parametrize(
+    "iso_code, language",
+    [
+        ("en", "English"),
+        ("fr", "French"),
+        ("de", "German"),
+        ("it", "Italian"),
+        ("pt", "Portuguese"),
+        ("ru", "Russian"),
+        ("es", "Spanish"),
+        ("sv", "Swedish"),
+    ],
+)
+def test_get_language_from_iso_positive(iso_code, language):
+    assert utils.get_language_from_iso(iso_code) == language
+
+
+def test_get_language_from_iso_negative():
+    with pytest.raises(ValueError) as excp:
+        _ = utils.get_language_from_iso("ixi")
+
+    assert str(excp.value) == "IXI is currently not a supported ISO language."
+
+
+@pytest.mark.parametrize(
+    "language, remove_words",
+    [
+        (
+            "english",
+            [
+                "of",
+                "the",
+                "The",
+                "and",
+            ],
+        ),
+        (
+            "french",
+            [
+                "of",
+                "the",
+                "The",
+                "and",
+            ],
+        ),
+        ("german", ["of", "the", "The", "and", "NeinJa", "et", "redirect"]),
+        ("italian", ["of", "the", "The", "and", "text", "from"]),
+        ("portuguese", ["of", "the", "The", "and", "jbutadptflora"]),
+        (
+            "russian",
+            [
+                "of",
+                "the",
+                "The",
+                "and",
+            ],
+        ),
+        ("spanish", ["of", "the", "The", "and"]),
+        ("swedish", ["of", "the", "The", "and", "Checklist", "Catalogue"]),
+    ],
+)
+def test_get_language_words_to_remove(language, remove_words):
+    test_case = unittest.TestCase()
+
+    # ignore order, only content matters
+    test_case.assertCountEqual(
+        utils.get_language_words_to_remove(language), remove_words
+    )
+
+
+def test_get_language_words_to_remove_negative():
+    with pytest.raises(ValueError) as excp:
+        _ = utils.get_language_words_to_remove("python")
+
+    assert str(excp.value) == "Python is currently not a supported language."
+
+
+@pytest.mark.parametrize(
+    "language, ignore_words",
+    [
+        (
+            "french",
+            [
+                "XXe",
+            ],
+        ),
+        ("german", ["Gemeinde", "Familienname"]),
+        ("italian", ["The", "ATP"]),
+        ("portuguese", []),
+        ("russian", []),
+        ("spanish", []),
+        ("swedish", ["databasdump"]),
+    ],
+)
+def test_get_language_words_to_ignore(language, ignore_words):
+    test_case = unittest.TestCase()
+
+    # ignore order, only content matters
+    test_case.assertCountEqual(
+        utils.get_language_words_to_ignore(language), ignore_words
+    )
+
+
+def test_get_language_words_to_ignore_negative():
+    with pytest.raises(ValueError) as excp:
+        _ = utils.get_language_words_to_ignore("JAVA")
+
+    assert str(excp.value) == "Java is currently not a supported language."
+
+
+def test_get_path_from_format_file():
+    assert utils.get_path_from_format_file() == "../../../../../.."
+
+
+def test_get_path_from_load_dir():
+    assert utils.get_path_from_load_dir() == "../../../.."
+
+
+def test_get_path_from_et_dir():
+    # TODO: file path is same as above. Is this correct?
+    assert utils.get_path_from_et_dir() == "../../../.."
+
+
+def test_get_ios_data_path():
+    assert (
+        utils.get_ios_data_path("suomi")
+        == "/Scribe-iOS/Keyboards/LanguageKeyboards/suomi"
+    )
+
+
+def test_get_android_data_path():
+    assert (
+        utils.get_android_data_path("Robbie")
+        == "/Scribe-Android/app/src/main/LanguageKeyboards/Robbie"
+    )
+
+
+def test_get_desktop_data_path():
+    assert (
+        utils.get_desktop_data_path("PAVEMENT")
+        == "/Scribe-Desktop/scribe/language_guis/PAVEMENT"
+    )
+
+
+@pytest.mark.parametrize(
+    "passed_values, values_to_check, expected",
+    [
+        ("['1', '2', '3']", ["1", "2", "3"], ["1", "2", "3"]),
+        ("['1', '3', '2']", ["1", "2", "3"], ["1", "3", "2"]),
+        ("['1', '2']", ["1", "2", "3"], ["1", "2"]),
+        ("['abc']", ["def", "abc", "ghi"], ["abc"]),
+        ("[]", ["1", "2", "3"], []),
+    ],
+)
+def test_check_command_line_args_positive(passed_values, values_to_check, expected):
+    assert (
+        utils.check_command_line_args("pass.txt", passed_values, values_to_check)
+        == expected
+    )
+
+
+def test_check_command_line_args_fail_not_subset():
+    with pytest.raises(ValueError):
+        _ = utils.check_command_line_args("Fail.txt", "['1', '2', '3']", ["1", "2"])
+
+
+def test_check_command_line_args_passed_values_not_list():
+    with pytest.raises(ValueError):
+        _ = utils.check_command_line_args("Fail.txt", "('1', '2', '3')", ["1", "2"])
+
+
+def test_check_command_line_args_passed_values_invalid_arg():
+    with pytest.raises(ValueError):
+        _ = utils.check_command_line_args("Fail.txt", 3, ["3"])
+
+
+def test_check_and_return_command_line_args_one_arg():
+    assert utils.check_and_return_command_line_args(["1"]) == (None, None)
+
+
+def test_check_and_return_command_line_args_two_args():
+    assert utils.check_and_return_command_line_args(
+        ["a.txt", '["1","2"]'], ["1", "2", "3"], ["1", "2", "3"]
+    ) == (["1", "2"], None)
+
+
+def test_check_and_return_command_line_args_three_args():
+    assert utils.check_and_return_command_line_args(
+        ["a.txt", '["1","2"]', '["3"]'], ["1", "2", "3"], ["1", "2", "3"]
+    ) == (["1", "2"], ["3"])
+
+
+def test_check_and_return_command_line_args_too_many_args():
+    with pytest.raises(ValueError):
+        _ = utils.check_and_return_command_line_args(["a", "b", "c", "d"])
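
Note: the diff above drops the hand-rolled add_num_commas / num_add_commas helpers in favour of Python's built-in format specification. A minimal sketch of the formatting behaviour it relies on; the variable names and numbers below are illustrative only, not taken from the repository:

```python
# The "," option of the format spec mini-language inserts thousands separators;
# this is what the f"{...:,}" replacements in the diff rely on.
article_count = 1234567
print(f"Randomly sampling {article_count:,} articles...")  # -> 1,234,567

# Floats keep their decimal part and negative values are grouped correctly,
# so no separate int/float handling or manual string reversal is needed.
print(f"{9876543.21:,}")   # -> 9,876,543.21
print(f"{-67652:,}")       # -> -67,652

# format() gives the same result when the value is not already in an f-string.
print(format(67652, ","))  # -> 67,652
```

Because the interpreter handles the digit grouping, both copies of the comma helper (add_num_commas in utils.py and num_add_commas in update_data.py) can be removed, which is what the corresponding hunks above do.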