Skip to content

Commit

Permalink
Merge pull request #51 from m-charlton/add-tests-for-utility-functions
Browse files (browse the repository at this point in the history)
Add tests for utility functions (resolves #50)
  • Loading branch information
andrewtavis authored Oct 15, 2023
2 parents b32c0e1 + 5ec8357 commit 6714dea
Show file tree
Hide file tree
Showing 5 changed files with 310 additions and 90 deletions.
3 changes: 1 addition & 2 deletions src/scribe_data/extract_transform/process_unicode.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,7 +21,6 @@

from scribe_data.extract_transform.emoji_utils import get_emoji_codes_to_ignore
from scribe_data.load.update_utils import (
add_num_commas,
get_language_iso,
get_path_from_et_dir,
)
Expand Down Expand Up @@ -199,7 +198,7 @@ def gen_emoji_lexicon(
if emojis_per_keyword and len(emojis) > emojis_per_keyword:
emojis[:] = emojis[:emojis_per_keyword]

total_keywords = add_num_commas(num=len(keyword_dict))
total_keywords = f"{len(keyword_dict):,}"

if verbose:
print(
Expand Down
3 changes: 1 addition & 2 deletions src/scribe_data/extract_transform/process_wiki.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -22,7 +22,6 @@
from tqdm.auto import tqdm

from scribe_data.utils import ( # get_android_data_path, get_desktop_data_path,
add_num_commas,
get_ios_data_path,
get_language_qid,
get_language_words_to_ignore,
Expand Down Expand Up @@ -142,7 +141,7 @@ def clean(
)

print(
f"Randomly sampling {add_num_commas(len(selected_idxs))} {language.capitalize()} Wikipedia articles..."
f"Randomly sampling {len(selected_idxs):,} {language.capitalize()} Wikipedia articles..."
)
texts = [texts[i] for i in selected_idxs]
print("Random sampling finished.")
Expand Down
35 changes: 3 additions & 32 deletions src/scribe_data/extract_transform/update_data.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -32,7 +32,6 @@
sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC)

from scribe_data.utils import (
add_num_commas,
check_and_return_command_line_args,
get_ios_data_path,
get_path_from_et_dir,
Expand Down Expand Up @@ -240,34 +239,6 @@
json.dump(current_data, f, ensure_ascii=False, indent=0)


def num_add_commas(num):
"""
Adds commas to a numeric string for readability.
Parameters
----------
num : int
An int to have commas added to.
Returns
-------
str_with_commas : str
The original number with commas to make it more readable.
"""
num_str = str(num)

str_list = list(num_str)
str_list = str_list[::-1]

str_list_with_commas = [
f"{s}," if i % 3 == 0 and i != 0 else s for i, s in enumerate(str_list)
]

str_list_with_commas = str_list_with_commas[::-1]

return "".join(str_list_with_commas)


# Update data_table.txt
current_data_df = pd.DataFrame(
index=sorted(list(current_data.keys())),
Expand All @@ -277,9 +248,9 @@ def num_add_commas(num):
list(current_data_df.index), list(current_data_df.columns)
):
if wt in current_data[lang].keys():
current_data_df.loc[lang, wt] = num_add_commas(current_data[lang][wt])
current_data_df.loc[lang, wt] = f"{current_data[lang][wt]:,}"
elif wt == "translations":
current_data_df.loc[lang, wt] = num_add_commas(67652)
current_data_df.loc[lang, wt] = f"{67652:,}"

current_data_df.index.name = "Languages"
current_data_df.columns = [c.capitalize() for c in current_data_df.columns]
Expand Down Expand Up @@ -342,7 +313,7 @@ def num_add_commas(num):
elif data_added_dict[l][wt] == 1: # remove the s for label
data_added_string += f" {data_added_dict[l][wt]} {wt[:-1]},"
else:
data_added_string += f" {add_num_commas(data_added_dict[l][wt])} {wt},"
data_added_string += f" {data_added_dict[l][wt]:,} {wt},"

data_added_string = data_added_string[:-1] # remove the last comma

Expand Down
86 changes: 34 additions & 52 deletions src/scribe_data/utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -18,13 +18,14 @@
get_android_data_path,
get_desktop_data_path,
check_command_line_args,
add_num_commas
check_and_return_command_line_args
"""

import ast
from typing import Any


def get_scribe_languages():
def get_scribe_languages() -> list[str]:
"""
Returns the list of currently implemented Scribe languages.
"""
Expand All @@ -40,7 +41,7 @@ def get_scribe_languages():
]


def get_language_qid(language):
def get_language_qid(language: str) -> str:
"""
Returns the QID of the given language.
Expand Down Expand Up @@ -68,13 +69,13 @@ def get_language_qid(language):

if language not in language_qid_dict:
raise ValueError(
f"{language.upper()} is not currently not a supported language for QID conversion."
f"{language.upper()} is currently not a supported language for QID conversion."
)

return language_qid_dict[language]


def get_language_iso(language):
def get_language_iso(language: str) -> str:
"""
Returns the ISO code of the given language.
Expand Down Expand Up @@ -102,13 +103,13 @@ def get_language_iso(language):

if language not in language_iso_dict:
raise ValueError(
f"{language.capitalize()} is not currently not a supported language for ISO conversion."
f"{language.capitalize()} is currently not a supported language for ISO conversion."
)

return language_iso_dict[language]


def get_language_from_iso(iso):
def get_language_from_iso(iso: str) -> str:
"""
Returns the language name for the given ISO.
Expand All @@ -135,14 +136,12 @@ def get_language_from_iso(iso):
}

if iso not in iso_language_dict:
raise ValueError(
f"{iso.upper()} is not currently not a supported ISO for language conversion."
)
raise ValueError(f"{iso.upper()} is currently not a supported ISO language.")

return iso_language_dict[iso]


def get_language_words_to_remove(language):
def get_language_words_to_remove(language: str) -> list[str]:
"""
Returns the words that should not be included as autosuggestions for the given language.
Expand All @@ -156,7 +155,7 @@ def get_language_words_to_remove(language):
The words that should not be included as autosuggestions for the given language as values of a dictionary.
"""
language = language.lower()
language_iso_dict = {
words_to_remove: dict[str, list[str]] = {
"english": [
"of",
"the",
Expand All @@ -182,10 +181,15 @@ def get_language_words_to_remove(language):
"swedish": ["of", "the", "The", "and", "Checklist", "Catalogue"],
}

return language_iso_dict[language]
if language not in words_to_remove:
raise ValueError(
f"{language.capitalize()} is currently not a supported language."
)

return words_to_remove[language]


def get_language_words_to_ignore(language):
def get_language_words_to_ignore(language: str) -> list[str]:
"""
Returns the words that should not be included as autosuggestions for the given language.
Expand All @@ -199,7 +203,7 @@ def get_language_words_to_ignore(language):
The words that should not be included as autosuggestions for the given language as values of a dictionary.
"""
language = language.lower()
language_iso_dict = {
words_to_ignore: dict[str, list[str]] = {
"french": [
"XXe",
],
Expand All @@ -211,31 +215,36 @@ def get_language_words_to_ignore(language):
"swedish": ["databasdump"],
}

return language_iso_dict[language]
if language not in words_to_ignore:
raise ValueError(
f"{language.capitalize()} is currently not a supported language."
)

return words_to_ignore[language]


def get_path_from_format_file():
def get_path_from_format_file() -> str:
"""
Returns the directory path from a data formatting file to scribe-org.
"""
return "../../../../../.."


def get_path_from_load_dir():
def get_path_from_load_dir() -> str:
"""
Returns the directory path from the load directory to scribe-org.
"""
return "../../../.."


def get_path_from_et_dir():
def get_path_from_et_dir() -> str:
"""
Returns the directory path from the extract_transform directory to scribe-org.
"""
return "../../../.."


def get_ios_data_path(language: str):
def get_ios_data_path(language: str) -> str:
"""
Returns the path to the data json of the iOS app given a language.
Expand All @@ -251,7 +260,7 @@ def get_ios_data_path(language: str):
return f"/Scribe-iOS/Keyboards/LanguageKeyboards/{language}"


def get_android_data_path(language: str):
def get_android_data_path(language: str) -> str:
"""
Returns the path to the data json of the Android app given a language.
Expand All @@ -267,7 +276,7 @@ def get_android_data_path(language: str):
return f"/Scribe-Android/app/src/main/LanguageKeyboards/{language}"


def get_desktop_data_path(language: str):
def get_desktop_data_path(language: str) -> str:
"""
Returns the path to the data json of the desktop app given a language.
Expand All @@ -283,7 +292,9 @@ def get_desktop_data_path(language: str):
return f"/Scribe-Desktop/scribe/language_guis/{language}"


def check_command_line_args(file_name, passed_values, values_to_check):
def check_command_line_args(
file_name: str, passed_values: Any, values_to_check: list[str]
) -> list[str]:
"""
Checks command line arguments passed to Scribe-Data files.
Expand Down Expand Up @@ -390,32 +401,3 @@ def check_and_return_command_line_args(
python {all_args[0]} '["comma_separated_sets_in_quotes"]'
"""
)


def add_num_commas(num):
"""
Adds commas to a numeric string for readability.
Parameters
----------
num : int or float
A number to have commas added to.
Returns
-------
str_with_commas : str
The original number with commas to make it more readable.
"""
num_str = str(num)
num_str_no_decimal = num_str.split(".")[0]
decimal = num_str.split(".")[1] if "." in num_str else None

str_list = num_str_no_decimal[::-1]
str_list_with_commas = [
f"{s}," if i % 3 == 0 and i != 0 else s for i, s in enumerate(str_list)
]

str_list_with_commas = str_list_with_commas[::-1]
str_with_commas = "".join(str_list_with_commas)

return str_with_commas if decimal is None else f"{str_with_commas}.{decimal}"
Loading

0 comments on commit 6714dea

Please sign in to comment.