Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tests for utility functions (resolves #50) #51

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions src/scribe_data/extract_transform/process_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

from scribe_data.extract_transform.emoji_utils import get_emoji_codes_to_ignore
from scribe_data.load.update_utils import (
add_num_commas,
get_language_iso,
get_path_from_et_dir,
)
Expand Down Expand Up @@ -199,7 +198,7 @@ def gen_emoji_lexicon(
if emojis_per_keyword and len(emojis) > emojis_per_keyword:
emojis[:] = emojis[:emojis_per_keyword]

total_keywords = add_num_commas(num=len(keyword_dict))
total_keywords = f"{len(keyword_dict):,}"

if verbose:
print(
Expand Down
3 changes: 1 addition & 2 deletions src/scribe_data/extract_transform/process_wiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
from tqdm.auto import tqdm

from scribe_data.utils import ( # get_android_data_path, get_desktop_data_path,
add_num_commas,
get_ios_data_path,
get_language_qid,
get_language_words_to_ignore,
Expand Down Expand Up @@ -142,7 +141,7 @@ def clean(
)

print(
f"Randomly sampling {add_num_commas(len(selected_idxs))} {language.capitalize()} Wikipedia articles..."
f"Randomly sampling {len(selected_idxs):,} {language.capitalize()} Wikipedia articles..."
)
texts = [texts[i] for i in selected_idxs]
print("Random sampling finished.")
Expand Down
35 changes: 3 additions & 32 deletions src/scribe_data/extract_transform/update_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC)

from scribe_data.utils import (
add_num_commas,
check_and_return_command_line_args,
get_ios_data_path,
get_path_from_et_dir,
Expand Down Expand Up @@ -240,34 +239,6 @@
json.dump(current_data, f, ensure_ascii=False, indent=0)


def num_add_commas(num):
    """
    Adds thousands separators (commas) to a number for readability.

    Only the integer digits are grouped. A leading sign and any fractional
    part are carried over unchanged, so negative numbers and floats are
    formatted correctly (e.g. -123 -> "-123", 1234.5 -> "1,234.5").

    Parameters
    ----------
        num : int, float or str
            The number (or its string form) to have commas added to.

    Returns
    -------
        str_with_commas : str
            The original number with commas to make it more readable.
            Values whose integer part is not made up purely of digits
            (e.g. exponent notation such as "1e+16") are returned unchanged.
    """
    num_str = str(num)

    # Peel off a leading sign so it never counts as a "digit" when grouping;
    # otherwise -123 would be rendered as "-,123".
    sign = ""
    if num_str[:1] in ("-", "+"):
        sign, num_str = num_str[0], num_str[1:]

    # Only the part before the decimal point is grouped.
    whole, dot, fraction = num_str.partition(".")

    if not whole.isdigit():
        # Exponent notation, "nan", "inf", empty input, ...: nothing to group.
        return f"{sign}{num_str}"

    # The first group holds the 1-3 leading digits, every later group exactly 3.
    head = len(whole) % 3 or 3
    groups = [whole[:head]]
    groups.extend(whole[i : i + 3] for i in range(head, len(whole), 3))

    return f"{sign}{','.join(groups)}{dot}{fraction}"


# Update data_table.txt
current_data_df = pd.DataFrame(
index=sorted(list(current_data.keys())),
Expand All @@ -277,9 +248,9 @@ def num_add_commas(num):
list(current_data_df.index), list(current_data_df.columns)
):
if wt in current_data[lang].keys():
current_data_df.loc[lang, wt] = num_add_commas(current_data[lang][wt])
current_data_df.loc[lang, wt] = f"{current_data[lang][wt]:,}"
elif wt == "translations":
current_data_df.loc[lang, wt] = num_add_commas(67652)
current_data_df.loc[lang, wt] = f"{67652:,}"

current_data_df.index.name = "Languages"
current_data_df.columns = [c.capitalize() for c in current_data_df.columns]
Expand Down Expand Up @@ -342,7 +313,7 @@ def num_add_commas(num):
elif data_added_dict[l][wt] == 1: # remove the s for label
data_added_string += f" {data_added_dict[l][wt]} {wt[:-1]},"
else:
data_added_string += f" {add_num_commas(data_added_dict[l][wt])} {wt},"
data_added_string += f" {data_added_dict[l][wt]:,} {wt},"

data_added_string = data_added_string[:-1] # remove the last comma

Expand Down
86 changes: 34 additions & 52 deletions src/scribe_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@
get_android_data_path,
get_desktop_data_path,
check_command_line_args,
add_num_commas
check_and_return_command_line_args
"""

import ast
from typing import Any


def get_scribe_languages():
def get_scribe_languages() -> list[str]:
"""
Returns the list of currently implemented Scribe languages.
"""
Expand All @@ -40,7 +41,7 @@ def get_scribe_languages():
]


def get_language_qid(language):
def get_language_qid(language: str) -> str:
"""
Returns the QID of the given language.

Expand Down Expand Up @@ -68,13 +69,13 @@ def get_language_qid(language):

if language not in language_qid_dict:
raise ValueError(
f"{language.upper()} is not currently not a supported language for QID conversion."
f"{language.upper()} is currently not a supported language for QID conversion."
)

return language_qid_dict[language]


def get_language_iso(language):
def get_language_iso(language: str) -> str:
"""
Returns the ISO code of the given language.

Expand Down Expand Up @@ -102,13 +103,13 @@ def get_language_iso(language):

if language not in language_iso_dict:
raise ValueError(
f"{language.capitalize()} is not currently not a supported language for ISO conversion."
f"{language.capitalize()} is currently not a supported language for ISO conversion."
)

return language_iso_dict[language]


def get_language_from_iso(iso):
def get_language_from_iso(iso: str) -> str:
"""
Returns the language name for the given ISO.

Expand All @@ -135,14 +136,12 @@ def get_language_from_iso(iso):
}

if iso not in iso_language_dict:
raise ValueError(
f"{iso.upper()} is not currently not a supported ISO for language conversion."
)
raise ValueError(f"{iso.upper()} is currently not a supported ISO language.")

return iso_language_dict[iso]


def get_language_words_to_remove(language):
def get_language_words_to_remove(language: str) -> list[str]:
"""
Returns the words that should not be included as autosuggestions for the given language.

Expand All @@ -156,7 +155,7 @@ def get_language_words_to_remove(language):
The words that should not be included as autosuggestions for the given language as values of a dictionary.
"""
language = language.lower()
language_iso_dict = {
words_to_remove: dict[str, list[str]] = {
"english": [
"of",
"the",
Expand All @@ -182,10 +181,15 @@ def get_language_words_to_remove(language):
"swedish": ["of", "the", "The", "and", "Checklist", "Catalogue"],
}

return language_iso_dict[language]
if language not in words_to_remove:
raise ValueError(
f"{language.capitalize()} is currently not a supported language."
)

return words_to_remove[language]


def get_language_words_to_ignore(language):
def get_language_words_to_ignore(language: str) -> list[str]:
"""
Returns the words that should not be included as autosuggestions for the given language.

Expand All @@ -199,7 +203,7 @@ def get_language_words_to_ignore(language):
The words that should not be included as autosuggestions for the given language as values of a dictionary.
"""
language = language.lower()
language_iso_dict = {
words_to_ignore: dict[str, list[str]] = {
"french": [
"XXe",
],
Expand All @@ -211,31 +215,36 @@ def get_language_words_to_ignore(language):
"swedish": ["databasdump"],
}

return language_iso_dict[language]
if language not in words_to_ignore:
raise ValueError(
f"{language.capitalize()} is currently not a supported language."
)

return words_to_ignore[language]


def get_path_from_format_file():
def get_path_from_format_file() -> str:
"""
Returns the directory path from a data formatting file to scribe-org.
"""
return "../../../../../.."


def get_path_from_load_dir():
def get_path_from_load_dir() -> str:
"""
Returns the directory path from the load directory to scribe-org.
"""
return "../../../.."


def get_path_from_et_dir():
def get_path_from_et_dir() -> str:
"""
Returns the directory path from the extract_transform directory to scribe-org.
"""
return "../../../.."


def get_ios_data_path(language: str):
def get_ios_data_path(language: str) -> str:
"""
Returns the path to the data json of the iOS app given a language.

Expand All @@ -251,7 +260,7 @@ def get_ios_data_path(language: str):
return f"/Scribe-iOS/Keyboards/LanguageKeyboards/{language}"


def get_android_data_path(language: str):
def get_android_data_path(language: str) -> str:
"""
Returns the path to the data json of the Android app given a language.

Expand All @@ -267,7 +276,7 @@ def get_android_data_path(language: str):
return f"/Scribe-Android/app/src/main/LanguageKeyboards/{language}"


def get_desktop_data_path(language: str):
def get_desktop_data_path(language: str) -> str:
"""
Returns the path to the data json of the desktop app given a language.

Expand All @@ -283,7 +292,9 @@ def get_desktop_data_path(language: str):
return f"/Scribe-Desktop/scribe/language_guis/{language}"


def check_command_line_args(file_name, passed_values, values_to_check):
def check_command_line_args(
file_name: str, passed_values: Any, values_to_check: list[str]
) -> list[str]:
"""
Checks command line arguments passed to Scribe-Data files.

Expand Down Expand Up @@ -390,32 +401,3 @@ def check_and_return_command_line_args(
python {all_args[0]} '["comma_separated_sets_in_quotes"]'
"""
)


def add_num_commas(num):
    """
    Adds commas to a number for readability.

    The integer digits are grouped in threes from the right. A leading sign
    and the decimal part (if any) are preserved as they appear in str(num).

    Parameters
    ----------
        num : int or float
            A number to have commas added to.

    Returns
    -------
        str_with_commas : str
            The original number with commas to make it more readable.
            If the integer part of str(num) is not purely digits (for
            example exponent notation such as "1e+16"), the string form is
            returned without any commas.
    """
    num_str = str(num)

    # Keep the sign out of the grouping so that -123 does not become "-,123".
    sign = ""
    if num_str.startswith(("-", "+")):
        sign = num_str[0]
        num_str = num_str[1:]

    num_str_no_decimal, separator, decimal = num_str.partition(".")

    # Exponent notation, "inf" and "nan" have no plain digit run to group.
    if not num_str_no_decimal.isdigit():
        return f"{sign}{num_str}"

    # Slice three digits at a time off the right-hand end.
    chunks = []
    remaining = num_str_no_decimal
    while remaining:
        chunks.append(remaining[-3:])
        remaining = remaining[:-3]

    str_with_commas = ",".join(reversed(chunks))

    return f"{sign}{str_with_commas}{separator}{decimal}"
Loading