Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migration - Lowercase the languages once in main and remove the lowercasing from other places, but capitalize for print statements #502

Merged
merged 6 commits into from
Nov 8, 2024
6 changes: 3 additions & 3 deletions src/scribe_data/check/check_project_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,15 @@ def get_available_languages() -> dict[str, list[str]]:
for lang_folder in extraction_dir.iterdir():
if lang_folder.is_dir(): # check if it's a directory
lang_name = (
lang_folder.name.lower()
lang_folder.name
) # normalize keys to lowercase for case-insensitive comparison
sub_languages = []

# Check if lang_folder contains subdirectories.
for sub_folder in lang_folder.iterdir():
if sub_folder.is_dir():
sub_lang_name = (
sub_folder.name.lower()
sub_folder.name
) # normalize to lowercase for case-insensitive comparison.

# Check for almost similar keys using difflib.
Expand Down Expand Up @@ -183,7 +183,7 @@ def check_language_metadata():
SystemExit:
If any missing languages or properties are found, the function exits the script with a status code of 1.
"""
languages_in_metadata = {key.lower(): value for key, value in _languages.items()}
languages_in_metadata = {key: value for key, value in _languages.items()}

languages_in_directory = get_available_languages()

Expand Down
20 changes: 9 additions & 11 deletions src/scribe_data/cli/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,17 +73,15 @@ def convert_to_json(
-------
None
"""
normalized_language = language.lower()

if not normalized_language:
if not language:
raise ValueError(f"Language '{language.capitalize()}' is not recognized.")

data_types = [data_type] if isinstance(data_type, str) else data_type

if output_dir is None:
output_dir = DEFAULT_JSON_EXPORT_DIR

json_output_dir = Path(output_dir) / normalized_language.capitalize()
json_output_dir = Path(output_dir) / language.capitalize()
json_output_dir.mkdir(parents=True, exist_ok=True)

for dtype in data_types:
Expand Down Expand Up @@ -159,7 +157,7 @@ def convert_to_json(
f"File '{output_file}' already exists. Overwrite? (y/n): "
)
if user_input.lower() != "y":
print(f"Skipping {normalized_language['language']} - {dtype}")
print(f"Skipping {language['language']} - {dtype}")
continue

try:
Expand Down Expand Up @@ -211,9 +209,7 @@ def convert_to_csv_or_tsv(
-------
None
"""
normalized_language = language.lower()

if not normalized_language:
if not language:
raise ValueError(f"Language '{language.capitalize()}' is not recognized.")

if isinstance(data_type, str):
Expand Down Expand Up @@ -325,7 +321,7 @@ def convert_to_csv_or_tsv(
print(f"Error writing to '{output_file}': {e}")
continue

print(f"Data for {language} {dtype} written to '{output_file}'")
print(f"Data for {language.capitalize()} {dtype} written to '{output_file}'")


# MARK: SQLITE
Expand Down Expand Up @@ -389,7 +385,7 @@ def convert_to_sqlite(

data_to_sqlite(languages, specific_tables)

source_file = f"{get_language_iso(language).upper()}LanguageData.sqlite"
source_file = f"{get_language_iso(language).capitalize()}LanguageData.sqlite"
source_path = input_file.parent / source_file
target_path = output_dir / source_file

Expand Down Expand Up @@ -443,7 +439,9 @@ def convert_wrapper(
None
"""
output_type = output_type.lower()
print(f"Converting data for {language} {data_type} to {output_type}...")
print(
f"Converting data for {language.capitalize()} {data_type.capitalize()} to {output_type}..."
)

# Route the function call to the correct conversion function.
if output_type == "json":
Expand Down
12 changes: 7 additions & 5 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,27 +98,27 @@ def get_data(
# MARK: Get All
if all:
if language:
print(f"Updating all data types for language for {language}")
print(f"Updating all data types for language for {language.capitalize()}")
query_data(
languages=[language],
data_type=None,
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all data types with specified language for {language}."
f"Query completed for all data types with specified language for {language.capitalize()}."
)

elif data_type:
print(f"Updating all languages for data type: {data_type}")
print(f"Updating all languages for data type: {data_type.capitalize()}")
query_data(
languages=None,
data_type=[data_type],
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all languages with specified data type for {data_type}."
f"Query completed for all languages with specified data type for {data_type.capitalize()}."
)

else:
Expand All @@ -142,7 +142,9 @@ def get_data(

elif language or data_type:
data_type = data_type[0] if isinstance(data_type, list) else data_type
print(f"Updating data for language(s): {language}; data type(s): {data_type}")
print(
f"Updating data for language(s): {language.capitalize()}; data type(s): {data_type.capitalize()}"
)
query_data(
languages=languages,
data_type=data_types,
Expand Down
11 changes: 7 additions & 4 deletions src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,8 +280,8 @@ def main() -> None:

else:
get_data(
language=args.language,
data_type=args.data_type,
language=args.language.lower(),
data_type=args.data_type.lower(),
output_type=args.output_type,
output_dir=args.output_dir,
outputs_per_entry=args.outputs_per_entry,
Expand All @@ -292,14 +292,17 @@ def main() -> None:
elif args.command in ["total", "t"]:
if args.interactive:
start_interactive_mode(operation="total")

else:
total_wrapper(
language=args.language, data_type=args.data_type, all_bool=args.all
language=args.language.lower() if args.language is not None else None,
data_type=args.data_type.lower() if args.data_type is not None else None,
all_bool=args.all,
)

elif args.command in ["convert", "c"]:
convert_wrapper(
language=args.language,
language=args.language.lower(),
data_type=args.data_type,
output_type=args.output_type,
input_file=args.input_file,
Expand Down
45 changes: 25 additions & 20 deletions src/scribe_data/cli/total.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,11 @@ def get_qid_by_input(input_str):
The QID corresponding to the input string, or None if not found.
"""
if input_str:
input_str_lower = input_str.lower()
if input_str_lower in language_to_qid:
return language_to_qid[input_str_lower]
if input_str in language_to_qid:
return language_to_qid[input_str]

elif input_str_lower in data_type_metadata:
return data_type_metadata[input_str_lower]
elif input_str in data_type_metadata:
return data_type_metadata[input_str]

return None

Expand Down Expand Up @@ -102,7 +101,7 @@ def get_datatype_list(language):

if not data_types:
raise ValueError(
f"No data types available for sub-languages of '{formatted_language}'."
f"No data types available for sub-languages of '{formatted_language.capitalize()}'."
)

return sorted(set(data_types)) # remove duplicates and sort
Expand All @@ -116,7 +115,7 @@ def get_datatype_list(language):

if not data_types:
raise ValueError(
f"No data types available for language '{formatted_language}'."
f"No data types available for language '{formatted_language.capitalize()}'."
)

return sorted(data_types)
Expand Down Expand Up @@ -184,12 +183,12 @@ def print_total_lexemes(language: str = None):
and language[1:].isdigit()
):
print(
f"Wikidata QID {language} passed. Checking validity and then all data types."
f"Wikidata QID {language.capitalize()} passed. Checking validity and then all data types."
)
language = check_qid_is_language(qid=language)

else:
print(f"Returning total counts for {language} data types...\n")
print(f"Returning total counts for {language.capitalize()} data types...\n")

def print_total_header(language, dt, total_lexemes):
"""
Expand All @@ -210,7 +209,9 @@ def print_total_header(language, dt, total_lexemes):

first_row = True
for dt in data_types:
total_lexemes = get_total_lexemes(lang, dt, False)
total_lexemes = get_total_lexemes(
language=lang, data_type=dt, do_print=False
)
total_lexemes = f"{total_lexemes:,}"
if first_row:
print_total_header(lang, dt, total_lexemes)
Expand All @@ -233,7 +234,9 @@ def print_total_header(language, dt, total_lexemes):
data_types = get_datatype_list(language)

for dt in data_types:
total_lexemes = get_total_lexemes(language, dt, False)
total_lexemes = get_total_lexemes(
language=language, data_type=dt, do_print=False
)
total_lexemes = f"{total_lexemes:,}"
if first_row:
print_total_header(language, dt, total_lexemes)
Expand All @@ -248,7 +251,7 @@ def print_total_header(language, dt, total_lexemes):
# MARK: Get Total


def get_total_lexemes(language, data_type, doPrint=True):
def get_total_lexemes(language, data_type, do_print=True):
"""
Get the total number of lexemes for a given language and data type from Wikidata.

Expand Down Expand Up @@ -344,13 +347,13 @@ def get_total_lexemes(language, data_type, doPrint=True):

output_template = ""
if language:
output_template += f"\nLanguage: {language}\n"
output_template += f"\nLanguage: {language.capitalize()}\n"

if data_type:
output_template += f"Data type: {data_type}\n"

output_template += f"Total number of lexemes: {total_lexemes}\n"
if doPrint:
output_template += f"Total number of lexemes: {total_lexemes:,}\n"
if do_print:
print(output_template)

return total_lexemes
Expand Down Expand Up @@ -399,7 +402,9 @@ def total_wrapper(
True # flag to check if it's the first data type for the language
)
for dt in data_types:
total_lexemes = get_total_lexemes(lang, dt, False)
total_lexemes = get_total_lexemes(
language=lang, data_type=dt, do_print=False
)
total_lexemes = (
f"{total_lexemes:,}" if total_lexemes is not None else "N/A"
)
Expand All @@ -413,16 +418,16 @@ def total_wrapper(
print()

elif language is not None and data_type is None:
print_total_lexemes(language)
print_total_lexemes(language=language)

elif language is not None and not all_bool:
get_total_lexemes(language, data_type)
get_total_lexemes(language=language, data_type=data_type)

elif language is not None:
print(
f"You have already specified language {language} and data type {data_type} - no need to specify --all."
f"You have already specified language {language.capitalize()} and data type {data_type} - no need to specify --all."
)
get_total_lexemes(language, data_type)
get_total_lexemes(language=language, data_type=data_type)

else:
raise ValueError("Invalid input or missing information")
8 changes: 5 additions & 3 deletions src/scribe_data/unicode/generate_emoji_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,12 @@ def generate_emoji(language, output_dir: str = None):
Path(__file__).parent / "cldr-annotations-full" / "annotations"
)
if iso in os.listdir(path_to_cldr_annotations):
print(f"Emoji Generation for language {language} is supported")
print(f"Emoji Generation for language {language.capitalize()} is supported")

else:
print(f"Emoji Generation for language {language} is not supported")
print(
f"Emoji Generation for language {language.capitalize()} is not supported"
)
return

updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir
Expand All @@ -82,6 +84,6 @@ def generate_emoji(language, output_dir: str = None):
file_path=output_dir,
formatted_data=emoji_keywords_dict,
query_data_in_use=True,
language=language.capitalize(),
language=language,
data_type=DATA_TYPE,
)
Loading
Loading