Skip to content

Commit

Permalink
Merge pull request #502 from SethiShreya/migration
Browse files Browse the repository at this point in the history
Migration - Lowercase the languages once in main, remove the lowercasing from other places, and capitalize for print statements
  • Loading branch information
andrewtavis authored Nov 8, 2024
2 parents 647d1d9 + a6c7fe8 commit d8c3f4c
Show file tree
Hide file tree
Showing 12 changed files with 121 additions and 105 deletions.
6 changes: 3 additions & 3 deletions src/scribe_data/check/check_project_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,15 @@ def get_available_languages() -> dict[str, list[str]]:
for lang_folder in extraction_dir.iterdir():
if lang_folder.is_dir(): # check if it's a directory
lang_name = (
lang_folder.name.lower()
lang_folder.name
) # normalize keys to lowercase for case-insensitive comparison
sub_languages = []

# Check if lang_folder contains subdirectories.
for sub_folder in lang_folder.iterdir():
if sub_folder.is_dir():
sub_lang_name = (
sub_folder.name.lower()
sub_folder.name
) # normalize to lowercase for case-insensitive comparison.

# Check for almost similar keys using difflib.
Expand Down Expand Up @@ -183,7 +183,7 @@ def check_language_metadata():
SystemExit:
If any missing languages or properties are found, the function exits the script with a status code of 1.
"""
languages_in_metadata = {key.lower(): value for key, value in _languages.items()}
languages_in_metadata = {key: value for key, value in _languages.items()}

languages_in_directory = get_available_languages()

Expand Down
20 changes: 9 additions & 11 deletions src/scribe_data/cli/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,17 +73,15 @@ def convert_to_json(
-------
None
"""
normalized_language = language.lower()

if not normalized_language:
if not language:
raise ValueError(f"Language '{language.capitalize()}' is not recognized.")

data_types = [data_type] if isinstance(data_type, str) else data_type

if output_dir is None:
output_dir = DEFAULT_JSON_EXPORT_DIR

json_output_dir = Path(output_dir) / normalized_language.capitalize()
json_output_dir = Path(output_dir) / language.capitalize()
json_output_dir.mkdir(parents=True, exist_ok=True)

for dtype in data_types:
Expand Down Expand Up @@ -159,7 +157,7 @@ def convert_to_json(
f"File '{output_file}' already exists. Overwrite? (y/n): "
)
if user_input.lower() != "y":
print(f"Skipping {normalized_language['language']} - {dtype}")
print(f"Skipping {language['language']} - {dtype}")
continue

try:
Expand Down Expand Up @@ -211,9 +209,7 @@ def convert_to_csv_or_tsv(
-------
None
"""
normalized_language = language.lower()

if not normalized_language:
if not language:
raise ValueError(f"Language '{language.capitalize()}' is not recognized.")

if isinstance(data_type, str):
Expand Down Expand Up @@ -325,7 +321,7 @@ def convert_to_csv_or_tsv(
print(f"Error writing to '{output_file}': {e}")
continue

print(f"Data for {language} {dtype} written to '{output_file}'")
print(f"Data for {language.capitalize()} {dtype} written to '{output_file}'")


# MARK: SQLITE
Expand Down Expand Up @@ -389,7 +385,7 @@ def convert_to_sqlite(

data_to_sqlite(languages, specific_tables)

source_file = f"{get_language_iso(language).upper()}LanguageData.sqlite"
source_file = f"{get_language_iso(language).capitalize()}LanguageData.sqlite"
source_path = input_file.parent / source_file
target_path = output_dir / source_file

Expand Down Expand Up @@ -443,7 +439,9 @@ def convert_wrapper(
None
"""
output_type = output_type.lower()
print(f"Converting data for {language} {data_type} to {output_type}...")
print(
f"Converting data for {language.capitalize()} {data_type.capitalize()} to {output_type}..."
)

# Route the function call to the correct conversion function.
if output_type == "json":
Expand Down
12 changes: 7 additions & 5 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,27 +98,27 @@ def get_data(
# MARK: Get All
if all:
if language:
print(f"Updating all data types for language for {language}")
print(f"Updating all data types for language for {language.capitalize()}")
query_data(
languages=[language],
data_type=None,
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all data types with specified language for {language}."
f"Query completed for all data types with specified language for {language.capitalize()}."
)

elif data_type:
print(f"Updating all languages for data type: {data_type}")
print(f"Updating all languages for data type: {data_type.capitalize()}")
query_data(
languages=None,
data_type=[data_type],
output_dir=output_dir,
overwrite=overwrite,
)
print(
f"Query completed for all languages with specified data type for {data_type}."
f"Query completed for all languages with specified data type for {data_type.capitalize()}."
)

else:
Expand All @@ -142,7 +142,9 @@ def get_data(

elif language or data_type:
data_type = data_type[0] if isinstance(data_type, list) else data_type
print(f"Updating data for language(s): {language}; data type(s): {data_type}")
print(
f"Updating data for language(s): {language.capitalize()}; data type(s): {data_type.capitalize()}"
)
query_data(
languages=languages,
data_type=data_types,
Expand Down
11 changes: 7 additions & 4 deletions src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,8 +280,8 @@ def main() -> None:

else:
get_data(
language=args.language,
data_type=args.data_type,
language=args.language.lower(),
data_type=args.data_type.lower(),
output_type=args.output_type,
output_dir=args.output_dir,
outputs_per_entry=args.outputs_per_entry,
Expand All @@ -292,14 +292,17 @@ def main() -> None:
elif args.command in ["total", "t"]:
if args.interactive:
start_interactive_mode(operation="total")

else:
total_wrapper(
language=args.language, data_type=args.data_type, all_bool=args.all
language=args.language.lower() if args.language is not None else None,
data_type=args.data_type.lower() if args.data_type is not None else None,
all_bool=args.all,
)

elif args.command in ["convert", "c"]:
convert_wrapper(
language=args.language,
language=args.language.lower(),
data_type=args.data_type,
output_type=args.output_type,
input_file=args.input_file,
Expand Down
45 changes: 25 additions & 20 deletions src/scribe_data/cli/total.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,11 @@ def get_qid_by_input(input_str):
The QID corresponding to the input string, or None if not found.
"""
if input_str:
input_str_lower = input_str.lower()
if input_str_lower in language_to_qid:
return language_to_qid[input_str_lower]
if input_str in language_to_qid:
return language_to_qid[input_str]

elif input_str_lower in data_type_metadata:
return data_type_metadata[input_str_lower]
elif input_str in data_type_metadata:
return data_type_metadata[input_str]

return None

Expand Down Expand Up @@ -102,7 +101,7 @@ def get_datatype_list(language):

if not data_types:
raise ValueError(
f"No data types available for sub-languages of '{formatted_language}'."
f"No data types available for sub-languages of '{formatted_language.capitalize()}'."
)

return sorted(set(data_types)) # remove duplicates and sort
Expand All @@ -116,7 +115,7 @@ def get_datatype_list(language):

if not data_types:
raise ValueError(
f"No data types available for language '{formatted_language}'."
f"No data types available for language '{formatted_language.capitalize()}'."
)

return sorted(data_types)
Expand Down Expand Up @@ -184,12 +183,12 @@ def print_total_lexemes(language: str = None):
and language[1:].isdigit()
):
print(
f"Wikidata QID {language} passed. Checking validity and then all data types."
f"Wikidata QID {language.capitalize()} passed. Checking validity and then all data types."
)
language = check_qid_is_language(qid=language)

else:
print(f"Returning total counts for {language} data types...\n")
print(f"Returning total counts for {language.capitalize()} data types...\n")

def print_total_header(language, dt, total_lexemes):
"""
Expand All @@ -210,7 +209,9 @@ def print_total_header(language, dt, total_lexemes):

first_row = True
for dt in data_types:
total_lexemes = get_total_lexemes(lang, dt, False)
total_lexemes = get_total_lexemes(
language=lang, data_type=dt, do_print=False
)
total_lexemes = f"{total_lexemes:,}"
if first_row:
print_total_header(lang, dt, total_lexemes)
Expand All @@ -233,7 +234,9 @@ def print_total_header(language, dt, total_lexemes):
data_types = get_datatype_list(language)

for dt in data_types:
total_lexemes = get_total_lexemes(language, dt, False)
total_lexemes = get_total_lexemes(
language=language, data_type=dt, do_print=False
)
total_lexemes = f"{total_lexemes:,}"
if first_row:
print_total_header(language, dt, total_lexemes)
Expand All @@ -248,7 +251,7 @@ def print_total_header(language, dt, total_lexemes):
# MARK: Get Total


def get_total_lexemes(language, data_type, doPrint=True):
def get_total_lexemes(language, data_type, do_print=True):
"""
Get the total number of lexemes for a given language and data type from Wikidata.
Expand Down Expand Up @@ -344,13 +347,13 @@ def get_total_lexemes(language, data_type, doPrint=True):

output_template = ""
if language:
output_template += f"\nLanguage: {language}\n"
output_template += f"\nLanguage: {language.capitalize()}\n"

if data_type:
output_template += f"Data type: {data_type}\n"

output_template += f"Total number of lexemes: {total_lexemes}\n"
if doPrint:
output_template += f"Total number of lexemes: {total_lexemes:,}\n"
if do_print:
print(output_template)

return total_lexemes
Expand Down Expand Up @@ -399,7 +402,9 @@ def total_wrapper(
True # flag to check if it's the first data type for the language
)
for dt in data_types:
total_lexemes = get_total_lexemes(lang, dt, False)
total_lexemes = get_total_lexemes(
language=lang, data_type=dt, do_print=False
)
total_lexemes = (
f"{total_lexemes:,}" if total_lexemes is not None else "N/A"
)
Expand All @@ -413,16 +418,16 @@ def total_wrapper(
print()

elif language is not None and data_type is None:
print_total_lexemes(language)
print_total_lexemes(language=language)

elif language is not None and not all_bool:
get_total_lexemes(language, data_type)
get_total_lexemes(language=language, data_type=data_type)

elif language is not None:
print(
f"You have already specified language {language} and data type {data_type} - no need to specify --all."
f"You have already specified language {language.capitalize()} and data type {data_type} - no need to specify --all."
)
get_total_lexemes(language, data_type)
get_total_lexemes(language=language, data_type=data_type)

else:
raise ValueError("Invalid input or missing information")
8 changes: 5 additions & 3 deletions src/scribe_data/unicode/generate_emoji_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,12 @@ def generate_emoji(language, output_dir: str = None):
Path(__file__).parent / "cldr-annotations-full" / "annotations"
)
if iso in os.listdir(path_to_cldr_annotations):
print(f"Emoji Generation for language {language} is supported")
print(f"Emoji Generation for language {language.capitalize()} is supported")

else:
print(f"Emoji Generation for language {language} is not supported")
print(
f"Emoji Generation for language {language.capitalize()} is not supported"
)
return

updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir
Expand All @@ -82,6 +84,6 @@ def generate_emoji(language, output_dir: str = None):
file_path=output_dir,
formatted_data=emoji_keywords_dict,
query_data_in_use=True,
language=language.capitalize(),
language=language,
data_type=DATA_TYPE,
)
Loading

0 comments on commit d8c3f4c

Please sign in to comment.