From 6a1f13b295335d31de225d6e7d01f7468d89d497 Mon Sep 17 00:00:00 2001 From: shreya Date: Thu, 31 Oct 2024 23:33:41 +0530 Subject: [PATCH 1/5] lowered the language in main, and remove from other places, but capitalize for print statements --- .../check/check_project_metadata.py | 6 ++--- src/scribe_data/cli/convert.py | 8 +++---- src/scribe_data/cli/get.py | 12 +++++----- src/scribe_data/cli/main.py | 12 ++++++---- src/scribe_data/cli/total.py | 24 +++++++++---------- .../unicode/generate_emoji_keywords.py | 6 ++--- src/scribe_data/utils.py | 24 +++++++++---------- src/scribe_data/wikidata/query_data.py | 8 +++---- 8 files changed, 51 insertions(+), 49 deletions(-) diff --git a/src/scribe_data/check/check_project_metadata.py b/src/scribe_data/check/check_project_metadata.py index 1f22638d..84523ba2 100644 --- a/src/scribe_data/check/check_project_metadata.py +++ b/src/scribe_data/check/check_project_metadata.py @@ -50,7 +50,7 @@ def get_available_languages() -> dict[str, list[str]]: for lang_folder in extraction_dir.iterdir(): if lang_folder.is_dir(): # check if it's a directory lang_name = ( - lang_folder.name.lower() + lang_folder.name ) # normalize keys to lowercase for case-insensitive comparison sub_languages = [] @@ -58,7 +58,7 @@ def get_available_languages() -> dict[str, list[str]]: for sub_folder in lang_folder.iterdir(): if sub_folder.is_dir(): sub_lang_name = ( - sub_folder.name.lower() + sub_folder.name ) # normalize to lowercase for case-insensitive comparison. # Check for almost similar keys using difflib. @@ -183,7 +183,7 @@ def check_language_metadata(): SystemExit: If any missing languages or properties are found, the function exits the script with a status code of 1. """ - languages_in_metadata = {key.lower(): value for key, value in _languages.items()} + languages_in_metadata = {key: value for key, value in _languages.items()} languages_in_directory = get_available_languages() diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 44cfa46b..9ba1147c 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -73,7 +73,7 @@ def convert_to_json( ------- None """ - normalized_language = language.lower() + normalized_language = language if not normalized_language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") @@ -211,7 +211,7 @@ def convert_to_csv_or_tsv( ------- None """ - normalized_language = language.lower() + normalized_language = language if not normalized_language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") @@ -325,7 +325,7 @@ def convert_to_csv_or_tsv( print(f"Error writing to '{output_file}': {e}") continue - print(f"Data for {language} {dtype} written to '{output_file}'") + print(f"Data for {language.capitalize()} {dtype} written to '{output_file}'") # MARK: SQLITE @@ -443,7 +443,7 @@ def convert_wrapper( None """ output_type = output_type.lower() - print(f"Converting data for {language} {data_type} to {output_type}...") + print(f"Converting data for {language.capitalize()} {data_type.capitalize()} to {output_type}...") # Route the function call to the correct conversion function. if output_type == "json": diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 6a0e0426..b0da0b65 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -98,7 +98,7 @@ def get_data( # MARK: Get All if all: if language: - print(f"Updating all data types for language for {language}") + print(f"Updating all data types for language for {language.capitalize()}") query_data( languages=[language], data_type=None, @@ -106,11 +106,11 @@ def get_data( overwrite=overwrite, ) print( - f"Query completed for all data types with specified language for {language}." + f"Query completed for all data types with specified language for {language.capitalize()}." ) elif data_type: - print(f"Updating all languages for data type: {data_type}") + print(f"Updating all languages for data type: {data_type.capitalize()}") query_data( languages=None, data_type=[data_type], @@ -118,7 +118,7 @@ def get_data( overwrite=overwrite, ) print( - f"Query completed for all languages with specified data type for {data_type}." + f"Query completed for all languages with specified data type for {data_type.capitalize()}." ) else: @@ -142,7 +142,7 @@ def get_data( elif language or data_type: data_type = data_type[0] if isinstance(data_type, list) else data_type - print(f"Updating data for language(s): {language}; data type(s): {data_type}") + print(f"Updating data for language(s): {language.capitalize()}; data type(s): {data_type.capitalize()}") query_data( languages=languages, data_type=data_types, @@ -167,7 +167,7 @@ def get_data( ): print(f"Updated data was saved in: {Path(output_dir).resolve()}.") - json_input_path = Path(output_dir) / f"{language}/{data_type}.json" + json_input_path = Path(output_dir) / f"{language.capitalize()}/{data_type}.json" # Proceed with conversion only if the output type is not JSON. if output_type != "json": diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 313ab74d..64643bff 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -277,8 +277,8 @@ def main() -> None: else: get_data( - language=args.language, - data_type=args.data_type, + language=args.language.lower(), + data_type=args.data_type.lower(), output_type=args.output_type, output_dir=args.output_dir, outputs_per_entry=args.outputs_per_entry, @@ -288,12 +288,14 @@ def main() -> None: elif args.command in ["total", "t"]: total_wrapper( - language=args.language, data_type=args.data_type, all_bool=args.all - ) + language=args.language.lower() if args.language is not None else None, + data_type=args.data_type.lower() if args.data_type is not None else None, + all_bool=args.all + ) elif args.command in ["convert", "c"]: convert_wrapper( - language=args.language, + language=args.language.lower(), data_type=args.data_type, output_type=args.output_type, input_file=args.input_file, diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 466f0c73..b131b745 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -53,7 +53,7 @@ def get_qid_by_input(input_str): The QID corresponding to the input string, or- None if not found. """ if input_str: - input_str_lower = input_str.lower() + input_str_lower = input_str if input_str_lower in language_to_qid: return language_to_qid[input_str_lower] @@ -79,14 +79,14 @@ def get_datatype_list(language): """ languages = list_all_languages(language_metadata) - if language.lower() in languages: - language_data = language_map.get(language.lower()) + if language in languages: + language_data = language_map.get(language) languages = format_sublanguage_name(language, language_metadata) language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language if not language_data: - raise ValueError(f"Language '{language}' is not recognized.") - + raise ValueError(f"Language '{language.capitalize()}' is not recognized.") + data_types = [f.name for f in language_dir.iterdir() if f.is_dir()] if not data_types: raise ValueError( @@ -164,14 +164,14 @@ def print_total_lexemes(language: str = None): and language[1:].isdigit() ): print( - f"Wikidata QID {language} passed. Checking validity and then all data types." + f"Wikidata QID {language.capitalize()} passed. Checking validity and then all data types." ) language = check_qid_is_language(qid=language) else: - print(f"Returning total counts for {language} data types...\n") + print(f"Returning total counts for {language.capitalize()} data types...\n") - def print_total_header(): + def print_total_header(language, dt, total_lexemes): """ Prints the header of the total command output. """ @@ -192,7 +192,7 @@ def print_total_header(): total_lexemes = get_total_lexemes(lang, dt, False) total_lexemes = f"{total_lexemes:,}" if first_row: - print_total_header() + print_total_header(lang, dt, total_lexemes) first_row = False else: @@ -215,7 +215,7 @@ def print_total_header(): total_lexemes = get_total_lexemes(language, dt, False) total_lexemes = f"{total_lexemes:,}" if first_row: - print_total_header() + print_total_header(language, dt, total_lexemes) first_row = False else: @@ -323,7 +323,7 @@ def get_total_lexemes(language, data_type, doPrint=True): output_template = "" if language: - output_template += f"\nLanguage: {language}\n" + output_template += f"\nLanguage: {language.capitalize()}\n" if data_type: output_template += f"Data type: {data_type}\n" @@ -371,7 +371,7 @@ def total_wrapper( elif language is not None: print( - f"You have already specified language {language} and data type {data_type} - no need to specify --all." + f"You have already specified language {language.capitalize()} and data type {data_type} - no need to specify --all." ) get_total_lexemes(language, data_type) diff --git a/src/scribe_data/unicode/generate_emoji_keywords.py b/src/scribe_data/unicode/generate_emoji_keywords.py index beb34257..fd40a8d4 100644 --- a/src/scribe_data/unicode/generate_emoji_keywords.py +++ b/src/scribe_data/unicode/generate_emoji_keywords.py @@ -64,10 +64,10 @@ def generate_emoji(language, output_dir: str = None): Path(__file__).parent / "cldr-annotations-full" / "annotations" ) if iso in os.listdir(path_to_cldr_annotations): - print(f"Emoji Generation for language {language} is supported") + print(f"Emoji Generation for language {language.capitalize()} is supported") else: - print(f"Emoji Generation for language {language} is not supported") + print(f"Emoji Generation for language {language.capitalize()} is not supported") return updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir @@ -82,6 +82,6 @@ def generate_emoji(language, output_dir: str = None): file_path=output_dir, formatted_data=emoji_keywords_dict, query_data_in_use=True, - language=language.capitalize(), + language=language, data_type=DATA_TYPE, ) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 2d8b5b71..3bf9b119 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -76,15 +76,15 @@ # Process each language and its potential sub-languages in one pass. for lang, lang_data in language_metadata.items(): - lang_lower = lang.lower() + lang_lower = lang if "sub_languages" in lang_data: for sub_lang, sub_lang_data in lang_data["sub_languages"].items(): - sub_lang_lower = sub_lang.lower() + sub_lang_lower = sub_lang sub_qid = sub_lang_data.get("qid") if sub_qid is None: - print(f"Warning: 'qid' missing for sub-language {sub_lang} of {lang}") + print(f"Warning: 'qid' missing for sub-language {sub_lang.capitalize()} of {lang.capitalize()}") else: language_map[sub_lang_lower] = sub_lang_data @@ -93,7 +93,7 @@ else: qid = lang_data.get("qid") if qid is None: - print(f"Warning: 'qid' missing for language {lang}") + print(f"Warning: 'qid' missing for language {lang.capitalize()}") else: language_map[lang_lower] = lang_data @@ -157,23 +157,23 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - """ # Check if we're searching by language name. if source_key == "language": - norm_source_value = source_value.lower() + norm_source_value = source_value # First, check the main language entries (e.g., mandarin, french, etc.). for language, entry in _languages.items(): # If the language name matches the top-level key, return the target value. - if language.lower() == norm_source_value: + if language == norm_source_value: if "sub_languages" in entry: sub_languages = ", ".join(entry["sub_languages"].keys()) raise ValueError( - f"'{language}' has sub-languages, but is not queryable directly. Available sub-languages: {sub_languages}" + f"'{language.capitalize()}' has sub-languages, but is not queryable directly. Available sub-languages: {sub_languages.capitalize()}" ) return entry.get(target_key) # If there are sub-languages, check them too. if "sub_languages" in entry: for sub_language, sub_entry in entry["sub_languages"].items(): - if sub_language.lower() == norm_source_value: + if sub_language == norm_source_value: return sub_entry.get(target_key) # If no match was found, raise an error. @@ -311,14 +311,14 @@ def export_formatted_data( ------- None """ - export_path = Path(file_path) / language / f"{data_type.replace('-', '_')}.json" + export_path = Path(file_path) / language.capitalize() / f"{data_type.replace('-', '_')}.json" with open(export_path, "w", encoding="utf-8") as file: json.dump(formatted_data, file, ensure_ascii=False, indent=0) file.write("\n") print( - f"Wrote file {language}/{data_type.replace('-', '_')}.json with {len(formatted_data):,} {data_type}." + f"Wrote file {language.capitalize()}/{data_type.replace('-', '_')}.json with {len(formatted_data):,} {data_type}." ) @@ -581,14 +581,14 @@ def format_sublanguage_name(lang, language_metadata=_languages): """ for main_lang, lang_data in language_metadata.items(): # If it's not a sub-language, return the original name. - if main_lang == lang.lower(): + if main_lang == lang: return lang # Check if the main language has sub-languages. if "sub_languages" in lang_data: # Check if the provided language is a sub-language. for sub_lang in lang_data["sub_languages"]: - if lang.lower() == sub_lang.lower(): + if lang == sub_lang: # Return the formatted name MAIN_LANG/SUB_LANG. return f"{main_lang}/{sub_lang}" diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index ced66272..b88dfc71 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -113,7 +113,7 @@ def query_data( # Assign current_languages and current_data_type if no arguments have been passed. languages_update = current_languages if languages is None else languages - languages_update = [lang.lower() for lang in languages_update] + languages_update = [lang for lang in languages_update] data_type_update = current_data_type if data_type is None else data_type all_language_data_extraction_files = [ @@ -169,7 +169,7 @@ def query_data( else: if not interactive: print( - f"\nExisting file(s) found for {lang} {target_type} in the {output_dir} directory:\n" + f"\nExisting file(s) found for {lang.capitalize()} {target_type} in the {output_dir} directory:\n" ) for i, file in enumerate(existing_files, 1): print(f"{i}. {file.name}") @@ -192,10 +192,10 @@ def query_data( # file_name = f"{target_type}_{timestamp}.json" else: - print(f"Skipping update for {lang} {target_type}.") + print(f"Skipping update for {lang.capitalize()} {target_type}.") break - print(f"Querying and formatting {lang} {target_type}") + print(f"Querying and formatting {lang.capitalize()} {target_type}") # Mark the query as the first in a set of queries if needed. if not q.exists(): From 89f4036c19dc0af748258d7ce4a82f7ef9ae7c2b Mon Sep 17 00:00:00 2001 From: shreya Date: Thu, 31 Oct 2024 23:34:23 +0530 Subject: [PATCH 2/5] lowered the language in main, and remove from other places, but capitalize for print statements --- src/scribe_data/cli/convert.py | 4 +++- src/scribe_data/cli/get.py | 4 +++- src/scribe_data/cli/main.py | 8 +++++--- src/scribe_data/cli/total.py | 2 +- src/scribe_data/unicode/generate_emoji_keywords.py | 4 +++- src/scribe_data/utils.py | 8 ++++++-- 6 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 9ba1147c..eee3862b 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -443,7 +443,9 @@ def convert_wrapper( None """ output_type = output_type.lower() - print(f"Converting data for {language.capitalize()} {data_type.capitalize()} to {output_type}...") + print( + f"Converting data for {language.capitalize()} {data_type.capitalize()} to {output_type}..." + ) # Route the function call to the correct conversion function. if output_type == "json": diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index b0da0b65..b45fc477 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -142,7 +142,9 @@ def get_data( elif language or data_type: data_type = data_type[0] if isinstance(data_type, list) else data_type - print(f"Updating data for language(s): {language.capitalize()}; data type(s): {data_type.capitalize()}") + print( + f"Updating data for language(s): {language.capitalize()}; data type(s): {data_type.capitalize()}" + ) query_data( languages=languages, data_type=data_types, diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 64643bff..7286b03f 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -289,9 +289,11 @@ def main() -> None: elif args.command in ["total", "t"]: total_wrapper( language=args.language.lower() if args.language is not None else None, - data_type=args.data_type.lower() if args.data_type is not None else None, - all_bool=args.all - ) + data_type=args.data_type.lower() + if args.data_type is not None + else None, + all_bool=args.all, + ) elif args.command in ["convert", "c"]: convert_wrapper( diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index b131b745..40f4f682 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -86,7 +86,7 @@ def get_datatype_list(language): if not language_data: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") - + data_types = [f.name for f in language_dir.iterdir() if f.is_dir()] if not data_types: raise ValueError( diff --git a/src/scribe_data/unicode/generate_emoji_keywords.py b/src/scribe_data/unicode/generate_emoji_keywords.py index fd40a8d4..a2f17d18 100644 --- a/src/scribe_data/unicode/generate_emoji_keywords.py +++ b/src/scribe_data/unicode/generate_emoji_keywords.py @@ -67,7 +67,9 @@ def generate_emoji(language, output_dir: str = None): print(f"Emoji Generation for language {language.capitalize()} is supported") else: - print(f"Emoji Generation for language {language.capitalize()} is not supported") + print( + f"Emoji Generation for language {language.capitalize()} is not supported" + ) return updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 3bf9b119..f0a16340 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -84,7 +84,9 @@ sub_qid = sub_lang_data.get("qid") if sub_qid is None: - print(f"Warning: 'qid' missing for sub-language {sub_lang.capitalize()} of {lang.capitalize()}") + print( + f"Warning: 'qid' missing for sub-language {sub_lang.capitalize()} of {lang.capitalize()}" + ) else: language_map[sub_lang_lower] = sub_lang_data @@ -311,7 +313,9 @@ def export_formatted_data( ------- None """ - export_path = Path(file_path) / language.capitalize() / f"{data_type.replace('-', '_')}.json" + export_path = ( + Path(file_path) / language.capitalize() / f"{data_type.replace('-', '_')}.json" + ) with open(export_path, "w", encoding="utf-8") as file: json.dump(formatted_data, file, ensure_ascii=False, indent=0) From 02df98651e8e0c2522b4ae940f76cc4cb6115521 Mon Sep 17 00:00:00 2001 From: shreya Date: Fri, 1 Nov 2024 19:20:25 +0530 Subject: [PATCH 3/5] lowered the output directory name --- src/scribe_data/cli/get.py | 2 +- src/scribe_data/utils.py | 6 +++--- .../language_data_extraction/english/verbs/format_verbs.py | 3 ++- src/scribe_data/wikidata/query_data.py | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index b45fc477..c02908aa 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -169,7 +169,7 @@ def get_data( ): print(f"Updated data was saved in: {Path(output_dir).resolve()}.") - json_input_path = Path(output_dir) / f"{language.capitalize()}/{data_type}.json" + json_input_path = Path(output_dir) / f"{language}/{data_type}.json" # Proceed with conversion only if the output type is not JSON. if output_type != "json": diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index f0a16340..5ce3639c 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -279,7 +279,7 @@ def load_queried_data( tuple(Any, str) A tuple containing the loaded data and the path to the data file. """ - data_path = Path(file_path) / language / f"{data_type}.json" + data_path = Path(file_path) / language.lower() / f"{data_type}.json" with open(data_path, encoding="utf-8") as f: return json.load(f), data_path @@ -314,7 +314,7 @@ def export_formatted_data( None """ export_path = ( - Path(file_path) / language.capitalize() / f"{data_type.replace('-', '_')}.json" + Path(file_path) / language.lower() / f"{data_type.replace('-', '_')}.json" ) with open(export_path, "w", encoding="utf-8") as file: @@ -322,7 +322,7 @@ def export_formatted_data( file.write("\n") print( - f"Wrote file {language.capitalize()}/{data_type.replace('-', '_')}.json with {len(formatted_data):,} {data_type}." + f"Wrote file {language.lower()}/{data_type.replace('-', '_')}.json with {len(formatted_data):,} {data_type}." ) diff --git a/src/scribe_data/wikidata/language_data_extraction/english/verbs/format_verbs.py b/src/scribe_data/wikidata/language_data_extraction/english/verbs/format_verbs.py index b9983352..3414ae07 100644 --- a/src/scribe_data/wikidata/language_data_extraction/english/verbs/format_verbs.py +++ b/src/scribe_data/wikidata/language_data_extraction/english/verbs/format_verbs.py @@ -25,9 +25,10 @@ from scribe_data.utils import export_formatted_data, load_queried_data -LANGUAGE = "English" +LANGUAGE = "english" DATA_TYPE = "verbs" + parser = argparse.ArgumentParser() parser.add_argument("--file-path") args = parser.parse_args() diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index b88dfc71..54b70ee1 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -154,7 +154,7 @@ def query_data( target_type = q.parent.name updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir - export_dir = Path(updated_path) / lang.capitalize() + export_dir = Path(updated_path) / lang export_dir.mkdir(parents=True, exist_ok=True) file_name = f"{target_type}.json" From 24c0e8f4395b572ba475b70976d5df3ba7569b51 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Fri, 8 Nov 2024 17:45:45 +0100 Subject: [PATCH 4/5] Fix tests now that utils args are lower and make cli fxn args explicit --- src/scribe_data/cli/convert.py | 2 +- src/scribe_data/cli/total.py | 24 +++++++++++++++--------- src/scribe_data/utils.py | 31 ++++++++++++++++--------------- tests/cli/test_list.py | 2 +- tests/cli/test_total.py | 24 +++++++++++++----------- tests/load/test_update_utils.py | 28 ++++++++++++++-------------- 6 files changed, 60 insertions(+), 51 deletions(-) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index eee3862b..569e6e2b 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -389,7 +389,7 @@ def convert_to_sqlite( data_to_sqlite(languages, specific_tables) - source_file = f"{get_language_iso(language).upper()}LanguageData.sqlite" + source_file = f"{get_language_iso(language).capitalize()}LanguageData.sqlite" source_path = input_file.parent / source_file target_path = output_dir / source_file diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index 58dc1aba..eeafdf15 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -210,7 +210,9 @@ def print_total_header(language, dt, total_lexemes): first_row = True for dt in data_types: - total_lexemes = get_total_lexemes(lang, dt, False) + total_lexemes = get_total_lexemes( + language=lang, data_type=dt, do_print=False + ) total_lexemes = f"{total_lexemes:,}" if first_row: print_total_header(lang, dt, total_lexemes) @@ -233,7 +235,9 @@ def print_total_header(language, dt, total_lexemes): data_types = get_datatype_list(language) for dt in data_types: - total_lexemes = get_total_lexemes(language, dt, False) + total_lexemes = get_total_lexemes( + language=language, data_type=dt, do_print=False + ) total_lexemes = f"{total_lexemes:,}" if first_row: print_total_header(language, dt, total_lexemes) @@ -248,7 +252,7 @@ def print_total_header(language, dt, total_lexemes): # MARK: Get Total -def get_total_lexemes(language, data_type, doPrint=True): +def get_total_lexemes(language, data_type, do_print=True): """ Get the total number of lexemes for a given language and data type from Wikidata. @@ -349,8 +353,8 @@ def get_total_lexemes(language, data_type, doPrint=True): if data_type: output_template += f"Data type: {data_type}\n" - output_template += f"Total number of lexemes: {total_lexemes}\n" - if doPrint: + output_template += f"Total number of lexemes: {total_lexemes:,}\n" + if do_print: print(output_template) return total_lexemes @@ -399,7 +403,9 @@ def total_wrapper( True # flag to check if it's the first data type for the language ) for dt in data_types: - total_lexemes = get_total_lexemes(lang, dt, False) + total_lexemes = get_total_lexemes( + language=lang, data_type=dt, do_print=False + ) total_lexemes = ( f"{total_lexemes:,}" if total_lexemes is not None else "N/A" ) @@ -413,16 +419,16 @@ def total_wrapper( print() elif language is not None and data_type is None: - print_total_lexemes(language) + print_total_lexemes(language=language) elif language is not None and not all_bool: - get_total_lexemes(language, data_type) + get_total_lexemes(language=language, data_type=data_type) elif language is not None: print( f"You have already specified language {language.capitalize()} and data type {data_type} - no need to specify --all." ) - get_total_lexemes(language, data_type) + get_total_lexemes(language=language, data_type=data_type) else: raise ValueError("Invalid input or missing information") diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 5ce3639c..08194a63 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -159,23 +159,24 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - """ # Check if we're searching by language name. if source_key == "language": - norm_source_value = source_value - # First, check the main language entries (e.g., mandarin, french, etc.). for language, entry in _languages.items(): # If the language name matches the top-level key, return the target value. - if language == norm_source_value: + if language == source_value: if "sub_languages" in entry: - sub_languages = ", ".join(entry["sub_languages"].keys()) + sub_languages = entry["sub_languages"].keys() + sub_languages = ", ".join( + lang.capitalize() for lang in sub_languages + ) raise ValueError( - f"'{language.capitalize()}' has sub-languages, but is not queryable directly. Available sub-languages: {sub_languages.capitalize()}" + f"'{language.capitalize()}' has sub-languages, but is not queryable directly. Available sub-languages: {sub_languages}" ) return entry.get(target_key) # If there are sub-languages, check them too. if "sub_languages" in entry: for sub_language, sub_entry in entry["sub_languages"].items(): - if sub_language == norm_source_value: + if sub_language == source_value: return sub_entry.get(target_key) # If no match was found, raise an error. @@ -197,10 +198,10 @@ def get_language_qid(language: str) -> str: The Wikidata QID for the language. """ return _find( - "language", - language, - "qid", - f"{language.upper()} is currently not a supported language for QID conversion.", + source_key="language", + source_value=language, + target_key="qid", + error_msg=f"{language.capitalize()} is currently not a supported language for QID conversion.", ) @@ -220,10 +221,10 @@ def get_language_iso(language: str) -> str: """ return _find( - "language", - language, - "iso", - f"{language.upper()} is currently not a supported language for ISO conversion.", + source_key="language", + source_value=language, + target_key="iso", + error_msg=f"{language.capitalize()} is currently not a supported language for ISO conversion.", ) @@ -597,7 +598,7 @@ def format_sublanguage_name(lang, language_metadata=_languages): return f"{main_lang}/{sub_lang}" # Raise ValueError if no match is found. - raise ValueError(f"{lang.upper()} is not a valid language or sub-language.") + raise ValueError(f"{lang.capitalize()} is not a valid language or sub-language.") def list_all_languages(language_metadata=_languages): diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py index 1e4d708a..16f34394 100644 --- a/tests/cli/test_list.py +++ b/tests/cli/test_list.py @@ -88,7 +88,7 @@ def test_list_data_types_all_languages(self, mock_print): @patch("builtins.print") def test_list_data_types_specific_language(self, mock_print): - list_data_types("English") + list_data_types("english") expected_calls = [ call(), diff --git a/tests/cli/test_total.py b/tests/cli/test_total.py index 0bbe340f..a8145f04 100644 --- a/tests/cli/test_total.py +++ b/tests/cli/test_total.py @@ -46,7 +46,7 @@ def test_get_total_lexemes_valid(self, mock_query, mock_get_qid): mock_query.return_value = mock_results with patch("builtins.print") as mock_print: - get_total_lexemes("English", "nouns") + get_total_lexemes(language="English", data_type="nouns") mock_print.assert_called_once_with( "\nLanguage: English\nData type: nouns\nTotal number of lexemes: 42\n" @@ -63,7 +63,7 @@ def test_get_total_lexemes_no_results(self, mock_query, mock_get_qid): mock_query.return_value = mock_results with patch("builtins.print") as mock_print: - get_total_lexemes("English", "nouns") + get_total_lexemes(language="English", data_type="nouns") mock_print.assert_called_once_with("Total number of lexemes: Not found") @@ -74,7 +74,7 @@ def test_get_total_lexemes_invalid_language(self, mock_query, mock_get_qid): mock_query.return_value = MagicMock() with patch("builtins.print") as mock_print: - get_total_lexemes("InvalidLanguage", "nouns") + get_total_lexemes(language="InvalidLanguage", data_type="nouns") mock_print.assert_called_once_with("Total number of lexemes: Not found") @@ -86,7 +86,7 @@ def test_get_total_lexemes_empty_and_none_inputs(self, mock_query, mock_get_qid) # Call the function with empty and None inputs with patch("builtins.print") as mock_print: - get_total_lexemes("", "nouns") + get_total_lexemes(language="", data_type="nouns") get_total_lexemes(None, "verbs") expected_calls = [ @@ -102,7 +102,7 @@ def test_get_total_lexemes_nonexistent_language(self, mock_query, mock_get_qid): mock_query.return_value = MagicMock() with patch("builtins.print") as mock_print: - get_total_lexemes("Martian", "nouns") + get_total_lexemes(language="Martian", data_type="nouns") mock_print.assert_called_once_with("Total number of lexemes: Not found") @@ -123,8 +123,8 @@ def test_get_total_lexemes_various_data_types(self, mock_query, mock_get_qid): # Call the function with different data types with patch("builtins.print") as mock_print: - get_total_lexemes("English", "verbs") - get_total_lexemes("English", "nouns") + get_total_lexemes(language="English", data_type="verbs") + get_total_lexemes(language="English", data_type="nouns") expected_calls = [ call( @@ -159,8 +159,8 @@ def test_get_total_lexemes_sub_languages(self, mock_dir, mock_query, mock_get_qi ] with patch("builtins.print") as mock_print: - get_total_lexemes("Norwegian", "verbs") - get_total_lexemes("Norwegian", "nouns") + get_total_lexemes(language="Norwegian", data_type="verbs") + get_total_lexemes(language="Norwegian", data_type="nouns") expected_calls = [ call( @@ -250,12 +250,14 @@ def test_total_wrapper_all_bool(self, mock_print_total_lexemes): @patch("scribe_data.cli.total.print_total_lexemes") def test_total_wrapper_language_only(self, mock_print_total_lexemes): total_wrapper(language="English") - mock_print_total_lexemes.assert_called_once_with("English") + mock_print_total_lexemes.assert_called_once_with(language="English") @patch("scribe_data.cli.total.get_total_lexemes") def test_total_wrapper_language_and_data_type(self, mock_get_total_lexemes): total_wrapper(language="English", data_type="nouns") - mock_get_total_lexemes.assert_called_once_with("English", "nouns") + mock_get_total_lexemes.assert_called_once_with( + language="English", data_type="nouns" + ) def test_total_wrapper_invalid_input(self): with self.assertRaises(ValueError): diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py index 264fe94c..8a1e4f2f 100644 --- a/tests/load/test_update_utils.py +++ b/tests/load/test_update_utils.py @@ -33,11 +33,11 @@ @pytest.mark.parametrize( "language, qid_code", [ - ("English", "Q1860"), + ("english", "Q1860"), ("french", "Q150"), - ("GERMAN", "Q188"), - ("iTalian", "Q652"), - ("poRTUGuese", "Q5146"), + ("german", "Q188"), + ("italian", "Q652"), + ("portuguese", "Q5146"), ("russian", "Q7737"), ("spanish", "Q1321"), ("swedish", "Q9027"), @@ -54,21 +54,21 @@ def test_get_language_qid_negative(): assert ( str(excp.value) - == "NEWSPEAK is currently not a supported language for QID conversion." + == "Newspeak is currently not a supported language for QID conversion." ) @pytest.mark.parametrize( "language, iso_code", [ - ("English", "en"), + ("english", "en"), ("french", "fr"), - ("GERMAN", "de"), - ("iTalian", "it"), - ("poRTUGuese", "pt"), + ("german", "de"), + ("italian", "it"), + ("portuguese", "pt"), ("russian", "ru"), ("spanish", "es"), - ("SwedisH", "sv"), + ("swedish", "sv"), ("bokmål", "nb"), ], ) @@ -78,11 +78,11 @@ def test_get_language_iso_positive(language, iso_code): def test_get_language_iso_negative(): with pytest.raises(ValueError) as excp: - _ = utils.get_language_iso("gibberish") + _ = utils.get_language_iso("Gibberish") assert ( str(excp.value) - == "GIBBERISH is currently not a supported language for ISO conversion." + == "Gibberish is currently not a supported language for ISO conversion." ) @@ -125,9 +125,9 @@ def test_format_sublanguage_name_positive(lang, expected_output): def test_format_sublanguage_name_negative(): with pytest.raises(ValueError) as excp: - _ = utils.format_sublanguage_name("soccer") + _ = utils.format_sublanguage_name("Silence") - assert str(excp.value) == "SOCCER is not a valid language or sub-language." + assert str(excp.value) == "Silence is not a valid language or sub-language." def test_list_all_languages(): From a6c7fe8ce7609d5fbf060df472c10f3f7a10748d Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Fri, 8 Nov 2024 17:53:44 +0100 Subject: [PATCH 5/5] Removing assignments that used to have .lower() --- src/scribe_data/cli/convert.py | 12 ++++-------- src/scribe_data/cli/total.py | 9 ++++----- src/scribe_data/utils.py | 6 ++---- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 569e6e2b..1ef07061 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -73,9 +73,7 @@ def convert_to_json( ------- None """ - normalized_language = language - - if not normalized_language: + if not language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") data_types = [data_type] if isinstance(data_type, str) else data_type @@ -83,7 +81,7 @@ def convert_to_json( if output_dir is None: output_dir = DEFAULT_JSON_EXPORT_DIR - json_output_dir = Path(output_dir) / normalized_language.capitalize() + json_output_dir = Path(output_dir) / language.capitalize() json_output_dir.mkdir(parents=True, exist_ok=True) for dtype in data_types: @@ -159,7 +157,7 @@ def convert_to_json( f"File '{output_file}' already exists. Overwrite? (y/n): " ) if user_input.lower() != "y": - print(f"Skipping {normalized_language['language']} - {dtype}") + print(f"Skipping {language['language']} - {dtype}") continue try: @@ -211,9 +209,7 @@ def convert_to_csv_or_tsv( ------- None """ - normalized_language = language - - if not normalized_language: + if not language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") if isinstance(data_type, str): diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index eeafdf15..e0619695 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -53,12 +53,11 @@ def get_qid_by_input(input_str): The QID corresponding to the input string, or- None if not found. """ if input_str: - input_str_lower = input_str - if input_str_lower in language_to_qid: - return language_to_qid[input_str_lower] + if input_str in language_to_qid: + return language_to_qid[input_str] - elif input_str_lower in data_type_metadata: - return data_type_metadata[input_str_lower] + elif input_str in data_type_metadata: + return data_type_metadata[input_str] return None diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 08194a63..fbd3db2b 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -76,8 +76,6 @@ # Process each language and its potential sub-languages in one pass. for lang, lang_data in language_metadata.items(): - lang_lower = lang - if "sub_languages" in lang_data: for sub_lang, sub_lang_data in lang_data["sub_languages"].items(): sub_lang_lower = sub_lang @@ -98,8 +96,8 @@ print(f"Warning: 'qid' missing for language {lang.capitalize()}") else: - language_map[lang_lower] = lang_data - language_to_qid[lang_lower] = qid + language_map[lang] = lang_data + language_to_qid[lang] = qid def _load_json(package_path: str, file_name: str) -> Any: