Merge pull request #502 from SethiShreya/migration

Migration - Lowercase the languages once in main, remove the lowercasing from other places, and capitalize them for print statements
scribe-org · Nov 8, 2024 · d8c3f4c · d8c3f4c
2 parents 647d1d9 + a6c7fe8
commit d8c3f4c
Show file tree

Hide file tree

Showing 12 changed files with 121 additions and 105 deletions.
diff --git a/src/scribe_data/check/check_project_metadata.py b/src/scribe_data/check/check_project_metadata.py
@@ -50,15 +50,15 @@ def get_available_languages() -> dict[str, list[str]]:
     for lang_folder in extraction_dir.iterdir():
         if lang_folder.is_dir():  # check if it's a directory
             lang_name = (
-                lang_folder.name.lower()
+                lang_folder.name
             )  # normalize keys to lowercase for case-insensitive comparison
             sub_languages = []
 
             # Check if lang_folder contains subdirectories.
             for sub_folder in lang_folder.iterdir():
                 if sub_folder.is_dir():
                     sub_lang_name = (
-                        sub_folder.name.lower()
+                        sub_folder.name
                     )  # normalize to lowercase for case-insensitive comparison.
 
                     # Check for almost similar keys using difflib.
@@ -183,7 +183,7 @@ def check_language_metadata():
         SystemExit:
             If any missing languages or properties are found, the function exits the script with a status code of 1.
     """
-    languages_in_metadata = {key.lower(): value for key, value in _languages.items()}
+    languages_in_metadata = {key: value for key, value in _languages.items()}
 
     languages_in_directory = get_available_languages()
 

diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py
@@ -73,17 +73,15 @@ def convert_to_json(
     -------
         None
     """
-    normalized_language = language.lower()
-
-    if not normalized_language:
+    if not language:
         raise ValueError(f"Language '{language.capitalize()}' is not recognized.")
 
     data_types = [data_type] if isinstance(data_type, str) else data_type
 
     if output_dir is None:
         output_dir = DEFAULT_JSON_EXPORT_DIR
 
-    json_output_dir = Path(output_dir) / normalized_language.capitalize()
+    json_output_dir = Path(output_dir) / language.capitalize()
     json_output_dir.mkdir(parents=True, exist_ok=True)
 
     for dtype in data_types:
@@ -159,7 +157,7 @@ def convert_to_json(
                 f"File '{output_file}' already exists. Overwrite? (y/n): "
             )
             if user_input.lower() != "y":
-                print(f"Skipping {normalized_language['language']} - {dtype}")
+                print(f"Skipping {language['language']} - {dtype}")
                 continue
 
         try:
@@ -211,9 +209,7 @@ def convert_to_csv_or_tsv(
     -------
         None
     """
-    normalized_language = language.lower()
-
-    if not normalized_language:
+    if not language:
         raise ValueError(f"Language '{language.capitalize()}' is not recognized.")
 
     if isinstance(data_type, str):
@@ -325,7 +321,7 @@ def convert_to_csv_or_tsv(
             print(f"Error writing to '{output_file}': {e}")
             continue
 
-        print(f"Data for {language} {dtype} written to '{output_file}'")
+        print(f"Data for {language.capitalize()} {dtype} written to '{output_file}'")
 
 
 # MARK: SQLITE
@@ -389,7 +385,7 @@ def convert_to_sqlite(
 
     data_to_sqlite(languages, specific_tables)
 
-    source_file = f"{get_language_iso(language).upper()}LanguageData.sqlite"
+    source_file = f"{get_language_iso(language).capitalize()}LanguageData.sqlite"
     source_path = input_file.parent / source_file
     target_path = output_dir / source_file
 
@@ -443,7 +439,9 @@ def convert_wrapper(
     None
     """
     output_type = output_type.lower()
-    print(f"Converting data for {language} {data_type} to {output_type}...")
+    print(
+        f"Converting data for {language.capitalize()} {data_type.capitalize()} to {output_type}..."
+    )
 
     # Route the function call to the correct conversion function.
     if output_type == "json":

diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py
@@ -98,27 +98,27 @@ def get_data(
     # MARK: Get All
     if all:
         if language:
-            print(f"Updating all data types for language for {language}")
+            print(f"Updating all data types for language for {language.capitalize()}")
             query_data(
                 languages=[language],
                 data_type=None,
                 output_dir=output_dir,
                 overwrite=overwrite,
             )
             print(
-                f"Query completed for all data types with specified language for {language}."
+                f"Query completed for all data types with specified language for {language.capitalize()}."
             )
 
         elif data_type:
-            print(f"Updating all languages for data type: {data_type}")
+            print(f"Updating all languages for data type: {data_type.capitalize()}")
             query_data(
                 languages=None,
                 data_type=[data_type],
                 output_dir=output_dir,
                 overwrite=overwrite,
             )
             print(
-                f"Query completed for all languages with specified data type for {data_type}."
+                f"Query completed for all languages with specified data type for {data_type.capitalize()}."
             )
 
         else:
@@ -142,7 +142,9 @@ def get_data(
 
     elif language or data_type:
         data_type = data_type[0] if isinstance(data_type, list) else data_type
-        print(f"Updating data for language(s): {language}; data type(s): {data_type}")
+        print(
+            f"Updating data for language(s): {language.capitalize()}; data type(s): {data_type.capitalize()}"
+        )
         query_data(
             languages=languages,
             data_type=data_types,

diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py
@@ -280,8 +280,8 @@ def main() -> None:
 
             else:
                 get_data(
-                    language=args.language,
-                    data_type=args.data_type,
+                    language=args.language.lower(),
+                    data_type=args.data_type.lower(),
                     output_type=args.output_type,
                     output_dir=args.output_dir,
                     outputs_per_entry=args.outputs_per_entry,
@@ -292,14 +292,17 @@ def main() -> None:
         elif args.command in ["total", "t"]:
             if args.interactive:
                 start_interactive_mode(operation="total")
+
             else:
                 total_wrapper(
-                    language=args.language, data_type=args.data_type, all_bool=args.all
+                    language=args.language.lower() if args.language is not None else None,
+                    data_type=args.data_type.lower() if args.data_type is not None else None,
+                    all_bool=args.all,
                 )
 
         elif args.command in ["convert", "c"]:
             convert_wrapper(
-                language=args.language,
+                language=args.language.lower(),
                 data_type=args.data_type,
                 output_type=args.output_type,
                 input_file=args.input_file,

diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py
@@ -53,12 +53,11 @@ def get_qid_by_input(input_str):
             The QID corresponding to the input string, or- None if not found.
     """
     if input_str:
-        input_str_lower = input_str.lower()
-        if input_str_lower in language_to_qid:
-            return language_to_qid[input_str_lower]
+        if input_str in language_to_qid:
+            return language_to_qid[input_str]
 
-        elif input_str_lower in data_type_metadata:
-            return data_type_metadata[input_str_lower]
+        elif input_str in data_type_metadata:
+            return data_type_metadata[input_str]
 
     return None
 
@@ -102,7 +101,7 @@ def get_datatype_list(language):
 
             if not data_types:
                 raise ValueError(
-                    f"No data types available for sub-languages of '{formatted_language}'."
+                    f"No data types available for sub-languages of '{formatted_language.capitalize()}'."
                 )
 
             return sorted(set(data_types))  # remove duplicates and sort
@@ -116,7 +115,7 @@ def get_datatype_list(language):
 
             if not data_types:
                 raise ValueError(
-                    f"No data types available for language '{formatted_language}'."
+                    f"No data types available for language '{formatted_language.capitalize()}'."
                 )
 
             return sorted(data_types)
@@ -184,12 +183,12 @@ def print_total_lexemes(language: str = None):
         and language[1:].isdigit()
     ):
         print(
-            f"Wikidata QID {language} passed. Checking validity and then all data types."
+            f"Wikidata QID {language.capitalize()} passed. Checking validity and then all data types."
         )
         language = check_qid_is_language(qid=language)
 
     else:
-        print(f"Returning total counts for {language} data types...\n")
+        print(f"Returning total counts for {language.capitalize()} data types...\n")
 
     def print_total_header(language, dt, total_lexemes):
         """
@@ -210,7 +209,9 @@ def print_total_header(language, dt, total_lexemes):
 
             first_row = True
             for dt in data_types:
-                total_lexemes = get_total_lexemes(lang, dt, False)
+                total_lexemes = get_total_lexemes(
+                    language=lang, data_type=dt, do_print=False
+                )
                 total_lexemes = f"{total_lexemes:,}"
                 if first_row:
                     print_total_header(lang, dt, total_lexemes)
@@ -233,7 +234,9 @@ def print_total_header(language, dt, total_lexemes):
             data_types = get_datatype_list(language)
 
         for dt in data_types:
-            total_lexemes = get_total_lexemes(language, dt, False)
+            total_lexemes = get_total_lexemes(
+                language=language, data_type=dt, do_print=False
+            )
             total_lexemes = f"{total_lexemes:,}"
             if first_row:
                 print_total_header(language, dt, total_lexemes)
@@ -248,7 +251,7 @@ def print_total_header(language, dt, total_lexemes):
 # MARK: Get Total
 
 
-def get_total_lexemes(language, data_type, doPrint=True):
+def get_total_lexemes(language, data_type, do_print=True):
     """
     Get the total number of lexemes for a given language and data type from Wikidata.
 
@@ -344,13 +347,13 @@ def get_total_lexemes(language, data_type, doPrint=True):
 
         output_template = ""
         if language:
-            output_template += f"\nLanguage: {language}\n"
+            output_template += f"\nLanguage: {language.capitalize()}\n"
 
         if data_type:
             output_template += f"Data type: {data_type}\n"
 
-        output_template += f"Total number of lexemes: {total_lexemes}\n"
-        if doPrint:
+        output_template += f"Total number of lexemes: {total_lexemes:,}\n"
+        if do_print:
             print(output_template)
 
         return total_lexemes
@@ -399,7 +402,9 @@ def total_wrapper(
                 True  # flag to check if it's the first data type for the language
             )
             for dt in data_types:
-                total_lexemes = get_total_lexemes(lang, dt, False)
+                total_lexemes = get_total_lexemes(
+                    language=lang, data_type=dt, do_print=False
+                )
                 total_lexemes = (
                     f"{total_lexemes:,}" if total_lexemes is not None else "N/A"
                 )
@@ -413,16 +418,16 @@ def total_wrapper(
             print()
 
     elif language is not None and data_type is None:
-        print_total_lexemes(language)
+        print_total_lexemes(language=language)
 
     elif language is not None and not all_bool:
-        get_total_lexemes(language, data_type)
+        get_total_lexemes(language=language, data_type=data_type)
 
     elif language is not None:
         print(
-            f"You have already specified language {language} and data type {data_type} - no need to specify --all."
+            f"You have already specified language {language.capitalize()} and data type {data_type} - no need to specify --all."
         )
-        get_total_lexemes(language, data_type)
+        get_total_lexemes(language=language, data_type=data_type)
 
     else:
         raise ValueError("Invalid input or missing information")
diff --git a/src/scribe_data/unicode/generate_emoji_keywords.py b/src/scribe_data/unicode/generate_emoji_keywords.py
@@ -64,10 +64,12 @@ def generate_emoji(language, output_dir: str = None):
             Path(__file__).parent / "cldr-annotations-full" / "annotations"
         )
         if iso in os.listdir(path_to_cldr_annotations):
-            print(f"Emoji Generation for language {language} is supported")
+            print(f"Emoji Generation for language {language.capitalize()} is supported")
 
         else:
-            print(f"Emoji Generation for language {language} is not supported")
+            print(
+                f"Emoji Generation for language {language.capitalize()} is not supported"
+            )
             return
 
         updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir
@@ -82,6 +84,6 @@ def generate_emoji(language, output_dir: str = None):
                 file_path=output_dir,
                 formatted_data=emoji_keywords_dict,
                 query_data_in_use=True,
-                language=language.capitalize(),
+                language=language,
                 data_type=DATA_TYPE,
             )