Update generate_emoji_keywords.py and create common_arg_parser.py for modularity
scribe-org · Oct 17, 2024 · 5a47465 · 5a47465
1 parent 08a6986
commit 5a47465
Show file tree

Hide file tree

Showing 3 changed files with 112 additions and 64 deletions.
diff --git a/..._data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py b/..._data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py
@@ -1,56 +1,43 @@
 """
-* Copyright (C) 2024 Scribe
-*
-* This program is free software: you can redistribute it and/or modify
-* it under the terms of the GNU General Public License as published by
-* the Free Software Foundation, either version 3 of the License, or
-* (at your option) any later version.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-* GNU General Public License for more details.
-*
-* You should have received a copy of the GNU General Public License
-* along with this program.  If not, see <https://www.gnu.org/licenses/>.
+Generates keyword-emoji relationships from a selection of Hindustani words.
+
+.. raw:: html
+    <!--
+    * Copyright (C) 2024 Scribe
+    *
+    * This program is free software: you can redistribute it and/or modify
+    * it under the terms of the GNU General Public License as published by
+    * the Free Software Foundation, either version 3 of the License, or
+    * (at your option) any later version.
+    *
+    * This program is distributed in the hope that it will be useful,
+    * but WITHOUT ANY WARRANTY; without even the implied warranty of
+    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    * GNU General Public License for more details.
+    *
+    * You should have received a copy of the GNU General Public License
+    * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+    -->.
 """
 
-import argparse
 from scribe_data.unicode.generate_emoji_keyword import generate_emoji_keyword
+from scribe_data.unicode.common_arg_parser import setup_arg_parser
 
-# Define the main language
-LANGUAGE = "Hindustani"  # Change to a grouped language if needed
-emojis_per_keyword = 3
 
-# Set up the argument parser
-parser = argparse.ArgumentParser(
-    description="Generate emoji keywords for a specific language."
-)
-parser.add_argument(
-    "--file-path", required=True, help="Path to save the generated emoji keywords."
-)
-parser.add_argument(
-    "--sub-languages",
-    nargs="*",
-    help="List of specific sub-languages to process (e.g., Hindi Urdu). If omitted, all sub-languages will be processed.",
-)
-parser.add_argument(
-    "--gender",
-    choices=["male", "female", "neutral"],
-    help="Specify the gender for emoji customization.",
-)
-parser.add_argument(
-    "--region", help="Specify the region for emoji customization (e.g., US, IN)."
-)
+LANGUAGE = "Hindustani"
+
+
+# Set up the argument parser by calling the imported function.
+parser = setup_arg_parser()
 
-# Parse the command-line arguments
+# Parse the command-line arguments.
 args = parser.parse_args()
 
-# Call the generate_emoji_keyword function with optional parameters
+# Call the generate_emoji_keyword function with optional parameters.
 generate_emoji_keyword(
     LANGUAGE,
-    emojis_per_keyword,
     args.file_path,
+    emojis_per_keyword=args.emojis_per_keyword,
     gender=args.gender,
     region=args.region,
     sub_languages=args.sub_languages,

diff --git a/src/scribe_data/unicode/common_arg_parser.py b/src/scribe_data/unicode/common_arg_parser.py
@@ -0,0 +1,51 @@
+"""
+Common argument parsing function for the emoji keyword generation scripts.
+
+.. raw:: html
+    <!--
+    * Copyright (C) 2024 Scribe
+    *
+    * This program is free software: you can redistribute it and/or modify
+    * it under the terms of the GNU General Public License as published by
+    * the Free Software Foundation, either version 3 of the License, or
+    * (at your option) any later version.
+    *
+    * This program is distributed in the hope that it will be useful,
+    * but WITHOUT ANY WARRANTY; without even the implied warranty of
+    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    * GNU General Public License for more details.
+    *
+    * You should have received a copy of the GNU General Public License
+    * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+    -->.
+"""
+
+import argparse
+
+
+def setup_arg_parser() -> argparse.ArgumentParser:
+    """
+    Build the command-line argument parser used by emoji keyword scripts.
+
+    Returns
+    -------
+    argparse.ArgumentParser
+        A parser that accepts ``--file-path`` (required) and the optional
+        ``--sub-languages``, ``--gender``, ``--region`` and
+        ``--emojis-per-keyword`` arguments. The caller is expected to invoke
+        ``parse_args()`` on the returned parser.
+    """
+    parser = argparse.ArgumentParser(
+        description="Generate emoji keywords for a specific language."
+    )
+    parser.add_argument(
+        "--file-path", required=True, help="Path to save the generated emoji keywords."
+    )
+    parser.add_argument(
+        "--sub-languages",
+        nargs="*",
+        help="List of specific sub-languages to process (e.g., Hindi Urdu). If omitted, all sub-languages will be processed.",
+    )
+    parser.add_argument(
+        "--gender",
+        choices=["male", "female", "neutral"],
+        help="Specify the gender for emoji customization.",
+    )
+    parser.add_argument(
+        "--region", help="Specify the region for emoji customization (e.g., US, IN)."
+    )
+    # An explicit default is required here: without it an omitted flag parses
+    # to None, and callers that forward ``args.emojis_per_keyword`` as a
+    # keyword argument would override generate_emoji_keyword's default of 3.
+    parser.add_argument(
+        "--emojis-per-keyword",
+        type=int,
+        default=3,
+        help="Number of emojis to generate per keyword (default: %(default)s).",
+    )
+    return parser
diff --git a/src/scribe_data/unicode/generate_emoji_keyword.py b/src/scribe_data/unicode/generate_emoji_keyword.py
@@ -1,53 +1,63 @@
+"""
+Centralizes emoji keyword generation logic.
+
+.. raw:: html
+    <!--
+    * Copyright (C) 2024 Scribe
+    *
+    * This program is free software: you can redistribute it and/or modify
+    * it under the terms of the GNU General Public License as published by
+    * the Free Software Foundation, either version 3 of the License, or
+    * (at your option) any later version.
+    *
+    * This program is distributed in the hope that it will be useful,
+    * but WITHOUT ANY WARRANTY; without even the implied warranty of
+    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    * GNU General Public License for more details.
+    *
+    * You should have received a copy of the GNU General Public License
+    * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+    -->.
+"""
+
 from .process_unicode import gen_emoji_lexicon
 from scribe_data.utils import export_formatted_data
 
 
 def generate_emoji_keyword(
     language,
-    emojis_per_keyword,
     file_path,
+    emojis_per_keyword=3,  # default value for emojis_per_keyword
     gender=None,
     region=None,
     sub_languages=None,
 ):
-    """
-    Generate emoji keywords for a specified language, with optional support for grouped languages (e.g., Hindustani = Hindi + Urdu).
-
-    Parameters:
-    - language (str): The language or grouped language for which emoji keywords are generated (e.g., "Hindustani" for Hindi and Urdu).
-    - emojis_per_keyword (int): Number of emojis to associate with each keyword.
-    - file_path (str): The path to the file where the generated emoji keywords will be saved.
-    - gender (str): Gender-based customization for emojis (e.g., "male", "female").
-    - region (str): Regional customization for emojis (e.g., "US", "JP").
-    - sub_languages (list): A list of specific sub-languages for grouped languages (e.g., ["Hindi", "Urdu"]). If not provided, all sub-languages in the group will be processed.
-    """
-
-    # Define grouped languages and their sub-languages
+    # Define grouped languages and their sub-languages.
     grouped_languages = {
         "Hindustani": ["Hindi", "Urdu"],
         "Norwegian": ["Bokmål", "Nynorsk"],
-        # Add more grouped languages as needed
+        # Add more grouped languages as needed.
     }
 
-    # If the language is a grouped language, handle its sub-languages
+    # If the language is a grouped language, handle its sub-languages.
     if language in grouped_languages:
-        # If specific sub-languages are provided, only process those
+        # If specific sub-languages are provided, only process those.
         sub_languages_to_process = sub_languages or grouped_languages[language]
 
         for sub_lang in sub_languages_to_process:
             print(f"Processing sub-language: {sub_lang}")
 
-            # Generate emoji keywords for the sub-language
+            # Generate emoji keywords for the sub-language.
             emoji_keywords_dict = gen_emoji_lexicon(
                 language=sub_lang,
                 emojis_per_keyword=emojis_per_keyword,
                 gender=gender,
                 region=region,
             )
 
-            # Export the generated emoji keywords for the sub-language
+            # Export the generated emoji keywords for the sub-language.
             if emoji_keywords_dict:
-                # Save the file with the sub-language included in the file name
+                # Save the file with the sub-language included in the file name.
                 export_file_path = f"{file_path}_{sub_lang}.json"
                 export_formatted_data(
                     file_path=export_file_path,
@@ -57,17 +67,17 @@ def generate_emoji_keyword(
                     data_type="emoji-keywords",
                 )
 
-    # If it's not a grouped language, process it as a single language
+    # If it's not a grouped language, process it as a single language.
     else:
-        # Generate emoji keywords for the given language
+        # Generate emoji keywords for the given language.
         emoji_keywords_dict = gen_emoji_lexicon(
             language=language,
             emojis_per_keyword=emojis_per_keyword,
             gender=gender,
             region=region,
         )
 
-        # Export the generated emoji keywords for the language
+        # Export the generated emoji keywords for the language.
         if emoji_keywords_dict:
             export_formatted_data(
                 file_path=file_path,