From 8238e27b21284719316af796a1ab4674c036637e Mon Sep 17 00:00:00 2001 From: Ekikereabasi Nkereuwem Date: Wed, 16 Oct 2024 07:54:46 +0100 Subject: [PATCH 01/11] Update generate_emoji_keyword.py with additional changes --- src/scribe_data/unicode/common_arg_parser.py | 51 +++++++ .../unicode/generate_emoji_keyword.py | 78 +++++++++++ src/scribe_data/unicode/process_unicode.py | 124 ++++++++++++++++++ 3 files changed, 253 insertions(+) create mode 100644 src/scribe_data/unicode/common_arg_parser.py create mode 100644 src/scribe_data/unicode/generate_emoji_keyword.py diff --git a/src/scribe_data/unicode/common_arg_parser.py b/src/scribe_data/unicode/common_arg_parser.py new file mode 100644 index 000000000..922a88bad --- /dev/null +++ b/src/scribe_data/unicode/common_arg_parser.py @@ -0,0 +1,51 @@ +""" +<<<<<<< HEAD +Generates keyword-emoji relationships from a selection of Hindi words, ensuring Urdu words are excluded. + +.. raw:: html + +""" + +import argparse + +def setup_arg_parser(): + parser = argparse.ArgumentParser( + description="Generate emoji keywords for a specific language." + ) + parser.add_argument( + "--file-path", required=True, help="Path to save the generated emoji keywords." + ) + parser.add_argument( + "--sub-languages", + nargs="*", + help="List of specific sub-languages to process (e.g., Hindi Urdu). If omitted, all sub-languages will be processed.", + ) + parser.add_argument( + "--gender", + choices=["male", "female", "neutral"], + help="Specify the gender for emoji customization.", + ) + parser.add_argument( + "--region", help="Specify the region for emoji customization (e.g., US, IN)." + ) + parser.add_argument( + "--emojis-per-keyword", + type=int, + help="Number of emojis to generate per keyword.", + ) + return parser diff --git a/src/scribe_data/unicode/generate_emoji_keyword.py b/src/scribe_data/unicode/generate_emoji_keyword.py new file mode 100644 index 000000000..df20c2110 --- /dev/null +++ b/src/scribe_data/unicode/generate_emoji_keyword.py @@ -0,0 +1,78 @@ +from .process_unicode import gen_emoji_lexicon +from scribe_data.utils import export_formatted_data + + +def generate_emoji_keyword( + language, + emojis_per_keyword, + file_path, + gender=None, + region=None, + sub_languages=None, +): + """ + Generate emoji keywords for a specified language, with optional support for grouped languages (e.g., Hindustani = Hindi + Urdu). + + Parameters: + - language (str): The language or grouped language for which emoji keywords are generated (e.g., "Hindustani" for Hindi and Urdu). + - emojis_per_keyword (int): Number of emojis to associate with each keyword. + - file_path (str): The path to the file where the generated emoji keywords will be saved. + - gender (str): Gender-based customization for emojis (e.g., "male", "female"). + - region (str): Regional customization for emojis (e.g., "US", "JP"). + - sub_languages (list): A list of specific sub-languages for grouped languages (e.g., ["Hindi", "Urdu"]). If not provided, all sub-languages in the group will be processed. + """ + + # Define grouped languages and their sub-languages + grouped_languages = { + "Hindustani": ["Hindi", "Urdu"], + "Norwegian": ["Bokmål", "Nynorsk"], + # Add more grouped languages as needed + } + + # If the language is a grouped language, handle its sub-languages + if language in grouped_languages: + # If specific sub-languages are provided, only process those + sub_languages_to_process = sub_languages or grouped_languages[language] + + for sub_lang in sub_languages_to_process: + print(f"Processing sub-language: {sub_lang}") + + # Generate emoji keywords for the sub-language + emoji_keywords_dict = gen_emoji_lexicon( + language=sub_lang, + emojis_per_keyword=emojis_per_keyword, + gender=gender, + region=region, + ) + + # Export the generated emoji keywords for the sub-language + if emoji_keywords_dict: + # Save the file with the sub-language included in the file name + export_file_path = f"{file_path}_{sub_lang}.json" + export_formatted_data( + file_path=export_file_path, + formatted_data=emoji_keywords_dict, + query_data_in_use=True, + language=sub_lang, + data_type="emoji-keywords", + ) + + # If it's not a grouped language, process it as a single language + else: + # Generate emoji keywords for the given language + emoji_keywords_dict = gen_emoji_lexicon( + language=language, + emojis_per_keyword=emojis_per_keyword, + gender=gender, + region=region, + ) + + # Export the generated emoji keywords for the language + if emoji_keywords_dict: + export_formatted_data( + file_path=file_path, + formatted_data=emoji_keywords_dict, + query_data_in_use=True, + language=language, + data_type="emoji-keywords", + ) diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py index 223a40fec..319adcf2c 100644 --- a/src/scribe_data/unicode/process_unicode.py +++ b/src/scribe_data/unicode/process_unicode.py @@ -50,6 +50,130 @@ def gen_emoji_lexicon( language: str, emojis_per_keyword: int, + gender=None, + region=None, +): + """ + Generate emoji lexicon for a given language with optional gender and region customizations. + + Parameters: + - language (str): The language for which emoji keywords are generated. + - emojis_per_keyword (int): Number of emojis to associate with each keyword. + - gender (str, optional): Gender-based customization for emojis (e.g., "male", "female"). + - region (str, optional): Regional customization for emojis (e.g., "US", "JP"). + + Returns: + - dict: A dictionary containing emoji keywords and associated emojis. + """ + + # Initialize the emoji dictionary + emoji_keywords_dict = {} + + # Define grouped languages and their specific languages + grouped_languages = { + "Hindustani": ["Hindi", "Urdu"], + "Norwegian": ["Bokmål", "Nynorsk"], + # Add more grouped languages as needed + } + + # Function to add emojis based on gender and region + def add_emojis_for_gender_region(lang, gender, region): + """ + This function generates a set of emojis based on the specified language, + gender, and region. It aims to ensure that the emojis are relevant and + culturally appropriate for the given context. + + Parameters: + - lang (str): The language for which emojis are being generated. This could + affect the representation of certain emojis or their usage. + - gender (str): A string that indicates the gender for which emojis should + be selected. Accepted values are "male" and "female". + - region (str): A string representing the geographical region, which can + influence the selection of emojis to include those that are culturally + significant or popular in that area (e.g., "IN" for India). + + Implementation Details: + 1. **Placeholder Logic**: + - The function currently contains placeholder comments indicating where + the actual logic for selecting emojis should be implemented. This allows + contributors to easily identify where to add the necessary emoji-selection + logic based on gender and region. + + 2. **Gender-Based Emoji Selection**: + - The function checks the gender parameter. Depending on whether the + gender is "male" or "female", different sets of emojis should be + included. For example, if the gender is "male", the logic for selecting + male-specific emojis will be executed. Similarly, for "female", + female-specific emojis should be considered. + - The `pass` statement is a placeholder for the logic that should be + implemented later. This could involve referencing a predefined list of + emojis or generating emojis based on specific criteria related to gender. + + 3. **Region-Based Emoji Selection**: + - The function also checks the region parameter. If the region is "IN", + the logic for selecting emojis that are relevant to India will be executed. + - Just like with gender, the `pass` statement indicates where to add + this logic. The selected emojis should reflect cultural significance or + popular usage in the specified region. + + 4. **Returning Emoji Data**: + - The function is designed to return a dictionary containing the emojis that + have been selected based on the provided parameters. The current implementation + returns an empty dictionary, which should be replaced with the actual logic to + populate it with emoji data generated from the gender and region logic. + + Need for Modularity: + - As the project scales and the emoji selection logic becomes more complex, + it is essential to keep the code modular. This means separating different + functionalities into distinct modules or files. + + + This function serves as a foundational component for generating + emojis tailored to specific user demographics, and implementing it in a + modular fashion will support future enhancements and maintenance. + """ + + if gender == "male": + # Include male-specific emojis logic + pass + elif gender == "female": + # Include female-specific emojis logic + pass + + if region == "IN": + # Include region-specific emojis logic + pass + + # Return any generated emoji data for the given language + return {} + + # Check if the language is a grouped language + for grouped_language, sub_languages in grouped_languages.items(): + if language == grouped_language: + # Process each sub-language in the grouped language + for sub_lang in sub_languages: + print(f"Processing sub-language: {sub_lang}") + # Add emojis for each sub-language based on gender and region + emojis = add_emojis_for_gender_region(sub_lang, gender, region) + emoji_keywords_dict[sub_lang] = emojis # Add to the dictionary + + # If you want to combine results for the grouped language + emoji_keywords_dict[grouped_language] = emoji_keywords_dict + + return emoji_keywords_dict # Return the dict for grouped languages + + # If it's not a grouped language, handle it as a single language + else: + # Generate emojis for the given single language + emojis = add_emojis_for_gender_region(language, gender, region) + emoji_keywords_dict[language] = emojis + + return emoji_keywords_dict + + +def gen_emoji_lexicon_old( + language: str, + emojis_per_keyword: int, ): """ Generates a dictionary of keywords (keys) and emoji unicode(s) associated with them (values). From 8b05fb85d2ce71ad69b6ca633d75543dfdda9343 Mon Sep 17 00:00:00 2001 From: Ekikereabasi Nkereuwem Date: Thu, 17 Oct 2024 14:05:20 +0100 Subject: [PATCH 02/11] Update generate_emoji_keywords.py and created common_arg_parser.py for modularity --- src/scribe_data/unicode/common_arg_parser.py | 5 +- .../unicode/generate_emoji_keyword.py | 56 +++++++++++-------- 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/src/scribe_data/unicode/common_arg_parser.py b/src/scribe_data/unicode/common_arg_parser.py index 922a88bad..aca5f1618 100644 --- a/src/scribe_data/unicode/common_arg_parser.py +++ b/src/scribe_data/unicode/common_arg_parser.py @@ -1,6 +1,5 @@ """ -<<<<<<< HEAD -Generates keyword-emoji relationships from a selection of Hindi words, ensuring Urdu words are excluded. +Common parser .. raw:: html + """ import argparse diff --git a/src/scribe_data/unicode/generate_emoji_keyword.py b/src/scribe_data/unicode/generate_emoji_keyword.py index df20c2110..e728473ce 100644 --- a/src/scribe_data/unicode/generate_emoji_keyword.py +++ b/src/scribe_data/unicode/generate_emoji_keyword.py @@ -1,43 +1,53 @@ +""" +Centralize emoji keyword generation logic + +.. raw:: html + . +""" + from .process_unicode import gen_emoji_lexicon from scribe_data.utils import export_formatted_data def generate_emoji_keyword( language, - emojis_per_keyword, file_path, + emojis_per_keyword=3, # default value for emojis_per_keyword gender=None, region=None, sub_languages=None, ): - """ - Generate emoji keywords for a specified language, with optional support for grouped languages (e.g., Hindustani = Hindi + Urdu). - - Parameters: - - language (str): The language or grouped language for which emoji keywords are generated (e.g., "Hindustani" for Hindi and Urdu). - - emojis_per_keyword (int): Number of emojis to associate with each keyword. - - file_path (str): The path to the file where the generated emoji keywords will be saved. - - gender (str): Gender-based customization for emojis (e.g., "male", "female"). - - region (str): Regional customization for emojis (e.g., "US", "JP"). - - sub_languages (list): A list of specific sub-languages for grouped languages (e.g., ["Hindi", "Urdu"]). If not provided, all sub-languages in the group will be processed. - """ - - # Define grouped languages and their sub-languages + # Define grouped languages and their sub-languages. grouped_languages = { "Hindustani": ["Hindi", "Urdu"], "Norwegian": ["Bokmål", "Nynorsk"], - # Add more grouped languages as needed + # Add more grouped languages as needed. } - # If the language is a grouped language, handle its sub-languages + # If the language is a grouped language, handle its sub-languages. if language in grouped_languages: - # If specific sub-languages are provided, only process those + # If specific sub-languages are provided, only process those. sub_languages_to_process = sub_languages or grouped_languages[language] for sub_lang in sub_languages_to_process: print(f"Processing sub-language: {sub_lang}") - # Generate emoji keywords for the sub-language + # Generate emoji keywords for the sub-language. emoji_keywords_dict = gen_emoji_lexicon( language=sub_lang, emojis_per_keyword=emojis_per_keyword, @@ -45,9 +55,9 @@ def generate_emoji_keyword( region=region, ) - # Export the generated emoji keywords for the sub-language + # Export the generated emoji keywords for the sub-language. if emoji_keywords_dict: - # Save the file with the sub-language included in the file name + # Save the file with the sub-language included in the file name. export_file_path = f"{file_path}_{sub_lang}.json" export_formatted_data( file_path=export_file_path, @@ -57,9 +67,9 @@ def generate_emoji_keyword( data_type="emoji-keywords", ) - # If it's not a grouped language, process it as a single language + # If it's not a grouped language, process it as a single language. else: - # Generate emoji keywords for the given language + # generate emoji keywords for the given language. emoji_keywords_dict = gen_emoji_lexicon( language=language, emojis_per_keyword=emojis_per_keyword, @@ -67,7 +77,7 @@ def generate_emoji_keyword( region=region, ) - # Export the generated emoji keywords for the language + # Export the generated emoji keywords for the language. if emoji_keywords_dict: export_formatted_data( file_path=file_path, From 036c60db8cd431a9c2e252cc39412fc371378b44 Mon Sep 17 00:00:00 2001 From: Ekikereabasi Nkereuwem Date: Wed, 16 Oct 2024 07:54:46 +0100 Subject: [PATCH 03/11] Update generate_emoji_keyword.py with additional changes --- .../unicode/generate_emoji_keyword.py | 32 ++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/scribe_data/unicode/generate_emoji_keyword.py b/src/scribe_data/unicode/generate_emoji_keyword.py index e728473ce..5bf7768d6 100644 --- a/src/scribe_data/unicode/generate_emoji_keyword.py +++ b/src/scribe_data/unicode/generate_emoji_keyword.py @@ -32,6 +32,7 @@ def generate_emoji_keyword( region=None, sub_languages=None, ): + # Define grouped languages and their sub-languages. grouped_languages = { "Hindustani": ["Hindi", "Urdu"], @@ -42,6 +43,18 @@ def generate_emoji_keyword( # If the language is a grouped language, handle its sub-languages. if language in grouped_languages: # If specific sub-languages are provided, only process those. + + # Define grouped languages and their sub-languages. + grouped_languages = { + "Hindustani": ["Hindi", "Urdu"], + "Norwegian": ["Bokmål", "Nynorsk"], + # Add more grouped languages as needed. + } + + # If the language is a grouped language, handle its sub-languages. + if language in grouped_languages: + # If specific sub-languages are provided, only process those. + sub_languages_to_process = sub_languages or grouped_languages[language] for sub_lang in sub_languages_to_process: @@ -55,9 +68,15 @@ def generate_emoji_keyword( region=region, ) + # Export the generated emoji keywords for the sub-language. if emoji_keywords_dict: # Save the file with the sub-language included in the file name. + + # Export the generated emoji keywords for the sub-language. + if emoji_keywords_dict: + # Save the file with the sub-language included in the file name. + export_file_path = f"{file_path}_{sub_lang}.json" export_formatted_data( file_path=export_file_path, @@ -67,22 +86,27 @@ def generate_emoji_keyword( data_type="emoji-keywords", ) + # If it's not a grouped language, process it as a single language. + else: - # generate emoji keywords for the given language. - emoji_keywords_dict = gen_emoji_lexicon( + # Generate emoji keywords for the given language. + + emoji_keywords_dict = gen_emoji_lexicon( language=language, emojis_per_keyword=emojis_per_keyword, gender=gender, region=region, ) + # Export the generated emoji keywords for the language. - if emoji_keywords_dict: + + if emoji_keywords_dict: export_formatted_data( file_path=file_path, formatted_data=emoji_keywords_dict, query_data_in_use=True, language=language, data_type="emoji-keywords", - ) +) \ No newline at end of file From 53206be030060d2c05cb68448d039e81e2ab64b9 Mon Sep 17 00:00:00 2001 From: Ekikereabasi Nkereuwem Date: Thu, 17 Oct 2024 16:53:59 +0100 Subject: [PATCH 04/11] Update src/scribe_data/unicode/generate_emoji_keyword.py --- .../unicode/generate_emoji_keyword.py | 57 +++++++++---------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/src/scribe_data/unicode/generate_emoji_keyword.py b/src/scribe_data/unicode/generate_emoji_keyword.py index 5bf7768d6..7a28e595a 100644 --- a/src/scribe_data/unicode/generate_emoji_keyword.py +++ b/src/scribe_data/unicode/generate_emoji_keyword.py @@ -32,7 +32,6 @@ def generate_emoji_keyword( region=None, sub_languages=None, ): - # Define grouped languages and their sub-languages. grouped_languages = { "Hindustani": ["Hindi", "Urdu"], @@ -44,12 +43,12 @@ def generate_emoji_keyword( if language in grouped_languages: # If specific sub-languages are provided, only process those. - # Define grouped languages and their sub-languages. - grouped_languages = { - "Hindustani": ["Hindi", "Urdu"], - "Norwegian": ["Bokmål", "Nynorsk"], - # Add more grouped languages as needed. - } + # Define grouped languages and their sub-languages. + grouped_languages = { + "Hindustani": ["Hindi", "Urdu"], + "Norwegian": ["Bokmål", "Nynorsk"], + # Add more grouped languages as needed. + } # If the language is a grouped language, handle its sub-languages. if language in grouped_languages: @@ -68,45 +67,41 @@ def generate_emoji_keyword( region=region, ) - # Export the generated emoji keywords for the sub-language. if emoji_keywords_dict: # Save the file with the sub-language included in the file name. - # Export the generated emoji keywords for the sub-language. - if emoji_keywords_dict: - # Save the file with the sub-language included in the file name. - - export_file_path = f"{file_path}_{sub_lang}.json" - export_formatted_data( - file_path=export_file_path, - formatted_data=emoji_keywords_dict, - query_data_in_use=True, - language=sub_lang, - data_type="emoji-keywords", - ) + # Export the generated emoji keywords for the sub-language. + if emoji_keywords_dict: + # Save the file with the sub-language included in the file name. + export_file_path = f"{file_path}_{sub_lang}.json" + export_formatted_data( + file_path=export_file_path, + formatted_data=emoji_keywords_dict, + query_data_in_use=True, + language=sub_lang, + data_type="emoji-keywords", + ) # If it's not a grouped language, process it as a single language. - + else: # Generate emoji keywords for the given language. - emoji_keywords_dict = gen_emoji_lexicon( + emoji_keywords_dict = gen_emoji_lexicon( language=language, emojis_per_keyword=emojis_per_keyword, gender=gender, region=region, ) - # Export the generated emoji keywords for the language. - if emoji_keywords_dict: - export_formatted_data( - file_path=file_path, - formatted_data=emoji_keywords_dict, - query_data_in_use=True, - language=language, - data_type="emoji-keywords", -) \ No newline at end of file + export_formatted_data( + file_path=file_path, + formatted_data=emoji_keywords_dict, + query_data_in_use=True, + language=language, + data_type="emoji-keywords", + ) From a72ab702a4fe6f3bd854b203633ed130ecc5cf30 Mon Sep 17 00:00:00 2001 From: Ekikereabasi Nkereuwem Date: Sat, 19 Oct 2024 21:57:13 +0100 Subject: [PATCH 05/11] Updated centralized emoji logic --- src/scribe_data/unicode/common_arg_parser.py | 50 ------- .../unicode/generate_emoji_keyword.py | 123 +++++++---------- src/scribe_data/unicode/process_unicode.py | 124 ------------------ 3 files changed, 49 insertions(+), 248 deletions(-) delete mode 100644 src/scribe_data/unicode/common_arg_parser.py diff --git a/src/scribe_data/unicode/common_arg_parser.py b/src/scribe_data/unicode/common_arg_parser.py deleted file mode 100644 index aca5f1618..000000000 --- a/src/scribe_data/unicode/common_arg_parser.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Common parser - -.. raw:: html - . + --> """ -from .process_unicode import gen_emoji_lexicon -from scribe_data.utils import export_formatted_data - +import argparse +import json +from pathlib import Path -def generate_emoji_keyword( - language, - file_path, - emojis_per_keyword=3, # default value for emojis_per_keyword - gender=None, - region=None, - sub_languages=None, -): - # Define grouped languages and their sub-languages. - grouped_languages = { - "Hindustani": ["Hindi", "Urdu"], - "Norwegian": ["Bokmål", "Nynorsk"], - # Add more grouped languages as needed. - } +from scribe_data.unicode.process_unicode import gen_emoji_lexicon +from scribe_data.utils import export_formatted_data - # If the language is a grouped language, handle its sub-languages. - if language in grouped_languages: - # If specific sub-languages are provided, only process those. +DATA_TYPE = "emoji-keywords" +EMOJIS_PER_KEYWORD = 3 - # Define grouped languages and their sub-languages. - grouped_languages = { - "Hindustani": ["Hindi", "Urdu"], - "Norwegian": ["Bokmål", "Nynorsk"], - # Add more grouped languages as needed. - } +# Define the path to the languages JSON file. +LANGUAGES_JSON = Path(__file__).parent / "supported_language.json" - # If the language is a grouped language, handle its sub-languages. - if language in grouped_languages: - # If specific sub-languages are provided, only process those. - sub_languages_to_process = sub_languages or grouped_languages[language] +def main(file_path): + # Read the language codes and names from the JSON. + with open(LANGUAGES_JSON, "r", encoding="utf-8") as f: + languages = json.load(f) - for sub_lang in sub_languages_to_process: - print(f"Processing sub-language: {sub_lang}") + for code, language in languages.items(): + print(f"Generating emoji keywords for {language} ({code})...") - # Generate emoji keywords for the sub-language. - emoji_keywords_dict = gen_emoji_lexicon( - language=sub_lang, - emojis_per_keyword=emojis_per_keyword, - gender=gender, - region=region, - ) + language_dir = file_path / f"{language}" + emoji_dir = language_dir / "emoji_keywords" + init_file = emoji_dir / "__init__.py" - # Export the generated emoji keywords for the sub-language. - if emoji_keywords_dict: - # Save the file with the sub-language included in the file name. + # Ensure that the emoji_keywords directory and __init__.py file exist. + emoji_dir.mkdir(parents=True, exist_ok=True) - # Export the generated emoji keywords for the sub-language. - if emoji_keywords_dict: - # Save the file with the sub-language included in the file name. + if not init_file.exists(): + # Create the __init__.py file if it doesn't exist. + init_file.touch() + print(f"Created __init__.py in {emoji_dir}.") - export_file_path = f"{file_path}_{sub_lang}.json" - export_formatted_data( - file_path=export_file_path, - formatted_data=emoji_keywords_dict, - query_data_in_use=True, - language=sub_lang, - data_type="emoji-keywords", - ) + if emoji_keywords_dict := gen_emoji_lexicon( + language=language, + emojis_per_keyword=EMOJIS_PER_KEYWORD, + ): + export_formatted_data( + file_path=emoji_dir / f"{code}_emoji_keywords.json", + formatted_data=emoji_keywords_dict, + query_data_in_use=True, + language=language, + data_type=DATA_TYPE, + ) + print(f"Emoji keywords for {language} saved.\n") - # If it's not a grouped language, process it as a single language. - else: - # Generate emoji keywords for the given language. +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--file-path", required=True, help="Path to save the emoji keywords files." + ) + args = parser.parse_args() - emoji_keywords_dict = gen_emoji_lexicon( - language=language, - emojis_per_keyword=emojis_per_keyword, - gender=gender, - region=region, - ) + # Ensure the directory exists. + output_dir = Path(args.file_path) + output_dir.mkdir(parents=True, exist_ok=True) - # Export the generated emoji keywords for the language. - if emoji_keywords_dict: - export_formatted_data( - file_path=file_path, - formatted_data=emoji_keywords_dict, - query_data_in_use=True, - language=language, - data_type="emoji-keywords", - ) + # Call the main function. + main(output_dir) diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py index 319adcf2c..223a40fec 100644 --- a/src/scribe_data/unicode/process_unicode.py +++ b/src/scribe_data/unicode/process_unicode.py @@ -50,130 +50,6 @@ def gen_emoji_lexicon( language: str, emojis_per_keyword: int, - gender=None, - region=None, -): - """ - Generate emoji lexicon for a given language with optional gender and region customizations. - - Parameters: - - language (str): The language for which emoji keywords are generated. - - emojis_per_keyword (int): Number of emojis to associate with each keyword. - - gender (str, optional): Gender-based customization for emojis (e.g., "male", "female"). - - region (str, optional): Regional customization for emojis (e.g., "US", "JP"). - - Returns: - - dict: A dictionary containing emoji keywords and associated emojis. - """ - - # Initialize the emoji dictionary - emoji_keywords_dict = {} - - # Define grouped languages and their specific languages - grouped_languages = { - "Hindustani": ["Hindi", "Urdu"], - "Norwegian": ["Bokmål", "Nynorsk"], - # Add more grouped languages as needed - } - - # Function to add emojis based on gender and region - def add_emojis_for_gender_region(lang, gender, region): - """ - This function generates a set of emojis based on the specified language, - gender, and region. It aims to ensure that the emojis are relevant and - culturally appropriate for the given context. - - Parameters: - - lang (str): The language for which emojis are being generated. This could - affect the representation of certain emojis or their usage. - - gender (str): A string that indicates the gender for which emojis should - be selected. Accepted values are "male" and "female". - - region (str): A string representing the geographical region, which can - influence the selection of emojis to include those that are culturally - significant or popular in that area (e.g., "IN" for India). - - Implementation Details: - 1. **Placeholder Logic**: - - The function currently contains placeholder comments indicating where - the actual logic for selecting emojis should be implemented. This allows - contributors to easily identify where to add the necessary emoji-selection - logic based on gender and region. - - 2. **Gender-Based Emoji Selection**: - - The function checks the gender parameter. Depending on whether the - gender is "male" or "female", different sets of emojis should be - included. For example, if the gender is "male", the logic for selecting - male-specific emojis will be executed. Similarly, for "female", - female-specific emojis should be considered. - - The `pass` statement is a placeholder for the logic that should be - implemented later. This could involve referencing a predefined list of - emojis or generating emojis based on specific criteria related to gender. - - 3. **Region-Based Emoji Selection**: - - The function also checks the region parameter. If the region is "IN", - the logic for selecting emojis that are relevant to India will be executed. - - Just like with gender, the `pass` statement indicates where to add - this logic. The selected emojis should reflect cultural significance or - popular usage in the specified region. - - 4. **Returning Emoji Data**: - - The function is designed to return a dictionary containing the emojis that - have been selected based on the provided parameters. The current implementation - returns an empty dictionary, which should be replaced with the actual logic to - populate it with emoji data generated from the gender and region logic. - - Need for Modularity: - - As the project scales and the emoji selection logic becomes more complex, - it is essential to keep the code modular. This means separating different - functionalities into distinct modules or files. - - - This function serves as a foundational component for generating - emojis tailored to specific user demographics, and implementing it in a - modular fashion will support future enhancements and maintenance. - """ - - if gender == "male": - # Include male-specific emojis logic - pass - elif gender == "female": - # Include female-specific emojis logic - pass - - if region == "IN": - # Include region-specific emojis logic - pass - - # Return any generated emoji data for the given language - return {} - - # Check if the language is a grouped language - for grouped_language, sub_languages in grouped_languages.items(): - if language == grouped_language: - # Process each sub-language in the grouped language - for sub_lang in sub_languages: - print(f"Processing sub-language: {sub_lang}") - # Add emojis for each sub-language based on gender and region - emojis = add_emojis_for_gender_region(sub_lang, gender, region) - emoji_keywords_dict[sub_lang] = emojis # Add to the dictionary - - # If you want to combine results for the grouped language - emoji_keywords_dict[grouped_language] = emoji_keywords_dict - - return emoji_keywords_dict # Return the dict for grouped languages - - # If it's not a grouped language, handle it as a single language - else: - # Generate emojis for the given single language - emojis = add_emojis_for_gender_region(language, gender, region) - emoji_keywords_dict[language] = emojis - - return emoji_keywords_dict - - -def gen_emoji_lexicon_old( - language: str, - emojis_per_keyword: int, ): """ Generates a dictionary of keywords (keys) and emoji unicode(s) associated with them (values). From 1209620cd23942dc188c2f8ddd7e9bab337c9ea7 Mon Sep 17 00:00:00 2001 From: Ekikereabasi Nkereuwem Date: Wed, 16 Oct 2024 07:54:46 +0100 Subject: [PATCH 06/11] Update generate_emoji_keyword.py with additional changes --- src/scribe_data/unicode/process_unicode.py | 124 +++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py index 223a40fec..319adcf2c 100644 --- a/src/scribe_data/unicode/process_unicode.py +++ b/src/scribe_data/unicode/process_unicode.py @@ -50,6 +50,130 @@ def gen_emoji_lexicon( language: str, emojis_per_keyword: int, + gender=None, + region=None, +): + """ + Generate emoji lexicon for a given language with optional gender and region customizations. + + Parameters: + - language (str): The language for which emoji keywords are generated. + - emojis_per_keyword (int): Number of emojis to associate with each keyword. + - gender (str, optional): Gender-based customization for emojis (e.g., "male", "female"). + - region (str, optional): Regional customization for emojis (e.g., "US", "JP"). + + Returns: + - dict: A dictionary containing emoji keywords and associated emojis. + """ + + # Initialize the emoji dictionary + emoji_keywords_dict = {} + + # Define grouped languages and their specific languages + grouped_languages = { + "Hindustani": ["Hindi", "Urdu"], + "Norwegian": ["Bokmål", "Nynorsk"], + # Add more grouped languages as needed + } + + # Function to add emojis based on gender and region + def add_emojis_for_gender_region(lang, gender, region): + """ + This function generates a set of emojis based on the specified language, + gender, and region. It aims to ensure that the emojis are relevant and + culturally appropriate for the given context. + + Parameters: + - lang (str): The language for which emojis are being generated. This could + affect the representation of certain emojis or their usage. + - gender (str): A string that indicates the gender for which emojis should + be selected. Accepted values are "male" and "female". + - region (str): A string representing the geographical region, which can + influence the selection of emojis to include those that are culturally + significant or popular in that area (e.g., "IN" for India). + + Implementation Details: + 1. **Placeholder Logic**: + - The function currently contains placeholder comments indicating where + the actual logic for selecting emojis should be implemented. This allows + contributors to easily identify where to add the necessary emoji-selection + logic based on gender and region. + + 2. **Gender-Based Emoji Selection**: + - The function checks the gender parameter. Depending on whether the + gender is "male" or "female", different sets of emojis should be + included. For example, if the gender is "male", the logic for selecting + male-specific emojis will be executed. Similarly, for "female", + female-specific emojis should be considered. + - The `pass` statement is a placeholder for the logic that should be + implemented later. This could involve referencing a predefined list of + emojis or generating emojis based on specific criteria related to gender. + + 3. **Region-Based Emoji Selection**: + - The function also checks the region parameter. If the region is "IN", + the logic for selecting emojis that are relevant to India will be executed. + - Just like with gender, the `pass` statement indicates where to add + this logic. The selected emojis should reflect cultural significance or + popular usage in the specified region. + + 4. **Returning Emoji Data**: + - The function is designed to return a dictionary containing the emojis that + have been selected based on the provided parameters. The current implementation + returns an empty dictionary, which should be replaced with the actual logic to + populate it with emoji data generated from the gender and region logic. + + Need for Modularity: + - As the project scales and the emoji selection logic becomes more complex, + it is essential to keep the code modular. This means separating different + functionalities into distinct modules or files. + + + This function serves as a foundational component for generating + emojis tailored to specific user demographics, and implementing it in a + modular fashion will support future enhancements and maintenance. + """ + + if gender == "male": + # Include male-specific emojis logic + pass + elif gender == "female": + # Include female-specific emojis logic + pass + + if region == "IN": + # Include region-specific emojis logic + pass + + # Return any generated emoji data for the given language + return {} + + # Check if the language is a grouped language + for grouped_language, sub_languages in grouped_languages.items(): + if language == grouped_language: + # Process each sub-language in the grouped language + for sub_lang in sub_languages: + print(f"Processing sub-language: {sub_lang}") + # Add emojis for each sub-language based on gender and region + emojis = add_emojis_for_gender_region(sub_lang, gender, region) + emoji_keywords_dict[sub_lang] = emojis # Add to the dictionary + + # If you want to combine results for the grouped language + emoji_keywords_dict[grouped_language] = emoji_keywords_dict + + return emoji_keywords_dict # Return the dict for grouped languages + + # If it's not a grouped language, handle it as a single language + else: + # Generate emojis for the given single language + emojis = add_emojis_for_gender_region(language, gender, region) + emoji_keywords_dict[language] = emojis + + return emoji_keywords_dict + + +def gen_emoji_lexicon_old( + language: str, + emojis_per_keyword: int, ): """ Generates a dictionary of keywords (keys) and emoji unicode(s) associated with them (values). From c152a3111af8aeccc968c0012bbecfec0256de96 Mon Sep 17 00:00:00 2001 From: Ekikereabasi Nkereuwem Date: Wed, 16 Oct 2024 07:54:46 +0100 Subject: [PATCH 07/11] Update generate_emoji_keyword.py with additional changes --- .../emoji_keywords/generate_emoji_keywords.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py diff --git a/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py b/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py new file mode 100644 index 000000000..acc7555ab --- /dev/null +++ b/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py @@ -0,0 +1,46 @@ +""" +<<<<<<< HEAD +Generates keyword-emoji relationships from a selection of Hindi words, ensuring Urdu words are excluded. + +.. raw:: html + +""" + +from scribe_data.unicode.generate_emoji_keyword import generate_emoji_keyword +from scribe_data.unicode.common_arg_parser import setup_arg_parser + +LANGUAGE = "Hindi" + +# Define the main language. +LANGUAGE = "Hindustani" # Grouped language with sub-languages like Hindi, Urdu + +# Set up the argument parser by calling the imported function. +parser = setup_arg_parser() + +# Parse the command-line arguments. +args = parser.parse_args() + +# Call the generate_emoji_keyword function with optional parameters. +generate_emoji_keyword( + LANGUAGE, + args.file_path, + emojis_per_keyword=args.emojis_per_keyword, + gender=args.gender, + region=args.region, + sub_languages=args.sub_languages, +) \ No newline at end of file From f0feaac50912edeef35e8fa52715ebfeaadfd596 Mon Sep 17 00:00:00 2001 From: Ekikereabasi Nkereuwem Date: Wed, 16 Oct 2024 07:54:46 +0100 Subject: [PATCH 08/11] Update generate_emoji_keyword.py with additional changes --- .../Hindi/emoji_keywords/generate_emoji_keywords.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py b/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py index acc7555ab..2749b9084 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py +++ b/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py @@ -1,5 +1,4 @@ """ -<<<<<<< HEAD Generates keyword-emoji relationships from a selection of Hindi words, ensuring Urdu words are excluded. .. raw:: html @@ -24,10 +23,8 @@ from scribe_data.unicode.generate_emoji_keyword import generate_emoji_keyword from scribe_data.unicode.common_arg_parser import setup_arg_parser -LANGUAGE = "Hindi" - # Define the main language. -LANGUAGE = "Hindustani" # Grouped language with sub-languages like Hindi, Urdu +LANGUAGE = "Hindi" # Set up the argument parser by calling the imported function. parser = setup_arg_parser() @@ -43,4 +40,4 @@ gender=args.gender, region=args.region, sub_languages=args.sub_languages, -) \ No newline at end of file +) From a7339557100085d853fd369d600f80913eb6ac8d Mon Sep 17 00:00:00 2001 From: Ekikereabasi Nkereuwem Date: Thu, 17 Oct 2024 14:05:20 +0100 Subject: [PATCH 09/11] Update generate_emoji_keywords.py and created common_arg_parser.py for modularity --- .../emoji_keywords/generate_emoji_keywords.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py b/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py index 2749b9084..8bcf81d1a 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py +++ b/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py @@ -1,5 +1,5 @@ """ -Generates keyword-emoji relationships from a selection of Hindi words, ensuring Urdu words are excluded. +Generates keyword-emoji relationships from a selection of Hindustani words. .. raw:: html + -->. """ from scribe_data.unicode.generate_emoji_keyword import generate_emoji_keyword -from scribe_data.unicode.common_arg_parser import setup_arg_parser +from scribe_data.unicode.common_arg_parser import setup_arg_parser + + +LANGUAGE = "Hindustani" -# Define the main language. -LANGUAGE = "Hindi" # Set up the argument parser by calling the imported function. parser = setup_arg_parser() @@ -35,9 +36,9 @@ # Call the generate_emoji_keyword function with optional parameters. generate_emoji_keyword( LANGUAGE, - args.file_path, - emojis_per_keyword=args.emojis_per_keyword, - gender=args.gender, - region=args.region, - sub_languages=args.sub_languages, + args.file_path, + emojis_per_keyword=args.emojis_per_keyword, + gender=args.gender, + region=args.region, + sub_languages=args.sub_languages, ) From f602dc3252017b7673b761d5802c58e63d33db20 Mon Sep 17 00:00:00 2001 From: Ekikereabasi Nkereuwem Date: Wed, 16 Oct 2024 07:54:46 +0100 Subject: [PATCH 10/11] Update generate_emoji_keyword.py with additional changes --- .../Hindi/emoji_keywords/generate_emoji_keywords.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py b/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py index 8bcf81d1a..27be9610a 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py +++ b/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py @@ -33,7 +33,11 @@ # Parse the command-line arguments. args = parser.parse_args() -# Call the generate_emoji_keyword function with optional parameters. +# Define the main language +LANGUAGE = "Hindustani" # Change to a grouped language if needed + + +# Call the generate_emoji_keyword function with optional parameters generate_emoji_keyword( LANGUAGE, args.file_path, From 84ef9a1ba1d6427a046d5e8bac4e1383d00c397b Mon Sep 17 00:00:00 2001 From: Ekikereabasi Nkereuwem Date: Sun, 20 Oct 2024 00:13:07 +0100 Subject: [PATCH 11/11] Delete Hindustani emoji_keyword.py file --- .../emoji_keywords/generate_emoji_keywords.py | 48 ------------------- 1 file changed, 48 deletions(-) delete mode 100644 src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py diff --git a/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py b/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py deleted file mode 100644 index 27be9610a..000000000 --- a/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Generates keyword-emoji relationships from a selection of Hindustani words. - -.. raw:: html - . -""" - -from scribe_data.unicode.generate_emoji_keyword import generate_emoji_keyword -from scribe_data.unicode.common_arg_parser import setup_arg_parser - - -LANGUAGE = "Hindustani" - - -# Set up the argument parser by calling the imported function. -parser = setup_arg_parser() - -# Parse the command-line arguments. -args = parser.parse_args() - -# Define the main language -LANGUAGE = "Hindustani" # Change to a grouped language if needed - - -# Call the generate_emoji_keyword function with optional parameters -generate_emoji_keyword( - LANGUAGE, - args.file_path, - emojis_per_keyword=args.emojis_per_keyword, - gender=args.gender, - region=args.region, - sub_languages=args.sub_languages, -)