diff --git a/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py b/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py
new file mode 100644
index 000000000..68d5f0393
--- /dev/null
+++ b/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py
@@ -0,0 +1,57 @@
+"""
+* Copyright (C) 2024 Scribe
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation, either version 3 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program. If not, see <https://www.gnu.org/licenses/>.
+"""
+
+import argparse
+from scribe_data.unicode.generate_emoji_keyword import generate_emoji_keyword
+
+# Define the main language
+LANGUAGE = "Hindustani"  # Grouped language: covers the sub-languages Hindi and Urdu
+emojis_per_keyword = 3
+
+# Set up the argument parser
+parser = argparse.ArgumentParser(
+    description="Generate emoji keywords for a specific language."
+)
+parser.add_argument(
+    "--file-path", required=True, help="Path to save the generated emoji keywords."
+)
+parser.add_argument(
+    "--sub-languages",
+    nargs="*",
+    help="List of specific sub-languages to process (e.g., Hindi Urdu). If omitted, all sub-languages will be processed.",
+)
+parser.add_argument(
+    "--gender",
+    choices=["male", "female", "neutral"],
+    help="Specify the gender for emoji customization.",
+)
+parser.add_argument(
+    "--region", help="Specify the region for emoji customization (e.g., US, IN)."
+)
+
+# Parse the command-line arguments
+args = parser.parse_args()
+
+# Call the generate_emoji_keyword function with optional parameters
+generate_emoji_keyword(
+    LANGUAGE,
+    emojis_per_keyword,
+    args.file_path,
+    gender=args.gender,
+    region=args.region,
+    sub_languages=args.sub_languages,
+)
diff --git a/src/scribe_data/unicode/generate_emoji_keyword.py b/src/scribe_data/unicode/generate_emoji_keyword.py
new file mode 100644
index 000000000..df20c2110
--- /dev/null
+++ b/src/scribe_data/unicode/generate_emoji_keyword.py
@@ -0,0 +1,78 @@
+from .process_unicode import gen_emoji_lexicon
+from scribe_data.utils import export_formatted_data
+
+
+def generate_emoji_keyword(
+    language,
+    emojis_per_keyword,
+    file_path,
+    gender=None,
+    region=None,
+    sub_languages=None,
+):
+    """
+    Generate emoji keywords for a specified language, with optional support for grouped languages (e.g., Hindustani = Hindi + Urdu).
+
+    Parameters:
+    - language (str): The language or grouped language for which emoji keywords are generated (e.g., "Hindustani" for Hindi and Urdu).
+    - emojis_per_keyword (int): Number of emojis to associate with each keyword.
+    - file_path (str): The path to the file where the generated emoji keywords will be saved. For a grouped language, each sub-language is written to "{file_path}_{sub_lang}.json" instead.
+    - gender (str, optional): Gender-based customization for emojis (e.g., "male", "female").
+    - region (str, optional): Regional customization for emojis (e.g., "US", "JP").
+    - sub_languages (list, optional): A list of specific sub-languages for grouped languages (e.g., ["Hindi", "Urdu"]). If not provided (or empty), all sub-languages in the group will be processed.
+    """
+
+    # Define grouped languages and their sub-languages
+    grouped_languages = {
+        "Hindustani": ["Hindi", "Urdu"],
+        "Norwegian": ["Bokmål", "Nynorsk"],
+        # Add more grouped languages as needed
+    }
+
+    # If the language is a grouped language, handle its sub-languages
+    if language in grouped_languages:
+        # If specific sub-languages are provided, only process those
+        sub_languages_to_process = sub_languages or grouped_languages[language]
+
+        for sub_lang in sub_languages_to_process:
+            print(f"Processing sub-language: {sub_lang}")
+
+            # Generate emoji keywords for the sub-language
+            emoji_keywords_dict = gen_emoji_lexicon(
+                language=sub_lang,
+                emojis_per_keyword=emojis_per_keyword,
+                gender=gender,
+                region=region,
+            )
+
+            # Export the generated emoji keywords for the sub-language
+            if emoji_keywords_dict:
+                # Save the file with the sub-language included in the file name
+                export_file_path = f"{file_path}_{sub_lang}.json"
+                export_formatted_data(
+                    file_path=export_file_path,
+                    formatted_data=emoji_keywords_dict,
+                    query_data_in_use=True,
+                    language=sub_lang,
+                    data_type="emoji-keywords",
+                )
+
+    # If it's not a grouped language, process it as a single language
+    else:
+        # Generate emoji keywords for the given language
+        emoji_keywords_dict = gen_emoji_lexicon(
+            language=language,
+            emojis_per_keyword=emojis_per_keyword,
+            gender=gender,
+            region=region,
+        )
+
+        # Export the generated emoji keywords for the language
+        if emoji_keywords_dict:
+            export_formatted_data(
+                file_path=file_path,
+                formatted_data=emoji_keywords_dict,
+                query_data_in_use=True,
+                language=language,
+                data_type="emoji-keywords",
+            )
diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py
index 223a40fec..319adcf2c 100644
--- a/src/scribe_data/unicode/process_unicode.py
+++ b/src/scribe_data/unicode/process_unicode.py
@@ -50,6 +50,92 @@
 def gen_emoji_lexicon(
     language: str,
     emojis_per_keyword: int,
+    gender=None,
+    region=None,
+):
+    """
+    Generate emoji lexicon for a given language with optional gender and region customizations.
+
+    NOTE(review): this is a scaffold. Emoji selection is not implemented yet, so
+    every language currently maps to an empty dictionary; `emojis_per_keyword`
+    is not used and `gen_emoji_lexicon_old` is not called from here.
+
+    Parameters:
+    - language (str): The language for which emoji keywords are generated. A grouped
+      language (e.g., "Hindustani") is expanded into its sub-languages.
+    - emojis_per_keyword (int): Number of emojis to associate with each keyword (currently unused).
+    - gender (str, optional): Gender-based customization for emojis (e.g., "male", "female").
+    - region (str, optional): Regional customization for emojis (e.g., "US", "JP").
+
+    Returns:
+    - dict: `{language: emojis}` for a single language. For a grouped language, one
+      entry per sub-language plus an entry under the grouped language's name that
+      holds a copy of the sub-language entries.
+    """
+
+    # Define grouped languages and their specific languages
+    grouped_languages = {
+        "Hindustani": ["Hindi", "Urdu"],
+        "Norwegian": ["Bokmål", "Nynorsk"],
+        # Add more grouped languages as needed
+    }
+
+    # Function to add emojis based on gender and region
+    def add_emojis_for_gender_region(lang, gender, region):
+        """
+        Return the emojis selected for a language, gender, and region.
+
+        This is placeholder logic: the gender and region branches below only mark
+        where the selection logic should be added, and an empty dictionary is
+        always returned. As that logic grows it should be kept modular, e.g. by
+        moving it into its own module.
+
+        Parameters:
+        - lang (str): The language for which emojis are being generated (not used yet).
+        - gender (str): "male" and "female" have dedicated placeholder branches.
+        - region (str): "IN" has a dedicated placeholder branch.
+
+        Returns:
+        - dict: The selected emoji data; currently always empty.
+        """
+
+        if gender == "male":
+            # Include male-specific emojis logic
+            pass
+        elif gender == "female":
+            # Include female-specific emojis logic
+            pass
+
+        if region == "IN":
+            # Include region-specific emojis logic
+            pass
+
+        # Return any generated emoji data for the given language
+        return {}
+
+    # If it's not a grouped language, handle it as a single language
+    if language not in grouped_languages:
+        return {language: add_emojis_for_gender_region(language, gender, region)}
+
+    # Process each sub-language in the grouped language
+    emoji_keywords_dict = {}
+    for sub_lang in grouped_languages[language]:
+        print(f"Processing sub-language: {sub_lang}")
+        emoji_keywords_dict[sub_lang] = add_emojis_for_gender_region(
+            sub_lang, gender, region
+        )
+
+    # Combine the results under the grouped language's own name. Store a copy:
+    # assigning the dictionary to one of its own keys makes it self-referential,
+    # which json.dumps rejects with "Circular reference detected".
+    emoji_keywords_dict[language] = dict(emoji_keywords_dict)
+
+    return emoji_keywords_dict
+
+
+def gen_emoji_lexicon_old(
+    language: str,
+    emojis_per_keyword: int,
 ):
     """
     Generates a dictionary of keywords (keys) and emoji unicode(s) associated with them (values).