Skip to content

Commit

Permalink
Update generate_emoji_keyword.py with additional changes
Browse files Browse the repository at this point in the history
  • Loading branch information
Ekikereabasi-Nk committed Oct 17, 2024
1 parent dc2dbac commit 08a6986
Show file tree
Hide file tree
Showing 3 changed files with 259 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""
* Copyright (C) 2024 Scribe
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
"""

import argparse
from scribe_data.unicode.generate_emoji_keyword import generate_emoji_keyword

# Grouped language this script generates emoji keywords for.
LANGUAGE = "Hindustani"  # Change to a grouped language if needed
emojis_per_keyword = 3

# Command-line options, declared as (flag, add_argument keyword options) pairs
# so the parser below can be built in a single pass.
_CLI_OPTIONS = (
    (
        "--file-path",
        {
            "required": True,
            "help": "Path to save the generated emoji keywords.",
        },
    ),
    (
        "--sub-languages",
        {
            "nargs": "*",
            "help": "List of specific sub-languages to process (e.g., Hindi Urdu). If omitted, all sub-languages will be processed.",
        },
    ),
    (
        "--gender",
        {
            "choices": ["male", "female", "neutral"],
            "help": "Specify the gender for emoji customization.",
        },
    ),
    (
        "--region",
        {
            "help": "Specify the region for emoji customization (e.g., US, IN).",
        },
    ),
)

parser = argparse.ArgumentParser(
    description="Generate emoji keywords for a specific language."
)
for _flag, _options in _CLI_OPTIONS:
    parser.add_argument(_flag, **_options)

# Read the options supplied on the command line.
args = parser.parse_args()

# Hand everything over to the generator; the optional values stay None when
# their flags are not given.
generate_emoji_keyword(
    LANGUAGE,
    emojis_per_keyword,
    args.file_path,
    gender=args.gender,
    region=args.region,
    sub_languages=args.sub_languages,
)
78 changes: 78 additions & 0 deletions src/scribe_data/unicode/generate_emoji_keyword.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from .process_unicode import gen_emoji_lexicon
from scribe_data.utils import export_formatted_data


def generate_emoji_keyword(
    language,
    emojis_per_keyword,
    file_path,
    gender=None,
    region=None,
    sub_languages=None,
):
    """
    Generate and export emoji keywords for a language or for the members of a grouped language (e.g., Hindustani = Hindi + Urdu).

    Parameters:
    - language (str): The language or grouped language to generate emoji keywords for (e.g., "Hindustani" for Hindi and Urdu).
    - emojis_per_keyword (int): Number of emojis to associate with each keyword.
    - file_path (str): Where the generated emoji keywords are saved. For a grouped language each sub-language is written to "<file_path>_<sub-language>.json".
    - gender (str): Gender-based customization for emojis (e.g., "male", "female").
    - region (str): Regional customization for emojis (e.g., "US", "JP").
    - sub_languages (list): Specific sub-languages to process for a grouped language (e.g., ["Hindi", "Urdu"]). When empty or not provided, every sub-language in the group is processed. Ignored for non-grouped languages.
    """

    # Grouped languages mapped to the sub-languages they consist of.
    language_groups = {
        "Hindustani": ["Hindi", "Urdu"],
        "Norwegian": ["Bokmål", "Nynorsk"],
        # Add more grouped languages as needed
    }

    def build_and_export(target_language, target_path):
        # Generate the lexicon for one language and export it, skipping the
        # export entirely when nothing was generated.
        lexicon = gen_emoji_lexicon(
            language=target_language,
            emojis_per_keyword=emojis_per_keyword,
            gender=gender,
            region=region,
        )
        if not lexicon:
            return

        export_formatted_data(
            file_path=target_path,
            formatted_data=lexicon,
            query_data_in_use=True,
            language=target_language,
            data_type="emoji-keywords",
        )

    # A plain language is exported straight to the requested path.
    if language not in language_groups:
        build_and_export(language, file_path)
        return

    # A grouped language fans out to the requested members, or to the whole
    # group when none were requested; each member gets its own output file.
    for member in sub_languages or language_groups[language]:
        print(f"Processing sub-language: {member}")
        build_and_export(member, f"{file_path}_{member}.json")
124 changes: 124 additions & 0 deletions src/scribe_data/unicode/process_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,130 @@
def gen_emoji_lexicon(
    language: str,
    emojis_per_keyword: int,
    gender=None,
    region=None,
) -> dict:
    """
    Generate emoji lexicon for a given language with optional gender and region customizations.

    Parameters:
    - language (str): The language, or grouped language (e.g., "Hindustani"), for which emoji keywords are generated.
    - emojis_per_keyword (int): Number of emojis to associate with each keyword. Not yet used by the placeholder selection logic below.
    - gender (str, optional): Gender-based customization for emojis (e.g., "male", "female").
    - region (str, optional): Regional customization for emojis (e.g., "US", "JP").

    Returns:
    - dict: For a single language, a dictionary with one entry keyed by that language. For a grouped language, one entry per sub-language plus an entry keyed by the grouped language that holds a copy of the sub-language entries.
    """

    # Grouped languages and the specific languages they are made up of.
    grouped_languages = {
        "Hindustani": ["Hindi", "Urdu"],
        "Norwegian": ["Bokmål", "Nynorsk"],
        # Add more grouped languages as needed
    }

    def add_emojis_for_gender_region(lang, gender, region):
        """
        Select emojis for a language based on gender and region.

        This is currently a placeholder: the branches below mark where the
        gender-specific ("male" / "female") and region-specific ("IN")
        selection logic should be added, and an empty dictionary is always
        returned. As the selection logic grows it should be moved into its
        own module to keep this function small.

        Parameters:
        - lang (str): The language for which emojis are being selected.
        - gender (str): The gender to select emojis for.
        - region (str): The region to select emojis for (e.g., "IN" for India).

        Returns:
        - dict: The selected emoji data (always empty for now).
        """

        if gender == "male":
            # Include male-specific emojis logic
            pass
        elif gender == "female":
            # Include female-specific emojis logic
            pass

        if region == "IN":
            # Include region-specific emojis logic
            pass

        # Return any generated emoji data for the given language
        return {}

    sub_languages = grouped_languages.get(language)

    # Not a grouped language: handle it as a single language.
    if sub_languages is None:
        return {language: add_emojis_for_gender_region(language, gender, region)}

    # Grouped language: collect the emojis of every sub-language.
    emoji_keywords_dict = {}
    for sub_lang in sub_languages:
        print(f"Processing sub-language: {sub_lang}")
        emoji_keywords_dict[sub_lang] = add_emojis_for_gender_region(
            sub_lang, gender, region
        )

    # Combine the results under the grouped language's name. A copy is stored
    # on purpose: assigning the dictionary to one of its own keys would make
    # it contain itself, which breaks JSON export and recursive comparison.
    emoji_keywords_dict[language] = {
        name: dict(emojis) for name, emojis in emoji_keywords_dict.items()
    }

    return emoji_keywords_dict


def gen_emoji_lexicon_old(
language: str,
emojis_per_keyword: int,
):
"""
Generates a dictionary of keywords (keys) and emoji unicode(s) associated with them (values).
Expand Down

0 comments on commit 08a6986

Please sign in to comment.