-
Notifications
You must be signed in to change notification settings - Fork 71
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update generate_emoji_keyword.py with additional changes
- Loading branch information
1 parent
dc2dbac
commit 08a6986
Showing
3 changed files
with
259 additions
and
0 deletions.
There are no files selected for viewing
57 changes: 57 additions & 0 deletions
57
..._data/language_data_extraction/Hindustani/Hindi/emoji_keywords/generate_emoji_keywords.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
""" | ||
* Copyright (C) 2024 Scribe | ||
* | ||
* This program is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation, either version 3 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program. If not, see <https://www.gnu.org/licenses/>. | ||
""" | ||
|
||
import argparse | ||
from scribe_data.unicode.generate_emoji_keyword import generate_emoji_keyword | ||
|
||
# Language group handled by this script and how many emojis to keep per keyword.
LANGUAGE = "Hindustani"  # Change to a grouped language if needed
emojis_per_keyword = 3


def build_parser():
    """
    Build the command-line parser for this script.

    Returns:
    - argparse.ArgumentParser: Parser exposing --file-path (required),
      --sub-languages, --gender and --region.
    """
    parser = argparse.ArgumentParser(
        description="Generate emoji keywords for a specific language."
    )
    parser.add_argument(
        "--file-path", required=True, help="Path to save the generated emoji keywords."
    )
    parser.add_argument(
        "--sub-languages",
        nargs="*",
        help="List of specific sub-languages to process (e.g., Hindi Urdu). If omitted, all sub-languages will be processed.",
    )
    parser.add_argument(
        "--gender",
        choices=["male", "female", "neutral"],
        help="Specify the gender for emoji customization.",
    )
    parser.add_argument(
        "--region", help="Specify the region for emoji customization (e.g., US, IN)."
    )
    return parser


def main(argv=None):
    """
    Parse the command-line arguments and generate the emoji keywords.

    Parameters:
    - argv (list): Argument strings to parse. When None, argparse reads them
      from sys.argv, which is what happens when the file is run as a script.
    """
    args = build_parser().parse_args(argv)

    # Forward the fixed language settings together with the optional CLI values.
    generate_emoji_keyword(
        LANGUAGE,
        emojis_per_keyword,
        args.file_path,
        gender=args.gender,
        region=args.region,
        sub_languages=args.sub_languages,
    )


# Only parse arguments and generate data when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
from .process_unicode import gen_emoji_lexicon | ||
from scribe_data.utils import export_formatted_data | ||
|
||
|
||
def _sub_language_export_path(file_path, sub_language):
    """Return the export path for one sub-language of a grouped language."""
    base = str(file_path)
    # Drop an existing ".json" extension so that "out.json" becomes
    # "out_Hindi.json" instead of "out.json_Hindi.json".
    if base.lower().endswith(".json"):
        base = base[: -len(".json")]
    return f"{base}_{sub_language}.json"


def _generate_and_export(language, emojis_per_keyword, export_path, gender, region):
    """Generate the emoji lexicon for one language and export it if non-empty."""
    emoji_keywords_dict = gen_emoji_lexicon(
        language=language,
        emojis_per_keyword=emojis_per_keyword,
        gender=gender,
        region=region,
    )

    # Nothing is written when the generator returns no data.
    if emoji_keywords_dict:
        export_formatted_data(
            file_path=export_path,
            formatted_data=emoji_keywords_dict,
            query_data_in_use=True,
            language=language,
            data_type="emoji-keywords",
        )


def generate_emoji_keyword(
    language,
    emojis_per_keyword,
    file_path,
    gender=None,
    region=None,
    sub_languages=None,
):
    """
    Generate emoji keywords for a specified language, with optional support for grouped languages (e.g., Hindustani = Hindi + Urdu).

    For a grouped language one file is exported per sub-language, named
    "<file_path>_<sub_language>.json" (a trailing ".json" on file_path is
    dropped before the suffix is added). For any other language the data is
    exported to file_path unchanged. A language whose generated lexicon is
    empty is skipped without writing a file.

    Parameters:
    - language (str): The language or grouped language for which emoji keywords are generated (e.g., "Hindustani" for Hindi and Urdu).
    - emojis_per_keyword (int): Number of emojis to associate with each keyword.
    - file_path (str): The path to the file where the generated emoji keywords will be saved.
    - gender (str): Gender-based customization for emojis (e.g., "male", "female"); passed through to gen_emoji_lexicon.
    - region (str): Regional customization for emojis (e.g., "US", "JP"); passed through to gen_emoji_lexicon.
    - sub_languages (list): A list of specific sub-languages for grouped languages (e.g., ["Hindi", "Urdu"]). If not provided (None or empty), all sub-languages in the group will be processed.

    Returns:
    - None
    """
    # Grouped languages and the sub-languages they consist of.
    grouped_languages = {
        "Hindustani": ["Hindi", "Urdu"],
        "Norwegian": ["Bokmål", "Nynorsk"],
        # Add more grouped languages as needed
    }

    # A language outside the table is processed on its own.
    if language not in grouped_languages:
        _generate_and_export(language, emojis_per_keyword, file_path, gender, region)
        return

    # An explicit selection wins; otherwise every member of the group is processed.
    sub_languages_to_process = sub_languages or grouped_languages[language]

    for sub_lang in sub_languages_to_process:
        print(f"Processing sub-language: {sub_lang}")
        _generate_and_export(
            sub_lang,
            emojis_per_keyword,
            _sub_language_export_path(file_path, sub_lang),
            gender,
            region,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters