diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 3cbea6980..7bf54453b 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -23,6 +23,7 @@ import subprocess from pathlib import Path +from scribe_data.unicode.generate_emoji_keywords import generate_emoji from scribe_data.utils import ( DEFAULT_CSV_EXPORT_DIR, DEFAULT_JSON_EXPORT_DIR, @@ -102,18 +103,7 @@ def get_data( # MARK: Emojis elif data_type in {"emoji-keywords", "emoji_keywords"}: - for lang in languages: - emoji_keyword_extraction_script = ( - Path(__file__).parent.parent - / "language_data_extraction" - / lang - / "emoji_keywords" - / "generate_emoji_keywords.py" - ) - - subprocess_result = subprocess.run( - ["python", emoji_keyword_extraction_script] - ) + generate_emoji(language=language, output_dir=output_dir) # MARK: Query Data diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 506bbcdd1..1a4c991bc 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -202,6 +202,9 @@ def main() -> None: args = parser.parse_args() + if args.data_type and isinstance(args.data_type, str): + args.data_type = args.data_type.replace("-", "_") + try: if args.language or args.data_type: validate_language_and_data_type( diff --git a/src/scribe_data/unicode/generate_emoji_keywords.py b/src/scribe_data/unicode/generate_emoji_keywords.py new file mode 100644 index 000000000..756f06b31 --- /dev/null +++ b/src/scribe_data/unicode/generate_emoji_keywords.py @@ -0,0 +1,59 @@ +""" +Centralized keyword-emoji generation file that generates emoji keywords for a specified language. + +..
raw:: html + +""" + +import os +from pathlib import Path + +from scribe_data.unicode.process_unicode import gen_emoji_lexicon +from scribe_data.utils import export_formatted_data, get_language_iso + +DATA_TYPE = "emoji-keywords" +EMOJI_KEYWORDS_DICT = 3 + + +def generate_emoji(language, output_dir: str = None): + iso = get_language_iso(language=language) + path_to_cldr_annotations = ( + Path(__file__).parent / "cldr-annotations-full" / "annotations" + ) + if iso in os.listdir(path_to_cldr_annotations): + print(f"Emoji Generation for language {language} is supported") + + else: + print(f"Emoji Generation for language {language} is not supported") + return + + updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir + export_dir = Path(updated_path) / language.capitalize() + export_dir.mkdir(parents=True, exist_ok=True) + + if emoji_keywords_dict := gen_emoji_lexicon( + language=language, + emojis_per_keyword=EMOJI_KEYWORDS_DICT, + ): + export_formatted_data( + file_path=output_dir, + formatted_data=emoji_keywords_dict, + query_data_in_use=True, + language=language, + data_type=DATA_TYPE, + ) diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py index 21f550e5f..a3f39625b 100644 --- a/src/scribe_data/unicode/process_unicode.py +++ b/src/scribe_data/unicode/process_unicode.py @@ -76,7 +76,9 @@ def gen_emoji_lexicon( # Pre-set up the emoji popularity data. 
popularity_dict = {} - with (Path(__file__).parent / "2021_ranked.tsv").open() as popularity_file: + with (Path(__file__).parent / "2021_ranked.tsv").open( + encoding="utf-8" + ) as popularity_file: tsv_reader = csv.DictReader(popularity_file, delimiter="\t") for tsv_row in tsv_reader: popularity_dict[tsv_row["Emoji"]] = int(tsv_row["Rank"]) @@ -93,6 +95,7 @@ def gen_emoji_lexicon( / f"{iso}" / "annotations.json" ) + annotations_derived_file_path = ( Path(__file__).parent / "cldr-annotations-derived-full" @@ -107,7 +110,7 @@ def gen_emoji_lexicon( } for cldr_file_key, cldr_file_path in cldr_file_paths.items(): - with open(cldr_file_path, "r") as file: + with open(cldr_file_path, "r", encoding="utf-8") as file: cldr_data = json.load(file) cldr_dict = cldr_data[cldr_file_key]["annotations"] diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index 686f62843..a1e21e750 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -29,10 +29,15 @@ class TestGetData(unittest.TestCase): # MARK: Subprocess Patching - @patch("subprocess.run") - def test_get_emoji_keywords(self, mock_subprocess_run): - get_data(language="English", data_type="emoji-keywords") - self.assertTrue(mock_subprocess_run.called) + @patch("scribe_data.cli.get.generate_emoji") + def test_get_emoji_keywords(self, generate_emoji): + get_data( + language="English", data_type="emoji_keywords", output_dir="./test_output" + ) + generate_emoji.assert_called_once_with( + language="English", + output_dir="./test_output", + ) # MARK: Invalid Arguments