Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Created and Added all the languages that support Emoji #440

Merged
merged 9 commits into from
Oct 24, 2024
14 changes: 2 additions & 12 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import subprocess
from pathlib import Path

from scribe_data.unicode.generate_emoji_keywords import generate_emoji
from scribe_data.utils import (
DEFAULT_CSV_EXPORT_DIR,
DEFAULT_JSON_EXPORT_DIR,
Expand Down Expand Up @@ -102,18 +103,7 @@ def get_data(
# MARK: Emojis

elif data_type in {"emoji-keywords", "emoji_keywords"}:
for lang in languages:
emoji_keyword_extraction_script = (
Path(__file__).parent.parent
/ "language_data_extraction"
/ lang
/ "emoji_keywords"
/ "generate_emoji_keywords.py"
)

subprocess_result = subprocess.run(
["python", emoji_keyword_extraction_script]
)
generate_emoji(language=language, output_dir=output_dir)

# MARK: Query Data

Expand Down
3 changes: 3 additions & 0 deletions src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,9 @@ def main() -> None:

args = parser.parse_args()

if args.data_type and isinstance(args.data_type, str):
args.data_type = args.data_type.replace("-", "_")

try:
if args.language or args.data_type:
validate_language_and_data_type(
Expand Down
59 changes: 59 additions & 0 deletions src/scribe_data/unicode/generate_emoji_keywords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Centralized keyword-emoji generation file to generate emoji keywords for a specified language.

.. raw:: html
<!--
* Copyright (C) 2024 Scribe
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
-->
"""

import os
from pathlib import Path

from scribe_data.unicode.process_unicode import gen_emoji_lexicon
from scribe_data.utils import export_formatted_data, get_language_iso

DATA_TYPE = "emoji-keywords"
EMOJI_KEYWORDS_DICT = 3


def generate_emoji(language, output_dir: str = None):
iso = get_language_iso(language=language)
path_to_cldr_annotations = (
Path(__file__).parent / "cldr-annotations-full" / "annotations"
)
if iso in os.listdir(path_to_cldr_annotations):
print(f"Emoji Generation for language {language} is supported")

else:
print(f"Emoji Generation for language {language} is not supported")
return

updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir
export_dir = Path(updated_path) / language.capitalize()
export_dir.mkdir(parents=True, exist_ok=True)

if emoji_keywords_dict := gen_emoji_lexicon(
language=language,
emojis_per_keyword=EMOJI_KEYWORDS_DICT,
):
export_formatted_data(
file_path=output_dir,
formatted_data=emoji_keywords_dict,
query_data_in_use=True,
language=language,
Copy link
Collaborator

@axif0 axif0 Oct 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should use `language.capitalize()` here so that the error `No such file or directory: 'scribe_data_json_export/english/emoji_keywords.json'` no longer occurs. The file for this data type should be written to '{user_given_directory}/English/emoji_keywords.json'.

Then we can call `scribe-data get -lang English -dt emoji_keywords -od ./output_data` and get the file in `output_data`.

data_type=DATA_TYPE,
)
7 changes: 5 additions & 2 deletions src/scribe_data/unicode/process_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ def gen_emoji_lexicon(
# Pre-set up the emoji popularity data.
popularity_dict = {}

with (Path(__file__).parent / "2021_ranked.tsv").open() as popularity_file:
with (Path(__file__).parent / "2021_ranked.tsv").open(
encoding="utf-8"
) as popularity_file:
tsv_reader = csv.DictReader(popularity_file, delimiter="\t")
for tsv_row in tsv_reader:
popularity_dict[tsv_row["Emoji"]] = int(tsv_row["Rank"])
Expand All @@ -93,6 +95,7 @@ def gen_emoji_lexicon(
/ f"{iso}"
/ "annotations.json"
)

annotations_derived_file_path = (
Path(__file__).parent
/ "cldr-annotations-derived-full"
Expand All @@ -107,7 +110,7 @@ def gen_emoji_lexicon(
}

for cldr_file_key, cldr_file_path in cldr_file_paths.items():
with open(cldr_file_path, "r") as file:
with open(cldr_file_path, "r", encoding="utf-8") as file:
cldr_data = json.load(file)

cldr_dict = cldr_data[cldr_file_key]["annotations"]
Expand Down
13 changes: 9 additions & 4 deletions tests/cli/test_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,15 @@
class TestGetData(unittest.TestCase):
# MARK: Subprocess Patching

@patch("subprocess.run")
def test_get_emoji_keywords(self, mock_subprocess_run):
get_data(language="English", data_type="emoji-keywords")
self.assertTrue(mock_subprocess_run.called)
@patch("scribe_data.cli.get.generate_emoji")
def test_get_emoji_keywords(self, generate_emoji):
get_data(
language="English", data_type="emoji_keywords", output_dir="./test_output"
)
generate_emoji.assert_called_once_with(
language="English",
output_dir="./test_output",
)

# MARK: Invalid Arguments

Expand Down
Loading