From b9d9b02c38c1375c3357d10e60113415c34cc6fd Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 19 May 2024 17:54:38 +0900 Subject: [PATCH 1/7] add `POSTag::w_emoji` & `Match::emoji` --- include/kiwi/PatternMatcher.h | 3 +- include/kiwi/ScriptType.h | 7 +- include/kiwi/Types.h | 4 +- include/kiwi/Utils.h | 4 +- src/Kiwi.cpp | 2 +- src/PatternMatcher.cpp | 75 +++++++ src/ScriptType.cpp | 362 +++++++++++++++++----------------- src/StrUtils.h | 1 + src/Utils.cpp | 4 +- 9 files changed, 272 insertions(+), 190 deletions(-) diff --git a/include/kiwi/PatternMatcher.h b/include/kiwi/PatternMatcher.h index 81a513f9..c153fe89 100644 --- a/include/kiwi/PatternMatcher.h +++ b/include/kiwi/PatternMatcher.h @@ -15,6 +15,7 @@ namespace kiwi hashtag = 1 << 2, /**< 해시태그 형태의 텍스트(#해시)를 w_hashtag 태그에 매칭한다 */ mention = 1 << 3, /**< 멘션 형태의 텍스트(@멘션)를 w_mention 태그에 매칭한다 */ serial = 1 << 4, /**< 일련 번호 형태의 텍스트를 w_serial 태그에 매칭한다 */ + emoji = 1 << 5, /**< 이모지 문자를 w_emoji 태그에 매칭한다 */ normalizeCoda = 1 << 16, /**< 초성체가 앞 어절의 받침에 따라붙은 경우를 정규화하여 매칭한다 */ joinNounPrefix = 1 << 17, /**< 체언접두사(XPN)를 분리하지 않고 합쳐서 매칭한다 */ joinNounSuffix = 1 << 18, /**< 명사파생접미사(XSN)를 분리하지 않고 합쳐서 매칭한다 */ @@ -25,7 +26,7 @@ namespace kiwi zCoda = 1 << 23, /**< 어미 및 조사에 덧붙은 받침이 있는 경우 이를 분리하여 z_coda 태그로 매칭한다 */ joinVSuffix = joinVerbSuffix | joinAdjSuffix, joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix, - all = url | email | hashtag | mention | serial | zCoda, + all = url | email | hashtag | mention | serial | emoji | zCoda, allWithNormalizing = all | normalizeCoda, }; diff --git a/include/kiwi/ScriptType.h b/include/kiwi/ScriptType.h index 5e5edf28..e6fdbcc7 100644 --- a/include/kiwi/ScriptType.h +++ b/include/kiwi/ScriptType.h @@ -241,5 +241,10 @@ namespace kiwi const char* getScriptName(ScriptType type); - bool isEmoji(char32_t c0, char32_t c1 = 0); + /** + * @brief Check if the character is an emoji + * + * @return 0 if the character is not an emoji, 1 if c0 is an emoji, 2 if c0 and c1 are combined to form an emoji. + */ + size_t isEmoji(char32_t c0, char32_t c1 = 0); } diff --git a/include/kiwi/Types.h b/include/kiwi/Types.h index b3ab7412..14e2f4c1 100644 --- a/include/kiwi/Types.h +++ b/include/kiwi/Types.h @@ -1,4 +1,4 @@ -/** +/** * @file Types.h * @author bab2min (bab2min@gmail.com) * @brief Kiwi C++ API에 쓰이는 주요 타입들을 모아놓은 헤더 파일 @@ -202,7 +202,7 @@ namespace kiwi vcp, vcn, sf, sp, ss, sso, ssc, se, so, sw, sb, sl, sh, sn, - w_url, w_email, w_mention, w_hashtag, w_serial, + w_url, w_email, w_mention, w_hashtag, w_serial, w_emoji, jks, jkc, jkg, jko, jkb, jkv, jkq, jx, jc, ep, ef, ec, etn, etm, z_coda, diff --git a/include/kiwi/Utils.h b/include/kiwi/Utils.h index 4186a9a8..870d8a9e 100644 --- a/include/kiwi/Utils.h +++ b/include/kiwi/Utils.h @@ -1,4 +1,4 @@ -#pragma once +#pragma once #include #include #include @@ -30,7 +30,7 @@ namespace kiwi inline bool isWebTag(POSTag t) { - return POSTag::w_url <= t && t <= POSTag::w_hashtag; + return POSTag::w_url <= t && t <= POSTag::w_emoji; } POSTag toPOSTag(const std::u16string& tagStr); diff --git a/src/Kiwi.cpp b/src/Kiwi.cpp index 6a8428bb..cc0e3649 100644 --- a/src/Kiwi.cpp +++ b/src/Kiwi.cpp @@ -600,7 +600,7 @@ namespace kiwi inline void updateTokenInfoScript(TokenInfo& info) { - if (!(info.tag == POSTag::sl || info.tag == POSTag::sh || info.tag == POSTag::sw)) return; + if (!(info.tag == POSTag::sl || info.tag == POSTag::sh || info.tag == POSTag::sw || info.tag == POSTag::w_emoji)) return; if ((info.morph && info.morph->kform && !info.morph->kform->empty())) return; if (info.str.empty()) return; char32_t c = info.str[0]; diff --git a/src/PatternMatcher.cpp b/src/PatternMatcher.cpp index 6f59d9fd..ac1e5b39 100644 --- a/src/PatternMatcher.cpp +++ b/src/PatternMatcher.cpp @@ -1,6 +1,8 @@ #include #include +#include #include "pattern.hpp" +#include "StrUtils.h" using namespace std; using namespace kiwi; @@ -26,6 +28,7 @@ namespace kiwi size_t testNumeric(const char16_t left, const char16_t* first, const char16_t* last) const; size_t testSerial(const char16_t* first, const char16_t* last) const; size_t testAbbr(const char16_t* first, const char16_t* last) const; + size_t testEmoji(const char16_t* first, const char16_t* last) const; public: std::pair match(char16_t left, const char16_t* first, const char16_t* last, Match matchOptions) const; @@ -290,6 +293,77 @@ size_t PatternMatcherImpl::testAbbr(const char16_t* first, const char16_t* last) return b - first; } +size_t PatternMatcherImpl::testEmoji(const char16_t* first, const char16_t* last) const +{ + const char16_t* b = first; + while (b + 1 < last) + { + char32_t c0 = 0, c1 = 0; + const char16_t* b1 = b; + if (isHighSurrogate(*b1)) + { + c0 = mergeSurrogate(b1[0], b1[1]); + b1 += 2; + } + else + { + c0 = *b1++; + } + + const char16_t* b2 = b1; + if (b2 < last) + { + if (isHighSurrogate(*b2) && b2 + 1 < last) + { + c1 = mergeSurrogate(b2[0], b2[1]); + b2 += 2; + } + else + { + c1 = *b2++; + } + } + + auto r = isEmoji(c0, c1); + if (r == 1) + { + b = b1; + } + else if (r == 2) + { + b = b2; + } + else + { + break; + } + + if (b == last) return b - first; + if (0xfe00 <= *b && *b <= 0xfe0f) // variation selectors + { + ++b; + if (b == last) return b - first; + } + else if (b + 1 < last && isHighSurrogate(b[0])) + { + c1 = mergeSurrogate(b[0], b[1]); + if (0x1f3fb <= c1 && c1 <= 0x1f3ff) // skin color modifier + { + b += 2; + if (b == last) return b - first; + } + } + + if (*b == 0x200d) // zero width joiner + { + ++b; + continue; + } + break; + } + return b - first; +} + pair PatternMatcherImpl::match(char16_t left, const char16_t * first, const char16_t * last, Match matchOptions) const { size_t size; @@ -299,6 +373,7 @@ pair PatternMatcherImpl::match(char16_t left, const char16_t * f if (!!(matchOptions & Match::email) && (size = testEmail(first, last))) return make_pair(size, POSTag::w_email); if (!!(matchOptions & Match::mention) && (size = testMention(first, last))) return make_pair(size, POSTag::w_mention); if (!!(matchOptions & Match::url) && (size = testUrl(first, last))) return make_pair(size, POSTag::w_url); + if (!!(matchOptions & Match::emoji) && (size = testEmoji(first, last))) return make_pair(size, POSTag::w_emoji); if ((size = testAbbr(first, last))) return make_pair(size, POSTag::sl); return make_pair(0, POSTag::unknown); } diff --git a/src/ScriptType.cpp b/src/ScriptType.cpp index 4ed906be..b1e6db5d 100644 --- a/src/ScriptType.cpp +++ b/src/ScriptType.cpp @@ -567,188 +567,188 @@ namespace kiwi return "unknown"; } - bool isEmoji(char32_t c0, char32_t c1) + size_t isEmoji(char32_t c0, char32_t c1) { - if (0x1f004 == c0) return true; - if (0x1f0cf == c0) return true; - if (0x1f18e == c0) return true; - if (0x1f191 <= c0 && c0 <= 0x1f19a) return true; - if (0x1f201 == c0) return true; - if (0x1f21a == c0) return true; - if (0x1f22f == c0) return true; - if (0x1f232 <= c0 && c0 <= 0x1f236) return true; - if (0x1f238 <= c0 && c0 <= 0x1f23a) return true; - if (0x1f250 <= c0 && c0 <= 0x1f251) return true; - if (0x1f300 <= c0 && c0 <= 0x1f320) return true; - if (0x1f32d <= c0 && c0 <= 0x1f335) return true; - if (0x1f337 <= c0 && c0 <= 0x1f37c) return true; - if (0x1f37e <= c0 && c0 <= 0x1f393) return true; - if (0x1f3a0 <= c0 && c0 <= 0x1f3ca) return true; - if (0x1f3cf <= c0 && c0 <= 0x1f3d3) return true; - if (0x1f3e0 <= c0 && c0 <= 0x1f3f0) return true; - if (0x1f3f4 == c0) return true; - if (0x1f3f8 <= c0 && c0 <= 0x1f43e) return true; - if (0x1f440 == c0) return true; - if (0x1f442 <= c0 && c0 <= 0x1f4fc) return true; - if (0x1f4ff <= c0 && c0 <= 0x1f53d) return true; - if (0x1f54b <= c0 && c0 <= 0x1f54e) return true; - if (0x1f550 <= c0 && c0 <= 0x1f567) return true; - if (0x1f57a == c0) return true; - if (0x1f595 <= c0 && c0 <= 0x1f596) return true; - if (0x1f5a4 == c0) return true; - if (0x1f5fb <= c0 && c0 <= 0x1f64f) return true; - if (0x1f680 <= c0 && c0 <= 0x1f6c5) return true; - if (0x1f6cc == c0) return true; - if (0x1f6d0 <= c0 && c0 <= 0x1f6d2) return true; - if (0x1f6d5 <= c0 && c0 <= 0x1f6d7) return true; - if (0x1f6dc <= c0 && c0 <= 0x1f6df) return true; - if (0x1f6eb <= c0 && c0 <= 0x1f6ec) return true; - if (0x1f6f4 <= c0 && c0 <= 0x1f6fc) return true; - if (0x1f7e0 <= c0 && c0 <= 0x1f7eb) return true; - if (0x1f7f0 == c0) return true; - if (0x1f90c <= c0 && c0 <= 0x1f93a) return true; - if (0x1f93c <= c0 && c0 <= 0x1f945) return true; - if (0x1f947 <= c0 && c0 <= 0x1f9ff) return true; - if (0x1fa70 <= c0 && c0 <= 0x1fa7c) return true; - if (0x1fa80 <= c0 && c0 <= 0x1fa88) return true; - if (0x1fa90 <= c0 && c0 <= 0x1fabd) return true; - if (0x1fabf <= c0 && c0 <= 0x1fac5) return true; - if (0x1face <= c0 && c0 <= 0x1fadb) return true; - if (0x1fae0 <= c0 && c0 <= 0x1fae8) return true; - if (0x1faf0 <= c0 && c0 <= 0x1faf8) return true; + if (0x1f004 == c0) return 1; + if (0x1f0cf == c0) return 1; + if (0x1f18e == c0) return 1; + if (0x1f191 <= c0 && c0 <= 0x1f19a) return 1; + if (0x1f201 == c0) return 1; + if (0x1f21a == c0) return 1; + if (0x1f22f == c0) return 1; + if (0x1f232 <= c0 && c0 <= 0x1f236) return 1; + if (0x1f238 <= c0 && c0 <= 0x1f23a) return 1; + if (0x1f250 <= c0 && c0 <= 0x1f251) return 1; + if (0x1f300 <= c0 && c0 <= 0x1f320) return 1; + if (0x1f32d <= c0 && c0 <= 0x1f335) return 1; + if (0x1f337 <= c0 && c0 <= 0x1f37c) return 1; + if (0x1f37e <= c0 && c0 <= 0x1f393) return 1; + if (0x1f3a0 <= c0 && c0 <= 0x1f3ca) return 1; + if (0x1f3cf <= c0 && c0 <= 0x1f3d3) return 1; + if (0x1f3e0 <= c0 && c0 <= 0x1f3f0) return 1; + if (0x1f3f4 == c0) return 1; + if (0x1f3f8 <= c0 && c0 <= 0x1f43e) return 1; + if (0x1f440 == c0) return 1; + if (0x1f442 <= c0 && c0 <= 0x1f4fc) return 1; + if (0x1f4ff <= c0 && c0 <= 0x1f53d) return 1; + if (0x1f54b <= c0 && c0 <= 0x1f54e) return 1; + if (0x1f550 <= c0 && c0 <= 0x1f567) return 1; + if (0x1f57a == c0) return 1; + if (0x1f595 <= c0 && c0 <= 0x1f596) return 1; + if (0x1f5a4 == c0) return 1; + if (0x1f5fb <= c0 && c0 <= 0x1f64f) return 1; + if (0x1f680 <= c0 && c0 <= 0x1f6c5) return 1; + if (0x1f6cc == c0) return 1; + if (0x1f6d0 <= c0 && c0 <= 0x1f6d2) return 1; + if (0x1f6d5 <= c0 && c0 <= 0x1f6d7) return 1; + if (0x1f6dc <= c0 && c0 <= 0x1f6df) return 1; + if (0x1f6eb <= c0 && c0 <= 0x1f6ec) return 1; + if (0x1f6f4 <= c0 && c0 <= 0x1f6fc) return 1; + if (0x1f7e0 <= c0 && c0 <= 0x1f7eb) return 1; + if (0x1f7f0 == c0) return 1; + if (0x1f90c <= c0 && c0 <= 0x1f93a) return 1; + if (0x1f93c <= c0 && c0 <= 0x1f945) return 1; + if (0x1f947 <= c0 && c0 <= 0x1f9ff) return 1; + if (0x1fa70 <= c0 && c0 <= 0x1fa7c) return 1; + if (0x1fa80 <= c0 && c0 <= 0x1fa88) return 1; + if (0x1fa90 <= c0 && c0 <= 0x1fabd) return 1; + if (0x1fabf <= c0 && c0 <= 0x1fac5) return 1; + if (0x1face <= c0 && c0 <= 0x1fadb) return 1; + if (0x1fae0 <= c0 && c0 <= 0x1fae8) return 1; + if (0x1faf0 <= c0 && c0 <= 0x1faf8) return 1; - if (c1 != 0xfe0f) return false; - if (0xa9 == c0) return true; - if (0xae == c0) return true; - if (0x203c == c0) return true; - if (0x2049 == c0) return true; - if (0x2122 == c0) return true; - if (0x2139 == c0) return true; - if (0x2194 <= c0 && c0 <= 0x2199) return true; - if (0x21a9 <= c0 && c0 <= 0x21aa) return true; - if (0x231a <= c0 && c0 <= 0x231b) return true; - if (0x2328 == c0) return true; - if (0x23cf == c0) return true; - if (0x23e9 <= c0 && c0 <= 0x23f3) return true; - if (0x23f8 <= c0 && c0 <= 0x23fa) return true; - if (0x24c2 == c0) return true; - if (0x25aa <= c0 && c0 <= 0x25ab) return true; - if (0x25b6 == c0) return true; - if (0x25c0 == c0) return true; - if (0x25fb <= c0 && c0 <= 0x25fe) return true; - if (0x2600 <= c0 && c0 <= 0x2604) return true; - if (0x260e == c0) return true; - if (0x2611 == c0) return true; - if (0x2614 <= c0 && c0 <= 0x2615) return true; - if (0x2618 == c0) return true; - if (0x261d == c0) return true; - if (0x2620 == c0) return true; - if (0x2622 <= c0 && c0 <= 0x2623) return true; - if (0x2626 == c0) return true; - if (0x262a == c0) return true; - if (0x262e <= c0 && c0 <= 0x262f) return true; - if (0x2638 <= c0 && c0 <= 0x263a) return true; - if (0x2640 == c0) return true; - if (0x2642 == c0) return true; - if (0x2648 <= c0 && c0 <= 0x2653) return true; - if (0x265f <= c0 && c0 <= 0x2660) return true; - if (0x2663 == c0) return true; - if (0x2665 <= c0 && c0 <= 0x2666) return true; - if (0x2668 == c0) return true; - if (0x267b == c0) return true; - if (0x267e <= c0 && c0 <= 0x267f) return true; - if (0x2692 <= c0 && c0 <= 0x2697) return true; - if (0x2699 == c0) return true; - if (0x269b <= c0 && c0 <= 0x269c) return true; - if (0x26a0 <= c0 && c0 <= 0x26a1) return true; - if (0x26a7 == c0) return true; - if (0x26aa <= c0 && c0 <= 0x26ab) return true; - if (0x26b0 <= c0 && c0 <= 0x26b1) return true; - if (0x26bd <= c0 && c0 <= 0x26be) return true; - if (0x26c4 <= c0 && c0 <= 0x26c5) return true; - if (0x26c8 == c0) return true; - if (0x26ce <= c0 && c0 <= 0x26cf) return true; - if (0x26d1 == c0) return true; - if (0x26d3 <= c0 && c0 <= 0x26d4) return true; - if (0x26e9 <= c0 && c0 <= 0x26ea) return true; - if (0x26f0 <= c0 && c0 <= 0x26f5) return true; - if (0x26f7 <= c0 && c0 <= 0x26fa) return true; - if (0x26fd == c0) return true; - if (0x2702 == c0) return true; - if (0x2705 == c0) return true; - if (0x2708 <= c0 && c0 <= 0x270d) return true; - if (0x270f == c0) return true; - if (0x2712 == c0) return true; - if (0x2714 == c0) return true; - if (0x2716 == c0) return true; - if (0x271d == c0) return true; - if (0x2721 == c0) return true; - if (0x2728 == c0) return true; - if (0x2733 <= c0 && c0 <= 0x2734) return true; - if (0x2744 == c0) return true; - if (0x2747 == c0) return true; - if (0x274c == c0) return true; - if (0x274e == c0) return true; - if (0x2753 <= c0 && c0 <= 0x2755) return true; - if (0x2757 == c0) return true; - if (0x2763 <= c0 && c0 <= 0x2764) return true; - if (0x2795 <= c0 && c0 <= 0x2797) return true; - if (0x27a1 == c0) return true; - if (0x27b0 == c0) return true; - if (0x27bf == c0) return true; - if (0x2934 <= c0 && c0 <= 0x2935) return true; - if (0x2b05 <= c0 && c0 <= 0x2b07) return true; - if (0x2b1b <= c0 && c0 <= 0x2b1c) return true; - if (0x2b50 == c0) return true; - if (0x2b55 == c0) return true; - if (0x3030 == c0) return true; - if (0x303d == c0) return true; - if (0x3297 == c0) return true; - if (0x3299 == c0) return true; - if (0x1f170 <= c0 && c0 <= 0x1f171) return true; - if (0x1f17e <= c0 && c0 <= 0x1f17f) return true; - if (0x1f202 == c0) return true; - if (0x1f237 == c0) return true; - if (0x1f321 == c0) return true; - if (0x1f324 <= c0 && c0 <= 0x1f32c) return true; - if (0x1f336 == c0) return true; - if (0x1f37d == c0) return true; - if (0x1f396 <= c0 && c0 <= 0x1f397) return true; - if (0x1f399 <= c0 && c0 <= 0x1f39b) return true; - if (0x1f39e <= c0 && c0 <= 0x1f39f) return true; - if (0x1f3cb <= c0 && c0 <= 0x1f3ce) return true; - if (0x1f3d4 <= c0 && c0 <= 0x1f3df) return true; - if (0x1f3f3 == c0) return true; - if (0x1f3f5 == c0) return true; - if (0x1f3f7 == c0) return true; - if (0x1f43f == c0) return true; - if (0x1f441 == c0) return true; - if (0x1f4fd == c0) return true; - if (0x1f549 <= c0 && c0 <= 0x1f54a) return true; - if (0x1f56f <= c0 && c0 <= 0x1f570) return true; - if (0x1f573 <= c0 && c0 <= 0x1f579) return true; - if (0x1f587 == c0) return true; - if (0x1f58a <= c0 && c0 <= 0x1f58d) return true; - if (0x1f590 == c0) return true; - if (0x1f5a5 == c0) return true; - if (0x1f5a8 == c0) return true; - if (0x1f5b1 <= c0 && c0 <= 0x1f5b2) return true; - if (0x1f5bc == c0) return true; - if (0x1f5c2 <= c0 && c0 <= 0x1f5c4) return true; - if (0x1f5d1 <= c0 && c0 <= 0x1f5d3) return true; - if (0x1f5dc <= c0 && c0 <= 0x1f5de) return true; - if (0x1f5e1 == c0) return true; - if (0x1f5e3 == c0) return true; - if (0x1f5e8 == c0) return true; - if (0x1f5ef == c0) return true; - if (0x1f5f3 == c0) return true; - if (0x1f5fa == c0) return true; - if (0x1f6cb == c0) return true; - if (0x1f6cd <= c0 && c0 <= 0x1f6cf) return true; - if (0x1f6e0 <= c0 && c0 <= 0x1f6e5) return true; - if (0x1f6e9 == c0) return true; - if (0x1f6f0 == c0) return true; - if (0x1f6f3 == c0) return true; - return false; + if (!(c1 == 0xfe0f || (0x1f3fb <= c1 && c1 <= 0x1f3ff))) return 0; + if (0xa9 == c0) return 2; + if (0xae == c0) return 2; + if (0x203c == c0) return 2; + if (0x2049 == c0) return 2; + if (0x2122 == c0) return 2; + if (0x2139 == c0) return 2; + if (0x2194 <= c0 && c0 <= 0x2199) return 2; + if (0x21a9 <= c0 && c0 <= 0x21aa) return 2; + if (0x231a <= c0 && c0 <= 0x231b) return 2; + if (0x2328 == c0) return 2; + if (0x23cf == c0) return 2; + if (0x23e9 <= c0 && c0 <= 0x23f3) return 2; + if (0x23f8 <= c0 && c0 <= 0x23fa) return 2; + if (0x24c2 == c0) return 2; + if (0x25aa <= c0 && c0 <= 0x25ab) return 2; + if (0x25b6 == c0) return 2; + if (0x25c0 == c0) return 2; + if (0x25fb <= c0 && c0 <= 0x25fe) return 2; + if (0x2600 <= c0 && c0 <= 0x2604) return 2; + if (0x260e == c0) return 2; + if (0x2611 == c0) return 2; + if (0x2614 <= c0 && c0 <= 0x2615) return 2; + if (0x2618 == c0) return 2; + if (0x261d == c0) return 2; + if (0x2620 == c0) return 2; + if (0x2622 <= c0 && c0 <= 0x2623) return 2; + if (0x2626 == c0) return 2; + if (0x262a == c0) return 2; + if (0x262e <= c0 && c0 <= 0x262f) return 2; + if (0x2638 <= c0 && c0 <= 0x263a) return 2; + if (0x2640 == c0) return 2; + if (0x2642 == c0) return 2; + if (0x2648 <= c0 && c0 <= 0x2653) return 2; + if (0x265f <= c0 && c0 <= 0x2660) return 2; + if (0x2663 == c0) return 2; + if (0x2665 <= c0 && c0 <= 0x2666) return 2; + if (0x2668 == c0) return 2; + if (0x267b == c0) return 2; + if (0x267e <= c0 && c0 <= 0x267f) return 2; + if (0x2692 <= c0 && c0 <= 0x2697) return 2; + if (0x2699 == c0) return 2; + if (0x269b <= c0 && c0 <= 0x269c) return 2; + if (0x26a0 <= c0 && c0 <= 0x26a1) return 2; + if (0x26a7 == c0) return 2; + if (0x26aa <= c0 && c0 <= 0x26ab) return 2; + if (0x26b0 <= c0 && c0 <= 0x26b1) return 2; + if (0x26bd <= c0 && c0 <= 0x26be) return 2; + if (0x26c4 <= c0 && c0 <= 0x26c5) return 2; + if (0x26c8 == c0) return 2; + if (0x26ce <= c0 && c0 <= 0x26cf) return 2; + if (0x26d1 == c0) return 2; + if (0x26d3 <= c0 && c0 <= 0x26d4) return 2; + if (0x26e9 <= c0 && c0 <= 0x26ea) return 2; + if (0x26f0 <= c0 && c0 <= 0x26f5) return 2; + if (0x26f7 <= c0 && c0 <= 0x26fa) return 2; + if (0x26fd == c0) return 2; + if (0x2702 == c0) return 2; + if (0x2705 == c0) return 2; + if (0x2708 <= c0 && c0 <= 0x270d) return 2; + if (0x270f == c0) return 2; + if (0x2712 == c0) return 2; + if (0x2714 == c0) return 2; + if (0x2716 == c0) return 2; + if (0x271d == c0) return 2; + if (0x2721 == c0) return 2; + if (0x2728 == c0) return 2; + if (0x2733 <= c0 && c0 <= 0x2734) return 2; + if (0x2744 == c0) return 2; + if (0x2747 == c0) return 2; + if (0x274c == c0) return 2; + if (0x274e == c0) return 2; + if (0x2753 <= c0 && c0 <= 0x2755) return 2; + if (0x2757 == c0) return 2; + if (0x2763 <= c0 && c0 <= 0x2764) return 2; + if (0x2795 <= c0 && c0 <= 0x2797) return 2; + if (0x27a1 == c0) return 2; + if (0x27b0 == c0) return 2; + if (0x27bf == c0) return 2; + if (0x2934 <= c0 && c0 <= 0x2935) return 2; + if (0x2b05 <= c0 && c0 <= 0x2b07) return 2; + if (0x2b1b <= c0 && c0 <= 0x2b1c) return 2; + if (0x2b50 == c0) return 2; + if (0x2b55 == c0) return 2; + if (0x3030 == c0) return 2; + if (0x303d == c0) return 2; + if (0x3297 == c0) return 2; + if (0x3299 == c0) return 2; + if (0x1f170 <= c0 && c0 <= 0x1f171) return 2; + if (0x1f17e <= c0 && c0 <= 0x1f17f) return 2; + if (0x1f202 == c0) return 2; + if (0x1f237 == c0) return 2; + if (0x1f321 == c0) return 2; + if (0x1f324 <= c0 && c0 <= 0x1f32c) return 2; + if (0x1f336 == c0) return 2; + if (0x1f37d == c0) return 2; + if (0x1f396 <= c0 && c0 <= 0x1f397) return 2; + if (0x1f399 <= c0 && c0 <= 0x1f39b) return 2; + if (0x1f39e <= c0 && c0 <= 0x1f39f) return 2; + if (0x1f3cb <= c0 && c0 <= 0x1f3ce) return 2; + if (0x1f3d4 <= c0 && c0 <= 0x1f3df) return 2; + if (0x1f3f3 == c0) return 2; + if (0x1f3f5 == c0) return 2; + if (0x1f3f7 == c0) return 2; + if (0x1f43f == c0) return 2; + if (0x1f441 == c0) return 2; + if (0x1f4fd == c0) return 2; + if (0x1f549 <= c0 && c0 <= 0x1f54a) return 2; + if (0x1f56f <= c0 && c0 <= 0x1f570) return 2; + if (0x1f573 <= c0 && c0 <= 0x1f579) return 2; + if (0x1f587 == c0) return 2; + if (0x1f58a <= c0 && c0 <= 0x1f58d) return 2; + if (0x1f590 == c0) return 2; + if (0x1f5a5 == c0) return 2; + if (0x1f5a8 == c0) return 2; + if (0x1f5b1 <= c0 && c0 <= 0x1f5b2) return 2; + if (0x1f5bc == c0) return 2; + if (0x1f5c2 <= c0 && c0 <= 0x1f5c4) return 2; + if (0x1f5d1 <= c0 && c0 <= 0x1f5d3) return 2; + if (0x1f5dc <= c0 && c0 <= 0x1f5de) return 2; + if (0x1f5e1 == c0) return 2; + if (0x1f5e3 == c0) return 2; + if (0x1f5e8 == c0) return 2; + if (0x1f5ef == c0) return 2; + if (0x1f5f3 == c0) return 2; + if (0x1f5fa == c0) return 2; + if (0x1f6cb == c0) return 2; + if (0x1f6cd <= c0 && c0 <= 0x1f6cf) return 2; + if (0x1f6e0 <= c0 && c0 <= 0x1f6e5) return 2; + if (0x1f6e9 == c0) return 2; + if (0x1f6f0 == c0) return 2; + if (0x1f6f3 == c0) return 2; + return 0; } } diff --git a/src/StrUtils.h b/src/StrUtils.h index 4b1bab8b..b6a7b2ac 100644 --- a/src/StrUtils.h +++ b/src/StrUtils.h @@ -663,6 +663,7 @@ namespace kiwi if (tagStr == u"W_HASHTAG") return POSTag::w_hashtag; if (tagStr == u"W_MENTION") return POSTag::w_mention; if (tagStr == u"W_SERIAL") return POSTag::w_serial; + if (tagStr == u"W_EMOJI") return POSTag::w_emoji; if (tagStr == u"USER0") return POSTag::user0; if (tagStr == u"USER1") return POSTag::user1; diff --git a/src/Utils.cpp b/src/Utils.cpp index 8112747f..428a8096 100644 --- a/src/Utils.cpp +++ b/src/Utils.cpp @@ -311,7 +311,7 @@ namespace kiwi "VCP", "VCN", "SF", "SP", "SS", "SSO", "SSC", "SE", "SO", "SW", "SB", "SL", "SH", "SN", - "W_URL", "W_EMAIL", "W_MENTION", "W_HASHTAG", "W_SERIAL", + "W_URL", "W_EMAIL", "W_MENTION", "W_HASHTAG", "W_SERIAL", "W_EMOJI", "JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC", "EP", "EF", "EC", "ETN", "ETM", "Z_CODA", @@ -353,7 +353,7 @@ namespace kiwi u"VCP", u"VCN", u"SF", u"SP", u"SS", u"SSO", u"SSC", u"SE", u"SO", u"SW", u"SB", u"SL", u"SH", u"SN", - u"W_URL", u"W_EMAIL", u"W_MENTION", u"W_HASHTAG", u"W_SERIAL", + u"W_URL", u"W_EMAIL", u"W_MENTION", u"W_HASHTAG", u"W_SERIAL", u"W_EMOJI", u"JKS", u"JKC", u"JKG", u"JKO", u"JKB", u"JKV", u"JKQ", u"JX", u"JC", u"EP", u"EF", u"EC", u"ETN", u"ETM", u"Z_CODA", From 3d5a78c69317f7aedd65032f527a1a5c488a410e Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 19 May 2024 17:55:12 +0900 Subject: [PATCH 2/7] update test case for `emoji` --- test/test_cpp.cpp | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/test/test_cpp.cpp b/test/test_cpp.cpp index 9e008adb..86977fe1 100644 --- a/test/test_cpp.cpp +++ b/test/test_cpp.cpp @@ -189,19 +189,23 @@ TEST(KiwiCpp, Script) EXPECT_EQ(res[3].script, ScriptType::kana); res = kiwi.analyze(u"👍🏻👍🏿 👨‍👩‍👦 ℹ️ ✍🏼", Match::allWithNormalizing).first; - EXPECT_EQ(res.size(), 4); - EXPECT_EQ(res[0].tag, POSTag::sw); + EXPECT_EQ(res.size(), 5); + EXPECT_EQ(res[0].tag, POSTag::w_emoji); EXPECT_EQ(res[0].script, ScriptType::symbols_and_pictographs); EXPECT_EQ(res[0].position, 0); - EXPECT_EQ(res[0].length, 8); - EXPECT_EQ(res[1].tag, POSTag::sw); + EXPECT_EQ(res[0].length, 4); + EXPECT_EQ(res[1].tag, POSTag::w_emoji); EXPECT_EQ(res[1].script, ScriptType::symbols_and_pictographs); - EXPECT_EQ(res[1].position, 9); - EXPECT_EQ(res[1].length, 8); - EXPECT_EQ(res[2].tag, POSTag::sw); - EXPECT_EQ(res[2].script, ScriptType::letterlike_symbols); - EXPECT_EQ(res[3].tag, POSTag::sw); - EXPECT_EQ(res[3].script, ScriptType::dingbats); + EXPECT_EQ(res[1].position, 4); + EXPECT_EQ(res[1].length, 4); + EXPECT_EQ(res[2].tag, POSTag::w_emoji); + EXPECT_EQ(res[2].script, ScriptType::symbols_and_pictographs); + EXPECT_EQ(res[2].position, 9); + EXPECT_EQ(res[2].length, 8); + EXPECT_EQ(res[3].tag, POSTag::w_emoji); + EXPECT_EQ(res[3].script, ScriptType::letterlike_symbols); + EXPECT_EQ(res[4].tag, POSTag::w_emoji); + EXPECT_EQ(res[4].script, ScriptType::dingbats); } TEST(KiwiCpp, EmptyToken) From 391ce629327d8065cee167b800c84d583ccc49ee Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 19 May 2024 17:55:28 +0900 Subject: [PATCH 3/7] update model files --- ModelGenerator/sj.knlm | 4 ++-- ModelGenerator/sj.morph | 4 ++-- ModelGenerator/skipbigram.mdl | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ModelGenerator/sj.knlm b/ModelGenerator/sj.knlm index 976375d7..dc38ba25 100644 --- a/ModelGenerator/sj.knlm +++ b/ModelGenerator/sj.knlm @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe03f0a1fdd25186113e8ec192df4369975fdf174426d46411bc5ff5c4d6caf2 -size 35867382 +oid sha256:e36c3d16e1f305169f977840e7257b80de153e613ac5809fd7e25ee59cda4f6e +size 35861136 diff --git a/ModelGenerator/sj.morph b/ModelGenerator/sj.morph index c32a6848..c4ac80a1 100644 --- a/ModelGenerator/sj.morph +++ b/ModelGenerator/sj.morph @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31aea323cf870f614fb07681aa01c76d83d1fa27e325d626d4995b4e064d5b05 -size 3581056 +oid sha256:76f155f03b402b866af32ed5e759560fac72b6174982ac5835f9d56c3a997e34 +size 3581100 diff --git a/ModelGenerator/skipbigram.mdl b/ModelGenerator/skipbigram.mdl index b4603597..41c69d39 100644 --- a/ModelGenerator/skipbigram.mdl +++ b/ModelGenerator/skipbigram.mdl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:152de02fa696817564fad589460d8874ce37ef3e912442c0ed37fb2955471288 -size 3186824 +oid sha256:edce9fbe938b9f21eeb915c0695c4c160708ed5879a6ce2e4dd20117ff7f9ca5 +size 3186748 From 0b1f8783e81100dbfe42b484dfe4e736c3bff957 Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 19 May 2024 17:56:41 +0900 Subject: [PATCH 4/7] Fix typo in Types.h file --- include/kiwi/Types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/kiwi/Types.h b/include/kiwi/Types.h index 14e2f4c1..2c19675f 100644 --- a/include/kiwi/Types.h +++ b/include/kiwi/Types.h @@ -1,4 +1,4 @@ -/** +/** * @file Types.h * @author bab2min (bab2min@gmail.com) * @brief Kiwi C++ API에 쓰이는 주요 타입들을 모아놓은 헤더 파일 From bb63bb348d81b37b4d755dec6f2f796d90d343b4 Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 19 May 2024 18:03:31 +0900 Subject: [PATCH 5/7] fix compilation errors --- include/kiwi/ScriptType.h | 2 +- src/ScriptType.cpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/kiwi/ScriptType.h b/include/kiwi/ScriptType.h index e6fdbcc7..cf47a932 100644 --- a/include/kiwi/ScriptType.h +++ b/include/kiwi/ScriptType.h @@ -246,5 +246,5 @@ namespace kiwi * * @return 0 if the character is not an emoji, 1 if c0 is an emoji, 2 if c0 and c1 are combined to form an emoji. */ - size_t isEmoji(char32_t c0, char32_t c1 = 0); + int isEmoji(char32_t c0, char32_t c1 = 0); } diff --git a/src/ScriptType.cpp b/src/ScriptType.cpp index b1e6db5d..7c23ca8a 100644 --- a/src/ScriptType.cpp +++ b/src/ScriptType.cpp @@ -1,4 +1,3 @@ -#pragma once #include namespace kiwi @@ -567,7 +566,7 @@ namespace kiwi return "unknown"; } - size_t isEmoji(char32_t c0, char32_t c1) + int isEmoji(char32_t c0, char32_t c1) { if (0x1f004 == c0) return 1; if (0x1f0cf == c0) return 1; From 84406ed1d44d0d1668e19686d0cb19e91fda572a Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 19 May 2024 18:12:58 +0900 Subject: [PATCH 6/7] Update test case for `emoji` --- test/test_cpp.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/test_cpp.cpp b/test/test_cpp.cpp index 86977fe1..c3d85940 100644 --- a/test/test_cpp.cpp +++ b/test/test_cpp.cpp @@ -163,10 +163,17 @@ TEST(KiwiCpp, OldHangul) TEST(KiwiCpp, ChineseVsEmoji) { Kiwi& kiwi = reuseKiwiInstance(); - auto res = kiwi.analyze(u"韓𠀀𠀁𠀂𠀃🔥🤔🐶", Match::allWithNormalizing).first; + auto res = kiwi.analyze(u"韓𠀀𠀁𠀂𠀃🔥🤔🐶", Match::allWithNormalizing & ~Match::emoji).first; EXPECT_EQ(res.size(), 2); EXPECT_EQ(res[0].tag, POSTag::sh); EXPECT_EQ(res[1].tag, POSTag::sw); + + res = kiwi.analyze(u"韓𠀀𠀁𠀂𠀃🔥🤔🐶", Match::allWithNormalizing).first; + EXPECT_EQ(res.size(), 4); + EXPECT_EQ(res[0].tag, POSTag::sh); + EXPECT_EQ(res[1].tag, POSTag::w_emoji); + EXPECT_EQ(res[2].tag, POSTag::w_emoji); + EXPECT_EQ(res[3].tag, POSTag::w_emoji); } TEST(KiwiCpp, Script) From 8ce3c2193f078220c18c470196db104c5a335ba0 Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 19 May 2024 18:25:21 +0900 Subject: [PATCH 7/7] add emoji tag to KiwiJava --- bindings/java/kr/pe/bab2min/Kiwi.java | 16 +++++++++------- bindings/java/kr/pe/bab2min/KiwiBuilder.java | 3 ++- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/bindings/java/kr/pe/bab2min/Kiwi.java b/bindings/java/kr/pe/bab2min/Kiwi.java index 5aa1b54c..0f343779 100644 --- a/bindings/java/kr/pe/bab2min/Kiwi.java +++ b/bindings/java/kr/pe/bab2min/Kiwi.java @@ -21,6 +21,7 @@ public static class Match { hashtag = 1 << 2, mention = 1 << 3, serial = 1 << 4, + emoji = 1 << 5, normalizeCoda = 1 << 16, joinNounPrefix = 1 << 17, joinNounSuffix = 1 << 18, @@ -48,13 +49,13 @@ public static class POSTag { vcp = 19, vcn = 20, sf = 21, sp = 22, ss = 23, sso = 24, ssc = 25, se = 26, so = 27, sw = 28, sb = 29, sl = 30, sh = 31, sn = 32, - w_url = 33, w_email = 34, w_mention = 35, w_hashtag = 36, w_serial = 37, - jks = 38, jkc = 39, jkg = 40, jko = 41, jkb = 42, jkv = 43, jkq = 44, jx = 45, jc = 46, - ep = 47, ef = 48, ec = 49, etn = 50, etm = 51, - z_coda = 52, - user0 = 53, user1 = 54, user2 = 55, user3 = 56, user4 = 57, - p = 58, - max = 59, + w_url = 33, w_email = 34, w_mention = 35, w_hashtag = 36, w_serial = 37, w_emoji = 38, + jks = 39, jkc = 40, jkg = 41, jko = 42, jkb = 43, jkv = 44, jkq = 45, jx = 46, jc = 47, + ep = 48, ef = 49, ec = 50, etn = 51, etm = 52, + z_coda = 53, + user0 = 54, user1 = 55, user2 = 56, user3 = 57, user4 = 58, + p = 59, + max = 60, pv = p, pa = (byte)(p + 1), irregular = - 128, @@ -106,6 +107,7 @@ static String toString(byte tag) { case w_mention: return "W_MENTION"; case w_hashtag: return "W_HASHTAG"; case w_serial: return "W_SERIAL"; + case w_emoji: return "W_EMOJI"; case jks: return "JKS"; case jkc: return "JKC"; case jkg: return "JKG"; diff --git a/bindings/java/kr/pe/bab2min/KiwiBuilder.java b/bindings/java/kr/pe/bab2min/KiwiBuilder.java index 29d1c4e0..a440c602 100644 --- a/bindings/java/kr/pe/bab2min/KiwiBuilder.java +++ b/bindings/java/kr/pe/bab2min/KiwiBuilder.java @@ -8,7 +8,8 @@ public static class BuildOption { integrateAllomorph = 1 << 0, loadDefaultDict = 1 << 1, loadTypoDict = 1 << 2, - default_ = integrateAllomorph | loadDefaultDict | loadTypoDict; + loadMultiDict = 1 << 3, + default_ = integrateAllomorph | loadDefaultDict | loadTypoDict | loadMultiDict; } public static class AnalyzedMorph {