# Patch summary: add a Match::compatibleJamo output option.
#
# include/kiwi/PatternMatcher.h
#   * Adds the flag compatibleJamo = 1 << 24. Its (Korean) doc comment reads, in English:
#     "On output, convert Hangul conjoining (initial/medial/final) jamo to compatible jamo."
#   * The removed and added "#pragma once" lines look identical in this text.
#     NOTE(review): presumably the difference is an invisible character (e.g. a BOM) or a
#     line ending -- confirm against the commit.
#
# include/kiwi/Utils.h
#   * Adds isHangulOnset (0x1100 .. 0x1100 + 19), isHangulVowel (0x314F .. 0x3164),
#     joinOnsetVowel and extractVowel, and declares toCompatibleHangulConsonant.
#   * Respells the isHangulCoda upper bound from 0x11A7 + 28 to 0x11A8 + 27; both equal 0x11C3,
#     so the accepted range is unchanged.
#
# src/StrUtils.h
#   * Removes the character-literal versions of isHangulOnset, isHangulVowel and joinOnsetVowel
#     that the Utils.h hunk re-adds with numeric constants.
#
# src/Utils.cpp
#   * Defines toCompatibleHangulConsonant: a code unit accepted by isHangulOnset is looked up in a
#     19-entry table at index chr - 0x1100, one accepted by isHangulCoda in a 27-entry table at
#     index chr - 0x11A8; any other code unit is returned unchanged (vowels are not converted).
#     NOTE(review): the lookups stay inside the tables only if within() excludes its upper bound.
#     within() is not shown here; the existing 0xAC00 .. 0xD7A4 syllable check suggests it does --
#     confirm.
#
# src/Kiwi.cpp
#   * Adds toCompatibleJamo, which applies toCompatibleHangulConsonant to every element of a
#     u16string in place.
#   * insertPathIntoResults calls it on `joined` when matchOptions has Match::compatibleJamo set,
#     just before the token is emplaced into rarr.
#
# test/test_cpp.cpp
#   * Adds TEST(KiwiCpp, CompatibleJamo): checks token strings with Match::none and with
#     Match::compatibleJamo, plus a full onset/coda round trip.
#
# NOTE(review): this copy of the patch appears whitespace-collapsed (many diff lines per physical
# line) and some angle-bracket text seems to be missing (e.g. "#include #include", "vector& ret").
# It likely will not apply as-is -- regenerate it from the commit before use.
#
diff --git a/include/kiwi/PatternMatcher.h b/include/kiwi/PatternMatcher.h index c153fe89..af1ec4d8 100644 --- a/include/kiwi/PatternMatcher.h +++ b/include/kiwi/PatternMatcher.h @@ -1,4 +1,4 @@ -#pragma once +#pragma once #include #include @@ -24,6 +24,7 @@ namespace kiwi joinAdvSuffix = 1 << 21, /**< 부사파생접미사(XSM)를 분리하지 않고 합쳐서 매칭한다 */ splitComplex = 1 << 22, /**< 더 작은 단위로 분할될 수 있는 형태소는 더 분할하여 매칭한다 */ zCoda = 1 << 23, /**< 어미 및 조사에 덧붙은 받침이 있는 경우 이를 분리하여 z_coda 태그로 매칭한다 */ + compatibleJamo = 1 << 24, /**< 출력시 한글 첫가끝 자모를 호환가능한 자모로 변환한다. */ joinVSuffix = joinVerbSuffix | joinAdjSuffix, joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix, all = url | email | hashtag | mention | serial | emoji | zCoda, diff --git a/include/kiwi/Utils.h b/include/kiwi/Utils.h index 2cf5674d..b2a76e81 100644 --- a/include/kiwi/Utils.h +++ b/include/kiwi/Utils.h @@ -58,9 +58,29 @@ namespace kiwi return within(chr, 0xAC00, 0xD7A4); } + inline bool isHangulOnset(char16_t chr) + { + return within(chr, 0x1100, 0x1100 + 19); + } + inline bool isHangulCoda(char16_t chr) { - return within(chr, 0x11A8, 0x11A7 + 28); + return within(chr, 0x11A8, 0x11A8 + 27); + } + + inline bool isHangulVowel(char16_t chr) + { + return within(chr, 0x314F, 0x3164); + } + + inline char16_t joinOnsetVowel(size_t onset, size_t vowel) + { + return 0xAC00 + (char16_t)((onset * 21 + vowel) * 28); + } + + inline int extractVowel(char16_t chr) + { + return ((chr - 0xAC00) / 28) % 21; } inline bool isOldHangulOnset(char16_t chr) @@ -88,6 +108,8 @@ namespace kiwi return within(chr, 0x3131, 0x314E) || within(chr, 0x3165, 0x3186); } + char16_t toCompatibleHangulConsonant(char16_t chr); + struct ComparatorIgnoringSpace { static bool less(const KString& a, const KString& b, const kchar_t space = u' '); diff --git a/src/Kiwi.cpp b/src/Kiwi.cpp index d1462091..25bf54f5 100644 --- a/src/Kiwi.cpp +++ b/src/Kiwi.cpp @@ -616,6 +616,14 @@ namespace kiwi } } + inline void 
toCompatibleJamo(u16string& str) + { + for (auto& c : str) + { + c = toCompatibleHangulConsonant(c); + } + } + inline void insertPathIntoResults( vector& ret, Vector& spStatesByRet, @@ -726,6 +734,12 @@ namespace kiwi } joined = joinHangul(s.str.empty() ? *s.morph->kform : s.str); } while (0); + + if (!!(matchOptions & Match::compatibleJamo)) + { + toCompatibleJamo(joined); + } + rarr.emplace_back(joined, s.morph->tag); auto& token = rarr.back(); token.morph = within(s.morph, pretokenizedGroup.morphemes) ? nullptr : s.morph; diff --git a/src/StrUtils.h b/src/StrUtils.h index 1268c3f5..b8e87954 100644 --- a/src/StrUtils.h +++ b/src/StrUtils.h @@ -728,21 +728,6 @@ namespace kiwi } } - inline bool isHangulOnset(char16_t c) - { - return u'ᄀ' <= c && c <= u'ᄒ'; - } - - inline bool isHangulVowel(char16_t c) - { - return u'ㅏ' <= c && c <= u'ㅣ'; - } - - inline char16_t joinOnsetVowel(size_t onset, size_t vowel) - { - return u'가' + (char16_t)((onset * 21 + vowel) * 28); - } - inline bool isChineseChr(char32_t c) { return (0x4E00 <= c && c <= 0x9FFF) diff --git a/src/Utils.cpp b/src/Utils.cpp index 428a8096..606f5155 100644 --- a/src/Utils.cpp +++ b/src/Utils.cpp @@ -498,4 +498,16 @@ namespace kiwi return ret; } + char16_t toCompatibleHangulConsonant(char16_t chr) + { + if (isHangulOnset(chr)) + { + return u"ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"[chr - 0x1100]; + } + else if (isHangulCoda(chr)) + { + return u"ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ"[chr - 0x11A8]; + } + return chr; + } } diff --git a/test/test_cpp.cpp b/test/test_cpp.cpp index 90f19a8c..b78e3f79 100644 --- a/test/test_cpp.cpp +++ b/test/test_cpp.cpp @@ -1228,6 +1228,27 @@ TEST(KiwiCpp, JoinAffix) EXPECT_EQ(res5.first[5].str, u"배송되"); } +TEST(KiwiCpp, CompatibleJamo) +{ + Kiwi& kiwi = reuseKiwiInstance(); + auto res1 = kiwi.analyze(u"이긴다. 이김. 이길것.", Match::none).first; + EXPECT_EQ(res1.size(), 10); + EXPECT_EQ(res1[1].str, u"ᆫ다"); + EXPECT_EQ(res1[4].str, u"ᆷ"); + EXPECT_EQ(res1[7].str, u"ᆯ"); + + auto res2 = kiwi.analyze(u"이긴다. 이김. 
이길것.", Match::compatibleJamo).first; + EXPECT_EQ(res2.size(), 10); + EXPECT_EQ(res2[1].str, u"ㄴ다"); + EXPECT_EQ(res2[4].str, u"ㅁ"); + EXPECT_EQ(res2[7].str, u"ㄹ"); + + auto res3 = kiwi.analyze(u"ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑᄒ ᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ", Match::compatibleJamo).first; + EXPECT_EQ(res3.size(), 2); + EXPECT_EQ(res3[0].str, u"ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"); + EXPECT_EQ(res3[1].str, u"ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ"); +} + TEST(KiwiCpp, AutoJoiner) { Kiwi& kiwi = reuseKiwiInstance();