From 40d53050e0ad6783040c72985c79d2d2cd67d264 Mon Sep 17 00:00:00 2001 From: bab2min Date: Wed, 20 Nov 2024 22:28:11 +0900 Subject: [PATCH 1/3] Fix duplicated candidates --- src/KTrie.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/KTrie.cpp b/src/KTrie.cpp index 99bd7762..f02e1965 100644 --- a/src/KTrie.cpp +++ b/src/KTrie.cpp @@ -818,15 +818,6 @@ size_t kiwi::splitByTrie( if (curNode->fail()) { curNode = curNode->fail(); - for (auto submatcher = curNode; submatcher; submatcher = submatcher->fail()) - { - const Form* cand = submatcher->val(trie); - if (!cand) break; - else if (!trie.hasSubmatch(cand)) - { - if (!insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces)) break; - } - } nextNode = curNode->template nextOpt(trie, str[n + i]); } else From 5f2e935bbee2a1ef6dc881693250b44a284f7561 Mon Sep 17 00:00:00 2001 From: bab2min Date: Wed, 20 Nov 2024 22:28:37 +0900 Subject: [PATCH 2/3] Add test case for bab2min/kiwipiepy#189 --- test/test_cpp.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/test_cpp.cpp b/test/test_cpp.cpp index 30414ab2..12c3ef46 100644 --- a/test/test_cpp.cpp +++ b/test/test_cpp.cpp @@ -1609,3 +1609,16 @@ TEST(KiwiCpp, IssueP172_LengthError) auto res = kiwi.analyze(text, Match::allWithNormalizing).first; EXPECT_GT(res.size(), 0); } + +TEST(KiwiCpp, IssueP189) +{ + Kiwi& kiwi = reuseKiwiInstance(); + auto res = kiwi.analyze(u"담아 1팩 무료", Match::allWithNormalizing).first; + + EXPECT_EQ(res.size(), 5); + EXPECT_EQ(res[0].str, u"담"); + EXPECT_EQ(res[1].str, u"어"); + EXPECT_EQ(res[2].str, u"1"); + EXPECT_EQ(res[3].str, u"팩"); + EXPECT_EQ(res[4].str, u"무료"); +} From 8a938f1d1f8eda50a77e32ae1c0fc56019a1385e Mon Sep 17 00:00:00 2001 From: bab2min Date: Wed, 20 Nov 2024 22:29:13 +0900 Subject: [PATCH 3/3] Add more test cases for Z_SIOT --- test/test_cpp.cpp | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/test/test_cpp.cpp b/test/test_cpp.cpp index 12c3ef46..09567ba1 100644 --- a/test/test_cpp.cpp +++ b/test/test_cpp.cpp @@ -988,6 +988,12 @@ TEST(KiwiCpp, ZCoda) TEST(KiwiCpp, ZSiot) { Kiwi& kiwi = reuseKiwiInstance(); + + auto resSplit = kiwi.analyze(u"찰랑찰랑한 머릿결과 볼륨감", Match::allWithNormalizing | Match::splitSaisiot); + EXPECT_EQ(resSplit.first.size(), 8); + EXPECT_EQ(resSplit.first[3].str, u"머리"); + EXPECT_EQ(resSplit.first[4].tag, POSTag::z_siot); + EXPECT_EQ(resSplit.first[5].str, u"결"); for (auto s : {u"하굣길", u"만둣국", u"나뭇잎", u"세숫물", u"고춧가루", u"시곗바늘", u"사글셋방"}) { @@ -1014,6 +1020,35 @@ TEST(KiwiCpp, ZSiot) } } +TEST(KiwiCpp, ZSiotWithTypo) +{ + Kiwi kiwi = KiwiBuilder{ MODEL_PATH, 0, BuildOption::default_, }.build(getDefaultTypoSet(DefaultTypoSet::basicTypoSetWithContinual)); + + for (auto s : { u"하굣길", u"만둣국", u"나뭇잎", u"세숫물", u"고춧가루", u"시곗바늘", u"사글셋방" }) + { + auto resNone = kiwi.analyze(s, Match::allWithNormalizing); + auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot); + auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot); + EXPECT_FALSE(std::any_of(resNone.first.begin(), resNone.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; })); + EXPECT_EQ(resSplit.first.size(), 3); + EXPECT_EQ(resSplit.first[0].tag, POSTag::nng); + EXPECT_EQ(resSplit.first[1].tag, POSTag::z_siot); + EXPECT_EQ(resSplit.first[2].tag, POSTag::nng); + EXPECT_EQ(resMerge.first.size(), 1); + EXPECT_EQ(resMerge.first[0].tag, POSTag::nng); + } + + for (auto s : { u"발렛 파킹", u"미닛" }) + { + auto resNone = kiwi.analyze(s, Match::allWithNormalizing); + auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot); + auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot); + EXPECT_EQ(resNone.second, resSplit.second); + EXPECT_EQ(resNone.second, resMerge.second); + EXPECT_FALSE(std::any_of(resSplit.first.begin(), resSplit.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; })); + } +} + TEST(KiwiCpp, AnalyzeWithWordPosition) { std::u16string testSentence = u"나 정말 배불렄ㅋㅋ";