From 0720f40786f5a2e041866ac07510bb8837641cc6 Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 22 Sep 2024 03:30:51 +0900 Subject: [PATCH 01/12] Improve beam search for accurate SB tag matching --- src/Kiwi.cpp | 21 +++++++++++++++++++-- src/PathEvaluator.hpp | 16 +++++++++++++--- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/src/Kiwi.cpp b/src/Kiwi.cpp index 7e5aed9e..5a155e18 100644 --- a/src/Kiwi.cpp +++ b/src/Kiwi.cpp @@ -775,8 +775,25 @@ namespace kiwi spStateCnt[r.curState]++; validTarget++; } - ret.erase(ret.begin() + validTarget, ret.end()); - spStatesByRet.erase(spStatesByRet.begin() + validTarget, spStatesByRet.end()); + Vector idx(validTarget); + iota(idx.begin(), idx.end(), 0); + sort(idx.begin(), idx.end(), [&](size_t a, size_t b) { return ret[a].second > ret[b].second; }); + + Vector sortedRet; + Vector sortedSpStatesByRet; + const size_t maxCands = min(topN * 2, validTarget); + for (size_t i = 0; i < maxCands; ++i) + { + sortedRet.emplace_back(move(ret[idx[i]])); + sortedSpStatesByRet.emplace_back(spStatesByRet[idx[i]]); + } + ret.clear(); + spStatesByRet.clear(); + for (size_t i = 0; i < maxCands; ++i) + { + ret.emplace_back(move(sortedRet[i])); + spStatesByRet.emplace_back(sortedSpStatesByRet[i]); + } } inline void makePretokenizedSpanGroup( diff --git a/src/PathEvaluator.hpp b/src/PathEvaluator.hpp index 1c0e9993..8b4f21e2 100644 --- a/src/PathEvaluator.hpp +++ b/src/PathEvaluator.hpp @@ -212,7 +212,7 @@ namespace kiwi bool operator==(const PathHash& o) const { - return lmState == o.lmState && spState == o.spState; + return lmState == o.lmState && rootId == o.rootId && spState == o.spState; } }; @@ -1057,6 +1057,17 @@ namespace kiwi utils::ContainerSearcher> csearcher{ cache }; Vector ret; + size_t numUniqRootIdAndSpState; + { + UnorderedSet> uniqRootIdAndSpState; + for (auto& c : cand) + { + uniqRootIdAndSpState.emplace(c.rootId, (uint8_t)c.spState); + } + numUniqRootIdAndSpState = uniqRootIdAndSpState.size(); + } + + const size_t numCandsPerRootIdAndSpState = (size_t)std::ceil(topN * 2 / (double)numUniqRootIdAndSpState); size_t startIdx = 0; pair prevRootIdAndSpState; if (!cand.empty()) prevRootIdAndSpState = make_pair(cand[0].rootId, (uint8_t)cand[0].spState); @@ -1069,7 +1080,7 @@ namespace kiwi prevRootIdAndSpState = curRootIdAndSpState; } - if (i - startIdx < topN) + if (i - startIdx < numCandsPerRootIdAndSpState) { auto tokens = generateTokenList( &cand[i], csearcher, graph, ownFormList, kw->typoCostWeight, @@ -1082,7 +1093,6 @@ namespace kiwi { return a.score > b.score; }); - if (ret.size() > topN * 2) ret.erase(ret.begin() + topN * 2, ret.end()); return ret; } } From e9c5750331d5b39601b04eff649c71cf68d06d4b Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 22 Sep 2024 03:31:11 +0900 Subject: [PATCH 02/12] fix debug codes --- src/PathEvaluator.hpp | 51 ++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/src/PathEvaluator.hpp b/src/PathEvaluator.hpp index 8b4f21e2..2fd17815 100644 --- a/src/PathEvaluator.hpp +++ b/src/PathEvaluator.hpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -278,6 +278,20 @@ namespace kiwi } }; + template + inline std::ostream& printDebugPath(std::ostream& os, const WordLL& src) + { + if (src.parent) + { + printDebugPath(os, *src.parent); + } + + if (src.morpheme) src.morpheme->print(os); + else os << "NULL"; + os << " , "; + return os; + } + inline bool hasLeftBoundary(const KGraphNode* node) { // 시작 지점은 항상 왼쪽 경계로 처리 @@ -963,6 +977,16 @@ namespace kiwi } } +#ifdef DEBUG_PRINT + cerr << "Token[" << 0 << "]" << endl; + for (auto& tt : cache[0]) + { + cerr << "(" << tt.accScore << "):\t"; + printDebugPath(cerr, tt); + cerr << endl; + } +#endif + // middle nodes for (size_t i = 1; i < graphSize - 1; ++i) { @@ -993,17 +1017,13 @@ namespace kiwi } #ifdef DEBUG_PRINT - cout << "== " << i << " ==" << endl; + cerr << "Token[" << i << "]" << endl; for (auto& tt : cache[i]) { - cout << tt.accScore << '\t'; - for (auto& m : tt.morphs) - { - kw->morphemes[m.wid].print(cout) << '\t'; - } - cout << endl; + cerr << "(" << tt.accScore << "):\t"; + printDebugPath(cerr, tt); + cerr << endl; } - cout << "========" << endl; #endif } @@ -1041,18 +1061,13 @@ namespace kiwi ); #ifdef DEBUG_PRINT - cout << "== LAST ==" << endl; + cerr << "Token[last]" << endl; for (auto& tt : cache.back()) { - cout << tt.accScore << '\t'; - for (auto& m : tt.morphs) - { - kw->morphemes[m.wid].print(cout) << '\t'; - } - cout << endl; + cerr << "(" << tt.accScore << "):\t"; + printDebugPath(cerr, tt); + cerr << endl; } - cout << "========" << endl; - #endif utils::ContainerSearcher> csearcher{ cache }; From c0d940ffdc6681216bb78de8f6817be09fffb8fe Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 22 Sep 2024 03:32:08 +0900 Subject: [PATCH 03/12] Implement `NgramExtractor` --- include/kiwi/SubstringExtractor.h | 45 ++++ src/SubstringExtractor.cpp | 381 ++++++++++++++++++++++++++++++ 2 files changed, 426 insertions(+) diff --git a/include/kiwi/SubstringExtractor.h b/include/kiwi/SubstringExtractor.h index ec71a48e..67115867 100644 --- a/include/kiwi/SubstringExtractor.h +++ b/include/kiwi/SubstringExtractor.h @@ -62,4 +62,49 @@ namespace kiwi size_t cluster(size_t i) const; float score(size_t i) const; }; + + class Kiwi; + + class NgramExtractor + { + const Kiwi* kiwi = nullptr; + bool gatherLmScore = true; + UnorderedMap morph2id; + Vector id2morph; + Vector buf; + Vector scores; + Vector docBoundaries; + Vector positions; + Vector rawDocs; + + size_t addTokens(const std::vector& tokens); + + public: + struct Candidate + { + std::u16string text; + std::vector tokens; + std::vector tokenScores; + size_t cnt = 0; + size_t df = 0; + float score = 0; + float npmi = 0; + float leftBranch = 0; + float rightBranch = 0; + float lmScore = 0; + }; + + NgramExtractor(); + NgramExtractor(const Kiwi& kiwi, bool gatherLmScore = true); + NgramExtractor(const NgramExtractor&); + NgramExtractor(NgramExtractor&&) noexcept; + NgramExtractor& operator=(const NgramExtractor&); + NgramExtractor& operator=(NgramExtractor&&) noexcept; + ~NgramExtractor(); + + size_t addText(const std::u16string& text); + size_t addTexts(const U16Reader& reader); + + std::vector extract(size_t maxCandidates = 1000, size_t minCnt = 10, size_t maxLength = 5, float minScore = 1e-3, size_t numWorkers = 1) const; + }; } diff --git a/src/SubstringExtractor.cpp b/src/SubstringExtractor.cpp index cc41541b..1ac6d5e4 100644 --- a/src/SubstringExtractor.cpp +++ b/src/SubstringExtractor.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -420,4 +421,384 @@ namespace kiwi if (i >= clusterSize || clusterScores[i].first == (uint32_t)-1) return 0; return clusterScores[i].second; } + + NgramExtractor::NgramExtractor(const Kiwi& kiwi, bool _gatherLmScore) + : kiwi(&kiwi), id2morph(2), buf(1), scores(1), positions(1), gatherLmScore{ _gatherLmScore } + { + docBoundaries.emplace_back(1); + } + + NgramExtractor::NgramExtractor() = default; + NgramExtractor::NgramExtractor(const NgramExtractor&) = default; + NgramExtractor::NgramExtractor(NgramExtractor&&) noexcept = default; + NgramExtractor& NgramExtractor::operator=(const NgramExtractor&) = default; + NgramExtractor& NgramExtractor::operator=(NgramExtractor&&) noexcept = default; + NgramExtractor::~NgramExtractor() = default; + + inline u16string tokenToStr(const TokenInfo& token) + { + char tag; + if (clearIrregular(token.tag) == POSTag::xsv || clearIrregular(token.tag) == POSTag::xsa) + { + tag = 'V'; + } + else if (token.tag == POSTag::xsn || token.tag == POSTag::xpn || token.tag == POSTag::xr) + { + tag = 'N'; + } + else if (token.tag == POSTag::sn || token.tag == POSTag::sl || token.tag == POSTag::sh) + { + tag = 'F'; + } + else + { + tag = tagToString(token.tag)[0]; + } + u16string form; + form.reserve(1 + token.str.size()); + form.push_back(tag); + form += token.str; + return form; + } + + size_t NgramExtractor::addTokens(const std::vector& tokens) + { + for (auto& t : tokens) + { + auto str = tokenToStr(t); + auto inserted = morph2id.emplace(move(str), id2morph.size()); + auto id = inserted.first->second; + if (inserted.second) + { + id2morph.push_back(inserted.first->first); + } + int16_t score = (int16_t)max(min((int)round(t.score * 1024), 32767), -32768); + if (id < 0x4000) + { + buf.emplace_back(id); + if (gatherLmScore) scores.emplace_back(score); + positions.emplace_back(t.position); + } + else if (id < 0x10000000) + { + buf.emplace_back((id & 0x3FFF) | 0x4000); + buf.emplace_back((id >> 14) | 0x8000); + if (gatherLmScore) + { + scores.emplace_back(score); + scores.emplace_back(score); + } + positions.emplace_back(t.position); + positions.emplace_back(t.position); + } + } + buf.emplace_back(1); + if (gatherLmScore) + { + scores.emplace_back(0); + } + positions.emplace_back(tokens.empty() ? 0 : (tokens.back().position + tokens.back().length)); + docBoundaries.emplace_back(buf.size()); + return tokens.size(); + } + + size_t NgramExtractor::addText(const u16string& text) + { + auto res = kiwi->analyze(text, 1, Match::zCoda | Match::splitComplex); + rawDocs.emplace_back(text); + return addTokens(res[0].first); + } + + size_t NgramExtractor::addTexts(const U16Reader& reader) + { + size_t ret = 0; + kiwi->analyze(1, [&]() + { + auto str = reader(); + rawDocs.emplace_back(str); + return str; + }, [&](const std::vector& res) + { + ret += addTokens(res[0].first); + }, Match::zCoda | Match::splitComplex); + return ret; + } + + inline double computeBranchingEntropy(double total, double invalid, const Vector& branches) + { + double ret = 0; + for (auto& cnt : branches) + { + if (cnt <= 0) + { + continue; + } + + double p = cnt / total; + ret -= p * log(p); + if (invalid > 0) + { + p = cnt / (total - invalid) * (invalid / total); + ret -= p * log(p); + } + } + return ret; + } + + std::vector NgramExtractor::extract(size_t maxCandidates, size_t minCnt, size_t maxLength, float minScore, size_t numWorkers) const + { + unique_ptr threadPool; + unique_ptr mtx; + if (numWorkers > 1) + { + threadPool = make_unique(numWorkers); + mtx = make_unique(); + } + + Vector sa(buf.size()); + sais::sais((const char16_t*)buf.data(), sa.data(), buf.size(), 0, 0, nullptr, 0, nullptr, threadPool.get()); + + Vector revBuf; + revBuf.reserve(buf.size()); + revBuf.emplace_back(0); + revBuf.insert(revBuf.end(), buf.rbegin(), buf.rend()); + sais::FmIndex fi{ (const char16_t*)buf.data(), buf.size(), threadPool.get() }; + sais::FmIndex revFi{ (const char16_t*)revBuf.data(), revBuf.size(), threadPool.get() }; + Vector ngrams; + Vector unigramCnts(id2morph.size()); + + fi.enumSuffices(0, [&](const sais::FmIndex::SuffixTy& s, const sais::FmIndex::TraceTy& t) + { + auto u32size = s.size(); + for (size_t i = 0; i < s.size(); ++i) + { + if (s[i] & 0x8000) + { + u32size--; + } + } + + const auto suffixCnt = t.back().second - t.back().first; + if (u32size == 1) + { + if (s.size() == 1 && s[0] < 0x4000) + { + unigramCnts[s[0]] = suffixCnt; + } + else if (s.size() == 2 && (s[1] & 0x4000) && (s[0] & 0x8000)) + { + const auto merged = (s[1] & 0x3FFF) | ((s[0] & 0x3FFF) << 14); + unigramCnts[merged] = suffixCnt; + } + return true; + } + + if (u32size > maxLength) return false; + + if (find(s.begin(), s.end(), 0) != s.end() || find(s.begin(), s.end(), 1) != s.end()) + { + return false; + } + + if (s.front() & 0x4000) return false; + if (s.back() & 0x8000) return true; + + if (suffixCnt < minCnt) + { + return false; + } + + u16string reversed{ s.rbegin(), s.rend() }; + mp::OptionalLockGuard lock{ mtx.get() }; + Candidate cand; + cand.text = move(reversed); + cand.cnt = suffixCnt; + ngrams.emplace_back(move(cand)); + return true; + }, threadPool.get()); + + const double allTokenCnt = (double)accumulate(unigramCnts.begin(), unigramCnts.end(), (size_t)0); + + mp::runParallel(threadPool.get(), [&](const size_t start, const size_t numWorkers, mp::Barrier*) + { + for (size_t i = start; i < ngrams.size(); i += numWorkers) + { + auto& cand = ngrams[i]; + const double total = cand.cnt; + double invalidLeftCnts = 0, invalidRightCnts = 0; + thread_local Vector validLeftTokens, validRightTokens; + validLeftTokens.clear(); + validRightTokens.clear(); + fi.enumSufficesOfString(0, cand.text.rbegin(), cand.text.rend(), [&](const sais::FmIndex::SuffixTy& s, const sais::FmIndex::TraceTy& t) + { + if (s.size() != 1) return false; + const auto cnt = (double)(t.back().second - t.back().first); + if (s[0] > 1) + { + validLeftTokens.push_back(cnt); + } + else + { + invalidLeftCnts += cnt; + } + return false; + }); + + revFi.enumSufficesOfString(0, cand.text.begin(), cand.text.end(), [&](const sais::FmIndex::SuffixTy& s, const sais::FmIndex::TraceTy& t) + { + if (s.size() != 1) return false; + const auto cnt = (double)(t.back().second - t.back().first); + if (s[0] > 1) + { + validRightTokens.push_back(cnt); + } + else + { + invalidRightCnts += cnt; + } + return false; + }); + + cand.leftBranch = computeBranchingEntropy(total, invalidLeftCnts, validLeftTokens); + cand.rightBranch = computeBranchingEntropy(total, invalidRightCnts, validRightTokens); + + thread_local Vector restoredIds; + restoredIds.clear(); + for (auto rit = cand.text.begin(); rit != cand.text.end(); ++rit) + { + if ((*rit & 0x4000)) + { + const auto merged = (rit[0] & 0x3FFF) | ((rit[1] & 0x3FFF) << 14); + restoredIds.push_back(merged); + ++rit; + } + else if ((*rit & 0x8000)) + { + throw runtime_error("Invalid token"); + } + else + { + restoredIds.push_back(*rit); + } + } + cand.tokens.resize(restoredIds.size()); + for (size_t i = 0; i < restoredIds.size(); ++i) + { + cand.tokens[i] = id2morph[restoredIds[i]]; + } + + double pmi = log(cand.cnt / allTokenCnt); + for (auto id : restoredIds) + { + pmi -= log(unigramCnts[id] / allTokenCnt); + } + cand.npmi = pmi / log(allTokenCnt / cand.cnt) / (restoredIds.size() - 1); + const double maxBE = log((double)cand.cnt); + cand.score = cand.npmi * min(sqrt(cand.leftBranch * cand.rightBranch) / maxBE, 1.0); + + thread_local Vector trace; + if (!gatherLmScore) + { + const auto r = fi.findRange(cand.text.rbegin(), cand.text.rend()); + const size_t u16size = cand.text.size(); + cand.text.resize(12); + auto* ptr = reinterpret_cast(&cand.text[0]); + ptr[0] = r.first; + ptr[1] = r.second; + ptr[2] = u16size; + } + else if (fi.findTrace(trace, cand.text.rbegin(), cand.text.rend())) + { + cand.tokenScores.resize(cand.tokens.size()); + int totalAccum = 0; + size_t i = 0, t = cand.tokens.size(); + for (auto it = cand.text.rbegin(); it != cand.text.rend(); ++it, ++i) + { + if ((*it & 0x8000)) + { + continue; + } + + int tokenAccum = 0; + for (size_t j = i * cand.cnt; j < (i + 1) * cand.cnt; ++j) + { + totalAccum += scores[sa[trace[j]]]; + tokenAccum += scores[sa[trace[j]]]; + } + cand.tokenScores[--t] = (tokenAccum / 1024.f) / cand.cnt; + } + cand.lmScore = (totalAccum / 1024.f) / cand.cnt / cand.tokens.size(); + const size_t b = trace.rbegin()[cand.cnt - 1]; + const size_t e = trace.rbegin()[0] + 1; + const size_t u16size = cand.text.size(); + cand.text.resize(12); + auto* ptr = reinterpret_cast(&cand.text[0]); + ptr[0] = b; + ptr[1] = e; + ptr[2] = u16size; + } + else + { + cand.text.clear(); + } + } + }); + + fi = {}; + revFi = {}; + + sort(ngrams.begin(), ngrams.end(), [&](const Candidate& a, const Candidate& b) + { + return a.score > b.score; + }); + + const size_t numCandsGreaterThanMinScore = lower_bound(ngrams.begin(), ngrams.end(), minScore, [](const Candidate& a, float b) + { + return a.score > b; + }) - ngrams.begin(); + + maxCandidates = min(maxCandidates, numCandsGreaterThanMinScore); + + mp::runParallel(threadPool.get(), [&](const size_t start, const size_t numWorkers, mp::Barrier*) + { + const size_t end = min(maxCandidates, ngrams.size()); + for (size_t i = start; i < end; i += numWorkers) + { + auto& cand = ngrams[i]; + if (cand.text.empty()) continue; + auto* ptr = reinterpret_cast(&cand.text[0]); + const size_t b = ptr[0]; + const size_t e = ptr[1]; + const size_t u16size = ptr[2]; + thread_local UnorderedMap formCnt; + formCnt.clear(); + thread_local UnorderedSet docIds; + docIds.clear(); + for (size_t j = b; j < e; ++j) + { + const auto origIdx = sa[j]; + const size_t docId = upper_bound(docBoundaries.begin(), docBoundaries.end(), origIdx) - docBoundaries.begin() - 1; + docIds.emplace(docId); + const auto& text = rawDocs[docId]; + const size_t tokenStart = positions[origIdx]; + const size_t tokenEnd = positions[origIdx + u16size]; + auto form = text.substr(tokenStart, tokenEnd - tokenStart); + while (!form.empty() && isSpace(form.back())) + { + form.pop_back(); + } + formCnt[form]++; + } + cand.df = docIds.size(); + + auto it = max_element(formCnt.begin(), formCnt.end(), [](const pair& a, const pair& b) + { + return a.second < b.second; + }); + cand.text = move(it->first); + } + }); + + return { make_move_iterator(ngrams.begin()), make_move_iterator(ngrams.begin() + min(maxCandidates, ngrams.size())) }; + } } From 07ca549d00358d9488959d2502e3b260f8293134 Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 22 Sep 2024 21:43:53 +0900 Subject: [PATCH 04/12] Optimize SB tag beam search --- src/PathEvaluator.hpp | 125 ++++++++++++++++++++++++++++-------------- src/Utils.cpp | 2 +- 2 files changed, 86 insertions(+), 41 deletions(-) diff --git a/src/PathEvaluator.hpp b/src/PathEvaluator.hpp index 2fd17815..ae98faf6 100644 --- a/src/PathEvaluator.hpp +++ b/src/PathEvaluator.hpp @@ -125,6 +125,7 @@ namespace kiwi size_t ownFormId, CandTy&& cands, bool unknownForm, + const Vector& prevSpStates, bool splitComplex = false, const std::unordered_set* blocklist = nullptr ); @@ -145,7 +146,8 @@ namespace kiwi const KGraphNode* startNode, const size_t topN, const float ignoreCondScore, - const float nodeLevelDiscount + const float nodeLevelDiscount, + const Vector& prevSpStates ); }; @@ -194,6 +196,8 @@ namespace kiwi } }; + static constexpr uint8_t commonRootId = -1; + template struct PathHash { @@ -447,6 +451,12 @@ namespace kiwi } }; + inline bool isQuote(Kiwi::SpecialMorph m) + { + return m == Kiwi::SpecialMorph::singleQuoteOpen || m == Kiwi::SpecialMorph::singleQuoteClose + || m == Kiwi::SpecialMorph::doubleQuoteOpen || m == Kiwi::SpecialMorph::doubleQuoteClose; + } + template void PathEvaluator::evalSingleMorpheme( Vector>& resultOut, @@ -463,13 +473,15 @@ namespace kiwi const KGraphNode* startNode, const size_t topN, const float ignoreCondScore, - const float nodeLevelDiscount + const float nodeLevelDiscount, + const Vector& prevSpStates ) { thread_local UnorderedMap, WordLL> bestPathes; // pair: [index, size] thread_local UnorderedMap, pair> bestPathIndex; thread_local Vector> bestPathValues; + thread_local Vector rootIds; if (top1) { bestPathes.clear(); @@ -568,10 +580,26 @@ namespace kiwi } } + if ((ruleBasedScorer.curMorphSbType || isQuote(ruleBasedScorer.curMorphSpecialType)) && prevPath.rootId == commonRootId) + { + rootIds.resize(prevSpStates.size()); + iota(rootIds.begin(), rootIds.end(), 0); + } + else + { + rootIds.resize(1); + rootIds[0] = commonRootId; + } + + for (auto rootId : rootIds) { const auto* prevMorpheme = &morphBase[prevPath.wid]; auto spState = prevPath.spState; - candScore += ruleBasedScorer(prevMorpheme, spState); + if (rootId != commonRootId) + { + spState = prevSpStates[rootId]; + } + const float candScoreWithRule = candScore + ruleBasedScorer(prevMorpheme, spState); // update special state if (ruleBasedScorer.curMorphSpecialType == Kiwi::SpecialMorph::singleQuoteOpen) spState.singleQuote = 1; @@ -586,12 +614,13 @@ namespace kiwi PathHash ph{ cLmState, prevPath.rootId, spState }; if (top1) { - WordLL newPath{ curMorph, candScore, prevPath.accTypoCost + node->typoCost, &prevPath, move(cLmState), spState }; + WordLL newPath{ curMorph, candScoreWithRule, prevPath.accTypoCost + node->typoCost, &prevPath, move(cLmState), spState }; + if (rootId != commonRootId) newPath.rootId = rootId; auto inserted = bestPathes.emplace(ph, newPath); if (!inserted.second) { auto& target = inserted.first->second; - if (candScore > target.accScore) + if (candScoreWithRule > target.accScore) { target = newPath; } @@ -602,7 +631,8 @@ namespace kiwi auto inserted = bestPathIndex.emplace(ph, make_pair((uint32_t)bestPathValues.size(), 1)); if (inserted.second) { - bestPathValues.emplace_back(curMorph, candScore, prevPath.accTypoCost + node->typoCost, &prevPath, move(cLmState), spState); + bestPathValues.emplace_back(curMorph, candScoreWithRule, prevPath.accTypoCost + node->typoCost, &prevPath, move(cLmState), spState); + if (rootId != commonRootId) bestPathValues.back().rootId = rootId; bestPathValues.resize(bestPathValues.size() + topN - 1); } else @@ -611,16 +641,18 @@ namespace kiwi auto bestPathLast = bestPathValues.begin() + inserted.first->second.first + inserted.first->second.second; if (distance(bestPathFirst, bestPathLast) < topN) { - *bestPathLast = WordLL{ curMorph, candScore, prevPath.accTypoCost + node->typoCost, &prevPath, move(cLmState), spState }; + *bestPathLast = WordLL{ curMorph, candScoreWithRule, prevPath.accTypoCost + node->typoCost, &prevPath, move(cLmState), spState }; + if (rootId != commonRootId) bestPathLast->rootId = rootId; push_heap(bestPathFirst, bestPathLast + 1, WordLLGreater{}); ++inserted.first->second.second; } else { - if (candScore > bestPathFirst->accScore) + if (candScoreWithRule > bestPathFirst->accScore) { pop_heap(bestPathFirst, bestPathLast, WordLLGreater{}); - *(bestPathLast - 1) = WordLL{ curMorph, candScore, prevPath.accTypoCost + node->typoCost, &prevPath, move(cLmState), spState }; + *(bestPathLast - 1) = WordLL{ curMorph, candScoreWithRule, prevPath.accTypoCost + node->typoCost, &prevPath, move(cLmState), spState }; + if (rootId != commonRootId) (*(bestPathLast - 1)).rootId = rootId; push_heap(bestPathFirst, bestPathLast, WordLLGreater{}); } } @@ -691,6 +723,7 @@ namespace kiwi size_t ownFormId, CandTy&& cands, bool unknownForm, + const Vector& prevSpStates, bool splitComplex, const std::unordered_set* blocklist ) @@ -793,11 +826,15 @@ namespace kiwi if (topN == 1) { - evalSingleMorpheme(nCache, kw, ownFormList, cache, seq, oseq, chSize, combSocket, ownFormId, curMorph, node, startNode, topN, ignoreCond ? -10 : 0, nodeLevelDiscount); + evalSingleMorpheme(nCache, kw, ownFormList, cache, + seq, oseq, chSize, combSocket, ownFormId, curMorph, + node, startNode, topN, ignoreCond ? -10 : 0, nodeLevelDiscount, prevSpStates); } else { - evalSingleMorpheme(nCache, kw, ownFormList, cache, seq, oseq, chSize, combSocket, ownFormId, curMorph, node, startNode, topN, ignoreCond ? -10 : 0, nodeLevelDiscount); + evalSingleMorpheme(nCache, kw, ownFormList, cache, + seq, oseq, chSize, combSocket, ownFormId, curMorph, + node, startNode, topN, ignoreCond ? -10 : 0, nodeLevelDiscount, prevSpStates); } } @@ -806,14 +843,15 @@ namespace kiwi thread_local Vector maxScores; maxScores.clear(); - maxScores.resize(cache[0].size() * topN, -INFINITY); + maxScores.resize((1 + prevSpStates.size()) * topN, -INFINITY); if (topN == 1) { for (auto& c : nCache) { if (c.morpheme->combineSocket) continue; - maxScores[c.rootId] = max(maxScores[c.rootId], c.accScore); + const auto rootId = c.rootId == commonRootId ? 0 : c.rootId + 1; + maxScores[rootId] = max(maxScores[rootId], c.accScore); } } else @@ -821,11 +859,12 @@ namespace kiwi for (auto& c : nCache) { if (c.morpheme->combineSocket) continue; - if (c.accScore > maxScores[c.rootId * topN]) + const auto rootId = c.rootId == commonRootId ? 0 : c.rootId + 1; + if (c.accScore > maxScores[rootId * topN]) { - pop_heap(maxScores.begin() + c.rootId * topN, maxScores.begin() + (c.rootId + 1) * topN, greater{}); - maxScores[c.rootId * topN + topN - 1] = c.accScore; - push_heap(maxScores.begin() + c.rootId * topN, maxScores.begin() + (c.rootId + 1) * topN, greater{}); + pop_heap(maxScores.begin() + rootId * topN, maxScores.begin() + (rootId + 1) * topN, greater{}); + maxScores[rootId * topN + topN - 1] = c.accScore; + push_heap(maxScores.begin() + rootId * topN, maxScores.begin() + (rootId + 1) * topN, greater{}); } } } @@ -833,7 +872,8 @@ namespace kiwi size_t validCount = 0; for (size_t i = 0; i < nCache.size(); ++i) { - if (nCache[i].accScore + kw->cutOffThreshold < maxScores[nCache[i].rootId * topN]) continue; + const auto rootId = nCache[i].rootId == commonRootId ? 0 : nCache[i].rootId + 1; + if (nCache[i].accScore + kw->cutOffThreshold < maxScores[rootId * topN]) continue; if (validCount != i) nCache[validCount] = move(nCache[i]); validCount++; } @@ -842,7 +882,7 @@ namespace kiwi template - inline pair*> generateTokenList(const WordLL* result, + inline PathEvaluator::Path generateTokenList(const WordLL* result, const utils::ContainerSearcher>& csearcher, const KGraphNode* graph, const Vector& ownFormList, @@ -930,7 +970,7 @@ namespace kiwi } prev = cur; } - return make_pair(ret, steps.back()->parent); + return ret; } template @@ -959,24 +999,18 @@ namespace kiwi unknownNodeCands.emplace_back(kw->getDefaultMorpheme(POSTag::nnp)); unknownNodeLCands.emplace_back(kw->getDefaultMorpheme(POSTag::nnp)); - // start node + auto uniqStates = prevSpStates; + sort(uniqStates.begin(), uniqStates.end()); + uniqStates.erase(unique(uniqStates.begin(), uniqStates.end()), uniqStates.end()); if (prevSpStates.empty()) { - cache[0].emplace_back(&kw->morphemes[0], 0.f, 0.f, nullptr, LmState{ kw->langMdl }, SpecialState{}); - } - else - { - auto uniqStates = prevSpStates; - sort(uniqStates.begin(), uniqStates.end()); - uniqStates.erase(unique(uniqStates.begin(), uniqStates.end()), uniqStates.end()); - for (auto& spState : uniqStates) - { - uint8_t rootId = cache[0].size(); - cache[0].emplace_back(&kw->morphemes[0], 0.f, 0.f, nullptr, LmState{ kw->langMdl }, spState); - cache[0].back().rootId = rootId; - } + uniqStates.emplace_back(); } + // start node + cache[0].emplace_back(&kw->morphemes[0], 0.f, 0.f, nullptr, LmState{ kw->langMdl }, SpecialState{}); + cache[0].back().rootId = commonRootId; + #ifdef DEBUG_PRINT cerr << "Token[" << 0 << "]" << endl; for (auto& tt : cache[0]) @@ -1000,7 +1034,7 @@ namespace kiwi if (node->form) { - evalPath(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, node->form->candidate, false, splitComplex, blocklist); + evalPath(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, node->form->candidate, false, uniqStates, splitComplex, blocklist); if (all_of(node->form->candidate.begin(), node->form->candidate.end(), [](const Morpheme* m) { return m->combineSocket || (!m->chunks.empty() && !m->complex); @@ -1008,12 +1042,12 @@ namespace kiwi { ownFormList.emplace_back(node->form->form); ownFormId = ownFormList.size(); - evalPath(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, unknownNodeLCands, true, splitComplex, blocklist); + evalPath(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, unknownNodeLCands, true, uniqStates, splitComplex, blocklist); }; } else { - evalPath(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, unknownNodeCands, true, splitComplex, blocklist); + evalPath(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, unknownNodeCands, true, uniqStates, splitComplex, blocklist); } #ifdef DEBUG_PRINT @@ -1028,6 +1062,7 @@ namespace kiwi } // end node + auto& cand = cache.back(); for (auto prev = endNode->getPrev(); prev; prev = prev->getSibling()) { for (auto& p : cache[prev - startNode]) @@ -1044,11 +1079,21 @@ namespace kiwi float c = p.accScore + (openEnd ? 0 : p.lmState.next(kw->langMdl, eosId)); if (p.spState.singleQuote) c -= 2; if (p.spState.doubleQuote) c -= 2; - cache.back().emplace_back(nullptr, c, p.accTypoCost, &p, p.lmState, p.spState); + if (p.rootId == commonRootId) + { + for (size_t i = 0; i < uniqStates.size(); ++i) + { + cand.emplace_back(nullptr, c, p.accTypoCost, &p, p.lmState, uniqStates[i]); + cand.back().rootId = i; + } + } + else + { + cand.emplace_back(nullptr, c, p.accTypoCost, &p, p.lmState, p.spState); + } } } - auto& cand = cache.back(); sort(cand.begin(), cand.end(), [](const WordLL& a, const WordLL& b) { @@ -1101,7 +1146,7 @@ namespace kiwi &cand[i], csearcher, graph, ownFormList, kw->typoCostWeight, kw->morphemes.data(), langVocabSize ); - ret.emplace_back(move(tokens.first), cand[i].accScore, tokens.second->spState, cand[i].spState); + ret.emplace_back(move(tokens), cand[i].accScore, uniqStates[cand[i].rootId], cand[i].spState); } } sort(ret.begin(), ret.end(), [](const ChunkResult& a, const ChunkResult& b) diff --git a/src/Utils.cpp b/src/Utils.cpp index 606f5155..e0fa120e 100644 --- a/src/Utils.cpp +++ b/src/Utils.cpp @@ -71,7 +71,7 @@ namespace kiwi if (isSpace(chr)) return POSTag::unknown; if (0x2000 <= chr && chr <= 0x200F) return POSTag::unknown; - if (iswdigit(chr)) return POSTag::sn; + if ('0' <= chr && chr <= '9') return POSTag::sn; if (('A' <= chr && chr <= 'Z') || ('a' <= chr && chr <= 'z')) return POSTag::sl; if (0xAC00 <= chr && chr < 0xD7A4) return POSTag::max; From 61bf5de05e2194e6a3755158f5b086e13c2fced5 Mon Sep 17 00:00:00 2001 From: bab2min Date: Sun, 22 Sep 2024 21:44:17 +0900 Subject: [PATCH 05/12] Optimize `splitByTrie` --- src/KTrie.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/KTrie.cpp b/src/KTrie.cpp index 83593fc8..023b536a 100644 --- a/src/KTrie.cpp +++ b/src/KTrie.cpp @@ -972,16 +972,6 @@ size_t kiwi::splitByTrie( if (curNode->fail()) { curNode = curNode->fail(); - for (auto submatcher = curNode; submatcher; submatcher = submatcher->fail()) - { - const Form* cand = submatcher->val(trie); - if (!cand) break; - else if (!trie.hasSubmatch(cand)) - { - zCodaFollowable = zCodaFollowable || getZCodaAppendable(cand, formBase); - if (!insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces)) break; - } - } nextNode = curNode->template nextOpt(trie, c); } else From d931790959b63f1bbea4ac5223fc91e088ec3cbf Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 23 Sep 2024 23:44:44 +0900 Subject: [PATCH 06/12] Fix wrong end position of lengthened typo --- src/PathEvaluator.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/PathEvaluator.hpp b/src/PathEvaluator.hpp index ae98faf6..180f8d57 100644 --- a/src/PathEvaluator.hpp +++ b/src/PathEvaluator.hpp @@ -950,6 +950,7 @@ namespace kiwi &gNode - graph ); } + ret.back().end = gNode.endPos; } else { @@ -967,6 +968,7 @@ namespace kiwi &gNode - graph ); } + ret.back().end = gNode.endPos; } prev = cur; } From cf453d903ff6c53e0ec41e44b3236172d932be90 Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 23 Sep 2024 23:45:32 +0900 Subject: [PATCH 07/12] Fixed lengthening typo cost scales --- src/KTrie.cpp | 2 +- src/TypoTransformer.cpp | 2 +- test/test_typo.cpp | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/KTrie.cpp b/src/KTrie.cpp index 023b536a..8cf42d02 100644 --- a/src/KTrie.cpp +++ b/src/KTrie.cpp @@ -1151,7 +1151,7 @@ size_t kiwi::splitByTrie( const Form* cand = node.second->val(trie); if (cand && !trie.hasSubmatch(cand)) { - insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces, 0, 0, lengtheningTypoCost * node.first, node.first); + insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces, 0, 0, lengtheningTypoCost * (3 + node.first), node.first); } } } diff --git a/src/TypoTransformer.cpp b/src/TypoTransformer.cpp index 96da2813..a6386c66 100644 --- a/src/TypoTransformer.cpp +++ b/src/TypoTransformer.cpp @@ -660,7 +660,7 @@ namespace kiwi static const TypoTransformer basicTypoSetWithContinual = basicTypoSet | continualTypoSet; - static const TypoTransformer lengtheningTypoSet = TypoTransformer::fromLengtheningTypoCost(0.5f); + static const TypoTransformer lengtheningTypoSet = TypoTransformer::fromLengtheningTypoCost(0.25f); switch (set) { diff --git a/test/test_typo.cpp b/test/test_typo.cpp index 2b6a004d..38491bce 100644 --- a/test/test_typo.cpp +++ b/test/test_typo.cpp @@ -209,17 +209,17 @@ TEST(KiwiTypo, LengtheningTypoSet) { KiwiBuilder builder{ MODEL_PATH, 0, BuildOption::default_, }; Kiwi typoKiwi = builder.build(DefaultTypoSet::lengtheningTypoSet); - const float typoCost = typoKiwi.getTypoCostWeight() * 0.5f; + const float typoCost = typoKiwi.getTypoCostWeight() * 0.25f; auto ref = typoKiwi.analyze(u"진짜?", Match::allWithNormalizing); auto res = typoKiwi.analyze(u"지인짜?", Match::allWithNormalizing); - EXPECT_FLOAT_EQ(ref.second - typoCost, res.second); + EXPECT_FLOAT_EQ(ref.second - 4 * typoCost, res.second); EXPECT_EQ(res.first.size(), 2); EXPECT_EQ(res.first[0].str, u"진짜"); EXPECT_EQ(res.first[1].str, u"?"); res = typoKiwi.analyze(u"지인짜아?", Match::allWithNormalizing); - EXPECT_FLOAT_EQ(ref.second - 2 * typoCost, res.second); + EXPECT_FLOAT_EQ(ref.second - 5 * typoCost, res.second); EXPECT_EQ(res.first.size(), 2); EXPECT_EQ(res.first[0].str, u"진짜"); EXPECT_EQ(res.first[1].str, u"?"); From 584a4430cfecd102a1afd5084e790e9d29b229b0 Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 23 Sep 2024 23:46:53 +0900 Subject: [PATCH 08/12] Fix parallelization bug at `mp::runParallel` --- src/sais/fm_index.hpp | 85 +++++++++++++++++++++++++++++++++++++++++++ src/sais/mp_utils.hpp | 2 +- 2 files changed, 86 insertions(+), 1 deletion(-) diff --git a/src/sais/fm_index.hpp b/src/sais/fm_index.hpp index 5c21aca9..dada2d14 100644 --- a/src/sais/fm_index.hpp +++ b/src/sais/fm_index.hpp @@ -85,6 +85,78 @@ namespace sais return std::make_pair(b, e); } + template + std::pair findRange(It first, It last) const + { + if (first == last) return std::make_pair(0, 0); + + std::pair range = initRange(*first); + if (range.first == 0 && range.second == 0) + { + return range; + } + ++first; + for (; first != last; ++first) + { + range = nextRange(range, *first); + if (range.first == 0 && range.second == 0) + { + return range; + } + } + return range; + } + + template + bool findTrace(std::vector& out, It first, It last) const + { + out.clear(); + if (first == last) return false; + std::pair range = initRange(*first); + ++first; + for (; first != last; ++first) + { + auto nextChr = *first; + auto it = std::lower_bound(cKeys.get(), cKeys.get() + vocabSize, nextChr); + if (it == cKeys.get() + vocabSize || *it != nextChr) return false; + const size_t b = cValues[it - cKeys.get()]; + const size_t ob = waveletTree.rank(nextChr, range.first); + const size_t oe = waveletTree.rank(nextChr, range.second); + if (ob > oe) return false; + + const size_t rSize = range.second - range.first; + const size_t numHistories = out.size() / rSize; + size_t validIdx = 0; + for (size_t h = 0; h < numHistories; ++h) + { + for (size_t i = range.first; i < range.second; ++i) + { + if (bwtData[i] == nextChr) + { + out[validIdx] = out[h * rSize + i - range.first]; + validIdx++; + } + } + } + out.resize(validIdx); + + for (size_t i = range.first; i < range.second; ++i) + { + if (bwtData[i] == nextChr) + { + out.emplace_back(i); + } + } + + range = std::make_pair(b + ob, b + oe); + } + for (size_t i = range.first; i < range.second; ++i) + { + out.emplace_back(i); + } + return true; + } + using SuffixTy = std::basic_string; using TraceTy = std::vector>; @@ -113,6 +185,19 @@ namespace sais return ret; } + template + size_t enumSufficesOfString(size_t minCnt, It first, It last, Fn&& fn) const + { + std::pair range = findRange(first, last); + if (range.first == 0 && range.second == 0) + { + return 0; + } + SuffixTy suffix; + TraceTy trace; + return enumSuffices(minCnt, suffix, trace, range.first, range.second, std::move(fn)); + } + template size_t enumSuffices(size_t minCnt, Fn&& fn, mp::ThreadPool* tp = nullptr) const { diff --git a/src/sais/mp_utils.hpp b/src/sais/mp_utils.hpp index 374f6054..ccec344a 100644 --- a/src/sais/mp_utils.hpp +++ b/src/sais/mp_utils.hpp @@ -119,7 +119,7 @@ namespace mp std::bind(std::forward(f), std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, std::forward(args)...)); if (stop) throw std::runtime_error("enqueue on stopped ThreadPool"); ret.emplace_back(task->get_future()); - tasks[i].emplace([&, task, b](size_t id) { (*task)(id, std::min(workers, tasks.size()), b.get()); }); + tasks[i].emplace([&, task, workers, b](size_t id) { (*task)(id, std::min(workers, tasks.size()), b.get()); }); } } condition.notify_all(); From 1236483614839a113a0c37b208e20d587e58ae50 Mon Sep 17 00:00:00 2001 From: bab2min Date: Sat, 28 Sep 2024 19:43:25 +0900 Subject: [PATCH 09/12] Optimize `PathEvaluator::evalSingleMorpheme` --- src/PathEvaluator.hpp | 413 +++++++++++++++++++++++++----------------- 1 file changed, 246 insertions(+), 167 deletions(-) diff --git a/src/PathEvaluator.hpp b/src/PathEvaluator.hpp index 180f8d57..79c01fc0 100644 --- a/src/PathEvaluator.hpp +++ b/src/PathEvaluator.hpp @@ -48,6 +48,13 @@ namespace kiwi using Wid = uint32_t; + enum class PathEvaluatingMode + { + topN, + top1, + top1Small, + }; + class PathEvaluator { public: @@ -130,16 +137,12 @@ namespace kiwi const std::unordered_set* blocklist = nullptr ); - template + template static void evalSingleMorpheme( Vector>& resultOut, const Kiwi* kw, const Vector& ownForms, const Vector>>& cache, - array seq, - array oseq, - size_t chSize, - uint8_t combSocket, size_t ownFormId, const Morpheme* curMorph, const KGraphNode* node, @@ -457,16 +460,181 @@ namespace kiwi || m == Kiwi::SpecialMorph::doubleQuoteOpen || m == Kiwi::SpecialMorph::doubleQuoteClose; } - template + template + class BestPathConatiner; + + template + class BestPathConatiner + { + // pair: [index, size] + UnorderedMap, pair> bestPathIndex; + Vector> bestPathValues; + public: + inline void clear() + { + bestPathIndex.clear(); + bestPathValues.clear(); + } + + inline void insert(const PathHash& ph, size_t topN, uint8_t rootId, + const Morpheme* morph, float accScore, float accTypoCost, const WordLL* parent, LmState&& lmState, SpecialState spState) + { + auto inserted = bestPathIndex.emplace(ph, make_pair((uint32_t)bestPathValues.size(), 1)); + if (inserted.second) + { + bestPathValues.emplace_back(morph, accScore, accTypoCost, parent, move(lmState), spState); + if (rootId != commonRootId) bestPathValues.back().rootId = rootId; + bestPathValues.resize(bestPathValues.size() + topN - 1); + } + else + { + auto bestPathFirst = bestPathValues.begin() + inserted.first->second.first; + auto bestPathLast = bestPathValues.begin() + inserted.first->second.first + inserted.first->second.second; + if (distance(bestPathFirst, bestPathLast) < topN) + { + *bestPathLast = WordLL{ morph, accScore, accTypoCost, parent, move(lmState), spState }; + if (rootId != commonRootId) bestPathLast->rootId = rootId; + push_heap(bestPathFirst, bestPathLast + 1, WordLLGreater{}); + ++inserted.first->second.second; + } + else + { + if (accScore > bestPathFirst->accScore) + { + pop_heap(bestPathFirst, bestPathLast, WordLLGreater{}); + *(bestPathLast - 1) = WordLL{ morph, accScore, accTypoCost, parent, move(lmState), spState }; + if (rootId != commonRootId) (*(bestPathLast - 1)).rootId = rootId; + push_heap(bestPathFirst, bestPathLast, WordLLGreater{}); + } + } + } + } + + inline void writeTo(Vector>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId) + { + for (auto& p : bestPathIndex) + { + const auto index = p.second.first; + const auto size = p.second.second; + for (size_t i = 0; i < size; ++i) + { + resultOut.emplace_back(move(bestPathValues[index + i])); + auto& newPath = resultOut.back(); + + // fill the rest information of resultOut + newPath.wid = lastSeqId; + if (curMorph->chunks.empty() || curMorph->complex) + { + newPath.combineSocket = curMorph->combineSocket; + newPath.ownFormId = ownFormId; + } + } + } + } + }; + + template + class BestPathConatiner + { + UnorderedMap, WordLL> bestPathes; + public: + inline void clear() + { + bestPathes.clear(); + } + + inline void insert(const PathHash& ph, size_t topN, uint8_t rootId, + const Morpheme* morph, float accScore, float accTypoCost, const WordLL* parent, LmState&& lmState, SpecialState spState) + { + WordLL newPath{ morph, accScore, accTypoCost, parent, move(lmState), spState }; + if (rootId != commonRootId) newPath.rootId = rootId; + auto inserted = bestPathes.emplace(ph, newPath); + if (!inserted.second) + { + auto& target = inserted.first->second; + if (accScore > target.accScore) + { + target = newPath; + } + } + } + + inline void writeTo(Vector>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId) + { + for (auto& p : bestPathes) + { + resultOut.emplace_back(move(p.second)); + auto& newPath = resultOut.back(); + + // fill the rest information of resultOut + newPath.wid = lastSeqId; + if (curMorph->chunks.empty() || curMorph->complex) + { + newPath.combineSocket = curMorph->combineSocket; + newPath.ownFormId = ownFormId; + } + } + } + }; + + template + class BestPathConatiner + { + Vector> bestPathIndicesSmall; + Vector> bestPathValuesSmall; + public: + + inline void clear() + { + bestPathIndicesSmall.clear(); + bestPathValuesSmall.clear(); + } + + inline void insert(const PathHash& ph, size_t topN, uint8_t rootId, + const Morpheme* morph, float accScore, float accTypoCost, const WordLL* parent, LmState&& lmState, SpecialState spState) + { + const auto it = find(bestPathIndicesSmall.begin(), bestPathIndicesSmall.end(), ph); + if (it == bestPathIndicesSmall.end()) + { + bestPathIndicesSmall.push_back(ph); + bestPathValuesSmall.emplace_back(morph, accScore, accTypoCost, parent, move(lmState), spState); + if (rootId != commonRootId) bestPathValuesSmall.back().rootId = rootId; + } + else + { + auto& target = bestPathValuesSmall[it - bestPathIndicesSmall.begin()]; + if (accScore > target.accScore) + { + target = WordLL{ morph, accScore, accTypoCost, parent, move(lmState), spState }; + if (rootId != commonRootId) target.rootId = rootId; + } + } + } + + inline void writeTo(Vector>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId) + { + for (auto& p : bestPathValuesSmall) + { + resultOut.emplace_back(move(p)); + auto& newPath = resultOut.back(); + + // fill the rest information of resultOut + newPath.wid = lastSeqId; + if (curMorph->chunks.empty() || curMorph->complex) + { + newPath.combineSocket = curMorph->combineSocket; + newPath.ownFormId = ownFormId; + } + } + } + }; + + template void PathEvaluator::evalSingleMorpheme( Vector>& resultOut, const Kiwi* kw, const Vector& ownForms, const Vector>>& cache, - array seq, - array oseq, - size_t chSize, - uint8_t combSocket, size_t ownFormId, const Morpheme* curMorph, const KGraphNode* node, @@ -477,37 +645,48 @@ namespace kiwi const Vector& prevSpStates ) { - thread_local UnorderedMap, WordLL> bestPathes; - // pair: [index, size] - thread_local UnorderedMap, pair> bestPathIndex; - thread_local Vector> bestPathValues; + thread_local BestPathConatiner bestPathCont; thread_local Vector rootIds; - if (top1) + + const LangModel& langMdl = kw->langMdl; + const Morpheme* morphBase = kw->morphemes.data(); + const auto spacePenalty = kw->spacePenalty; + const bool allowedSpaceBetweenChunk = kw->spaceTolerance > 0; + + const size_t langVocabSize = langMdl.knlm->getHeader().vocab_size; + + const Morpheme* lastMorph; + Wid firstWid; + if (curMorph->chunks.empty() || curMorph->complex) { - bestPathes.clear(); + lastMorph = curMorph->getCombined() ? curMorph->getCombined() : curMorph; + firstWid = curMorph->lmMorphemeId; } + // if the morpheme has chunk set else { - bestPathIndex.clear(); - bestPathValues.clear(); + lastMorph = curMorph->chunks[curMorph->chunks.size() - 1]; + firstWid = curMorph->chunks[0]->lmMorphemeId; } - const LangModel& langMdl = kw->langMdl; - const Morpheme* morphBase = kw->morphemes.data(); - const auto spacePenalty = kw->spacePenalty; - const bool allowedSpaceBetweenChunk = kw->spaceTolerance > 0; + Wid lastSeqId; + if (within(lastMorph, kw->morphemes.data() + langVocabSize, kw->morphemes.data() + kw->morphemes.size())) + { + lastSeqId = lastMorph - kw->morphemes.data(); + } + else + { + lastSeqId = lastMorph->lmMorphemeId; + } - float additionalScore = curMorph->userScore + nodeLevelDiscount; - additionalScore += kw->tagScorer.evalLeftBoundary(hasLeftBoundary(node), curMorph->tag); - RuleBasedScorer ruleBasedScorer{ kw, curMorph, node }; + bestPathCont.clear(); + const float additionalScore = curMorph->userScore + nodeLevelDiscount + kw->tagScorer.evalLeftBoundary(hasLeftBoundary(node), curMorph->tag); - float discountForCombining = curMorph->combineSocket ? -15 : 0; + RuleBasedScorer ruleBasedScorer{ kw, curMorph, node }; - const size_t vocabSize = langMdl.knlm->getHeader().vocab_size; for (auto* prev = node->getPrev(); prev; prev = prev->getSibling()) { - assert(prev != node); for (auto& prevPath : cache[prev - startNode]) { float candScore = prevPath.accScore + additionalScore; @@ -523,7 +702,7 @@ namespace kiwi if (allowedSpaceBetweenChunk) candScore -= spacePenalty; else continue; } - seq[0] = morphBase[prevPath.wid].getCombined()->lmMorphemeId; + firstWid = morphBase[prevPath.wid].getCombined()->lmMorphemeId; } const kchar_t* leftFormFirst, * leftFormLast; @@ -560,23 +739,32 @@ namespace kiwi } auto cLmState = prevPath.lmState; - Wid lSeq = 0; if (curMorph->combineSocket && (curMorph->chunks.empty() || curMorph->complex)) { - lSeq = prevPath.wid; + // no-op } else { - lSeq = seq[chSize - 1]; - for (size_t i = 0; i < chSize; ++i) + if (morphBase[firstWid].tag == POSTag::p) { - if (morphBase[seq[i]].tag == POSTag::p) + // prohibit without + goto continueFor; + } + float ll = cLmState.next(langMdl, firstWid); + candScore += ll; + if (!(curMorph->chunks.empty() || curMorph->complex)) + { + for (size_t i = 1; i < curMorph->chunks.size(); ++i) { - // prohibit without - goto continueFor; + const auto wid = curMorph->chunks[i]->lmMorphemeId; + if (morphBase[wid].tag == POSTag::p) + { + // prohibit without + goto continueFor; + } + ll = cLmState.next(langMdl, wid); + candScore += ll; } - float ll = cLmState.next(langMdl, seq[i]); - candScore += ll; } } @@ -612,103 +800,14 @@ namespace kiwi } PathHash ph{ cLmState, prevPath.rootId, spState }; - if (top1) - { - WordLL newPath{ curMorph, candScoreWithRule, prevPath.accTypoCost + node->typoCost, &prevPath, move(cLmState), spState }; - if (rootId != commonRootId) newPath.rootId = rootId; - auto inserted = bestPathes.emplace(ph, newPath); - if (!inserted.second) - { - auto& target = inserted.first->second; - if (candScoreWithRule > target.accScore) - { - target = newPath; - } - } - } - else - { - auto inserted = bestPathIndex.emplace(ph, make_pair((uint32_t)bestPathValues.size(), 1)); - if (inserted.second) - { - bestPathValues.emplace_back(curMorph, candScoreWithRule, prevPath.accTypoCost + node->typoCost, &prevPath, move(cLmState), spState); - if (rootId != commonRootId) bestPathValues.back().rootId = rootId; - bestPathValues.resize(bestPathValues.size() + topN - 1); - } - else - { - auto bestPathFirst = bestPathValues.begin() + inserted.first->second.first; - auto bestPathLast = bestPathValues.begin() + inserted.first->second.first + inserted.first->second.second; - if (distance(bestPathFirst, bestPathLast) < topN) - { - *bestPathLast = WordLL{ curMorph, candScoreWithRule, prevPath.accTypoCost + node->typoCost, &prevPath, move(cLmState), spState }; - if (rootId != commonRootId) bestPathLast->rootId = rootId; - push_heap(bestPathFirst, bestPathLast + 1, WordLLGreater{}); - ++inserted.first->second.second; - } - else - { - if (candScoreWithRule > bestPathFirst->accScore) - { - pop_heap(bestPathFirst, bestPathLast, WordLLGreater{}); - *(bestPathLast - 1) = WordLL{ curMorph, candScoreWithRule, prevPath.accTypoCost + node->typoCost, &prevPath, move(cLmState), spState }; - if (rootId != commonRootId) (*(bestPathLast - 1)).rootId = rootId; - push_heap(bestPathFirst, bestPathLast, WordLLGreater{}); - } - } - } - } + bestPathCont.insert(ph, topN, rootId, curMorph, candScoreWithRule, prevPath.accTypoCost + node->typoCost, &prevPath, move(cLmState), spState); } continueFor:; } } - if (top1) - { - for (auto& p : bestPathes) - { - resultOut.emplace_back(move(p.second)); - auto& newPath = resultOut.back(); - - // fill the rest information of resultOut - if (curMorph->chunks.empty() || curMorph->complex) - { - newPath.wid = oseq[0]; - newPath.combineSocket = combSocket; - newPath.ownFormId = ownFormId; - } - else - { - newPath.wid = oseq[chSize - 1]; - } - } - } - else - { - for (auto& p : bestPathIndex) - { - const auto index = p.second.first; - const auto size = p.second.second; - for (size_t i = 0; i < size; ++i) - { - resultOut.emplace_back(move(bestPathValues[index + i])); - auto& newPath = resultOut.back(); - - // fill the rest information of resultOut - if (curMorph->chunks.empty() || curMorph->complex) - { - newPath.wid = oseq[0]; - newPath.combineSocket = combSocket; - newPath.ownFormId = ownFormId; - } - else - { - newPath.wid = oseq[chSize - 1]; - } - } - } - } + bestPathCont.writeTo(resultOut, curMorph, lastSeqId, ownFormId); return; } @@ -747,6 +846,13 @@ namespace kiwi const float nodeLevelDiscount = whitespaceDiscount + typoDiscount + unknownFormDiscount; + size_t totalPrevPathes = 0; + for (auto* prev = node->getPrev(); prev; prev = prev->getSibling()) + { + totalPrevPathes += cache[prev - startNode].size(); + } + const bool useContainerForSmall = totalPrevPathes <= 48; + for (bool ignoreCond : {false, true}) { for (auto& curMorph : cands) @@ -775,16 +881,9 @@ namespace kiwi continue; } - array seq = { 0, }; - array oseq = { 0, }; - uint8_t combSocket = 0; - CondVowel condV = curMorph->vowel; - CondPolarity condP = curMorph->polar; - size_t chSize = 1; // if the morpheme has chunk set - if (!curMorph->chunks.empty() && !curMorph->complex) + if (!(curMorph->chunks.empty()|| curMorph->complex)) { - chSize = curMorph->chunks.size(); // '하다/하게/하지'가 '다/게/지'로 축약된 경우인데 앞에 공백이 있는 경우는 탐색후보에서 제외 if (node->prev && node[-(int)node->prev].endPos < node->startPos && curMorph->kform @@ -796,44 +895,24 @@ namespace kiwi { continue; } - - for (size_t i = 0; i < chSize; ++i) - { - seq[i] = curMorph->chunks[i]->lmMorphemeId; - if (within(curMorph->chunks[i], kw->morphemes.data() + langVocabSize, kw->morphemes.data() + kw->morphemes.size())) - { - oseq[i] = curMorph->chunks[i] - kw->morphemes.data(); - } - else - { - oseq[i] = seq[i]; - } - } } - else + + if (topN > 1) { - seq[0] = curMorph->lmMorphemeId; - if (within(curMorph->getCombined() ? curMorph->getCombined() : curMorph, kw->morphemes.data() + langVocabSize, kw->morphemes.data() + kw->morphemes.size())) - { - oseq[0] = curMorph - kw->morphemes.data(); - } - else - { - oseq[0] = seq[0]; - } - combSocket = curMorph->combineSocket; + evalSingleMorpheme(nCache, kw, ownFormList, cache, + ownFormId, curMorph, + node, startNode, topN, ignoreCond ? -10 : 0, nodeLevelDiscount, prevSpStates); } - - if (topN == 1) + else if (useContainerForSmall) { - evalSingleMorpheme(nCache, kw, ownFormList, cache, - seq, oseq, chSize, combSocket, ownFormId, curMorph, + evalSingleMorpheme(nCache, kw, ownFormList, cache, + ownFormId, curMorph, node, startNode, topN, ignoreCond ? -10 : 0, nodeLevelDiscount, prevSpStates); } else { - evalSingleMorpheme(nCache, kw, ownFormList, cache, - seq, oseq, chSize, combSocket, ownFormId, curMorph, + evalSingleMorpheme(nCache, kw, ownFormList, cache, + ownFormId, curMorph, node, startNode, topN, ignoreCond ? -10 : 0, nodeLevelDiscount, prevSpStates); } From 83b205d0e55ddaf1e1abbf56730e8035f84bdf5a Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 30 Sep 2024 00:50:04 +0900 Subject: [PATCH 10/12] add more alias to morpheme definitions --- ModelGenerator/morphemes.txt | 198 ++++++++++++++++++----------------- 1 file changed, 101 insertions(+), 97 deletions(-) diff --git a/ModelGenerator/morphemes.txt b/ModelGenerator/morphemes.txt index 683afe35..ffd0b624 100644 --- a/ModelGenerator/morphemes.txt +++ b/ModelGenerator/morphemes.txt @@ -196,7 +196,7 @@ 가지 VV 15290 미국 NNP 15195 나 JC 15177 vowel 0.9992366941919367 ==이나 -그녀 NP 14983 +그녀 NP 14983 =그 싶 VX 14945 다가 EC 14849 이제 MAG 14782 @@ -216,7 +216,7 @@ 저 NP 13862 다음 NNG 13816 지나 VV 13552 -얘기 NNG 13539 +얘기 NNG 13539 =이야기 가장 MAG 13287 ᆫ데 EC 13236 vocalic 못 MAG 13184 @@ -1023,7 +1023,7 @@ 빨리 MAG 2866 원인 NNG 2853 사라지 VV 2853 -율 XSN 2851 +율 XSN 2851 =률 멀 VA 2843 건물 NNG 2832 민주당 NNP 2829 @@ -1222,7 +1222,7 @@ 벌어지 VV 2335 complex 벌/VV 어/EC 지/VX 011223 느냐 EC 2334 구나 EF 2329 -근까 MAJ 2328 +근까 MAJ 2328 =그러니까 위원 NNG 2326 려면 EC 2323 vocalic 아저씨 NNG 2322 @@ -2414,7 +2414,7 @@ 중학교 NNG 1020 거짓말 NNG 1019 취급 NNG 1018 -그까 MAJ 1015 +그까 MAJ 1015 =그러니까 계시 VX 1015 가방 NNG 1015 맨 MM 1014 @@ -2607,7 +2607,7 @@ 입시 NNG 914 치 XSN 913 닫 VV-R 913 -닫 VV-I 913 =달리 +닫 VV-I 913 =달리/VV 재밌 VA 912 가수 NNG 912 연구원 NNG 911 @@ -2843,7 +2843,7 @@ 수집 NNG 820 태양 NNG 820 대단 XR 820 -머물 VV 819 +머물 VV 819 =머무르 한꺼번에 MAG 819 맥주 NNG 819 그림자 NNG 819 @@ -2982,7 +2982,7 @@ 뚜렷 XR 773 한숨 NNG 773 여기저기 NNG 772 -고는 EC 772 +고는 EC 772 =곤 압박 NNG 772 변경 NNG 772 지하철 NNG 772 @@ -4393,7 +4393,7 @@ 피고인 NNG 465 자각 NNG 465 닥치 VV 465 -요즈음 NNG 465 +요즈음 NNG 465 =요즘 흘러나오 VV 465 complex 흐르/VV 어/EC 나오/VV 011224 오랜만 NNG 464 다듬 VV 464 @@ -4630,7 +4630,7 @@ 명시 NNG 431 운행 NNG 431 절약 NNG 431 -쪼끔 MAG 430 +쪼끔 MAG 430 =조금 감기 NNG 430 배상 NNG 430 변명 NNG 430 @@ -6325,7 +6325,7 @@ 명사 NNG 273 촌 NNG 273 분화 NNG 273 -임마 IC 273 +임마 IC 273 ==인마 우뚝 MAG 273 기단 NNG 273 참가자 NNG 273 @@ -6671,7 +6671,7 @@ 헌신 NNG 251 벌 NNB 251 노트 NNG 251 -시커멓 VA-I 251 +시커멓 VA-I 251 =새카맣 급식 NNG 251 만기 NNG 251 아카데미 NNG 251 @@ -6724,7 +6724,7 @@ 저승 NNG 249 에미 NNG 249 본격 XR 249 -쪼금 MAG 248 +쪼금 MAG 248 =조금 성의 NNG 248 총각 NNG 248 산문 NNG 248 @@ -7710,7 +7710,7 @@ 고국 NNG 202 반란 NNG 202 용이 XR 202 -똘스또이 NNP 202 +똘스또이 NNP 202 =톨스토이 고춧가루 NNG 201 치안 NNG 201 내정 NNG 201 @@ -9001,7 +9001,7 @@ 독립운동 NNG 159 불우 NNG 159 국왕 NNG 159 -허허 IC 159 +허허 IC 159 =하하 영세 NNG 159 땅콩 NNG 159 교민 NNG 159 @@ -9037,7 +9037,7 @@ 폐단 NNG 159 시책 NNG 159 니나 NNP 159 -맑스 NNP 159 +맑스 NNP 159 =마르크스 소 EC 159 행수 NNG 159 풋 XPN 158 @@ -9232,7 +9232,7 @@ 미신 NNG 153 더디 VA 153 메마르 VA 153 -서툴 VA 153 +서툴 VA 153 =서투르 강국 NNG 153 겨자 NNG 153 잠바 NNG 153 @@ -9560,7 +9560,7 @@ 저버리 VV 145 되돌아보 VV 145 complex 되돌/VV 어/EC 보/VX 022334 분당 NNP 145 -하하하 IC 145 +하하하 IC 145 =하하 빅 NNG 145 베개 NNG 145 우르르 MAG 145 @@ -11048,7 +11048,7 @@ LG전자 NNP 119 계 NNG 116 저녁때 NNG 116 그만 XR 116 -쭈그리 VV 116 +쭈그리 VV 116 =쪼그리 남동생 NNG 116 존슨 NNP 116 미셸 NNP 116 @@ -11114,7 +11114,7 @@ LG전자 NNP 119 지팡이 NNG 115 하드 NNG 115 자판 NNG 115 -깜깜 MAG 115 +깜깜 MAG 115 =캄캄 밥그릇 NNG 115 학교장 NNG 115 이양 NNG 115 @@ -11705,7 +11705,7 @@ LG전자 NNP 119 만수 NNP 107 파면 NNG 107 염 NNG 107 -환히 MAG 107 complex 환/XR 히/XSM 0112 +환히 MAG 107 complex 환/XR 히/XSM 0112 =훤히 찻집 NNG 107 밤나무 NNG 107 핥 VV 107 @@ -11721,7 +11721,7 @@ LG전자 NNP 119 교내 NNG 106 본받 VV-R 106 complex 본/NNG 받/XSV 0112 트렌드 NNG 106 -어휴 IC 106 +어휴 IC 106 =아휴 관공서 NNG 106 주종 NNG 106 냉면 NNG 106 @@ -11809,7 +11809,7 @@ LG전자 NNP 119 쌀밥 NNG 105 재심 NNG 105 장막 NNG 105 -구먼 EF 105 +구먼 EF 105 =구만 감명 NNG 105 축 MAG 105 중심주의 NNG 105 @@ -11988,7 +11988,7 @@ LG전자 NNP 119 소관 NNG 103 항거 NNG 103 작아지 VV 103 complex 작/VA 어/EC 지/VX 011223 -치루 VV 103 +치루 VV 103 =치르 번성 NNG 103 경제부 NNG 103 인류학 NNG 103 @@ -12431,7 +12431,7 @@ SK텔레콤 NNP 103 신변 NNG 98 쬐 VV 98 패널 NNG 98 -죄 VV 98 +죄 VV 98 =조이 정가 NNG 98 한용운 NNP 98 대리석 NNG 98 @@ -13894,7 +13894,7 @@ SK텔레콤 NNP 103 조직위 NNG 82 증액 NNG 82 가차 NNG 82 -한-미 NNP 82 +한-미 NNP 82 =한·미 강호 NNG 82 인기척 NNG 82 천민 NNG 82 @@ -14628,7 +14628,7 @@ LG화학 NNP 82 잡스 NNP 75 회초리 NNG 75 박쥐 NNG 75 -코퍼스 NNG 75 +코퍼스 NNG 75 =말뭉치 에스 NNG 75 회계사 NNG 75 툭하면 MAG 75 @@ -16024,7 +16024,7 @@ LG화학 NNP 82 ᆯ려 EC 64 vocalic =려 예민하 VA 64 complex 예민/XR 하/XSA 0223 배이 VV 64 -그든 EF 64 +그든 EF 64 =거든 뚝딱 MAG 64 고대로 MAG 64 단백 NNG 64 @@ -16101,7 +16101,7 @@ LG화학 NNP 82 이상득 NNP 64 대치동 NNP 64 꿀벌 NNG 64 -쏘이 VV 64 +쏘이 VV 64 =쐬 공과 NNG 64 기관사 NNG 64 만만찮 VA 64 @@ -16173,7 +16173,7 @@ LG화학 NNP 82 주철이 NNP 64 박일채 NNP 64 쇠도끼 NNG 64 -왜냐면은 MAJ 64 +왜냐면은 MAJ 64 =왜냐하면 겉모습 NNG 63 홈즈 NNP 63 유사시 NNG 63 @@ -16822,7 +16822,7 @@ LG화학 NNP 82 금세기 NNG 60 수초 NNG 60 코카콜라 NNG 60 -늘씬 XR 60 +늘씬 XR 60 =날씬 마르크스주의 NNG 60 노련 XR 60 으스대 VV 60 @@ -17735,7 +17735,7 @@ LG화학 NNP 82 흠칫 MAG 55 더우기 MAG 55 정 XPN 55 -힐끔 MAG 55 +힐끔 MAG 55 =힐끗 보아하니 MAG 55 어린것 NNG 55 카이주아 NNP 55 @@ -17749,9 +17749,9 @@ LG화학 NNP 82 세현 NNP 55 하트 NNG 54 기우 NNG 54 -쪼그맣 VA-I 54 +쪼그맣 VA-I 54 =조그맣 하이라이트 NNG 54 -그러믄 MAJ 54 +그러믄 MAJ 54 =그러면 달달 MAG 54 깊어지 VV 54 complex 깊/VA 어/EC 지/VX 011223 승부수 NNG 54 @@ -17862,7 +17862,7 @@ LG화학 NNP 82 송 NNG 54 불룩 MAG 54 외계 NNG 54 -꾀 VV 54 +꾀 VV 54 =꼬이 행선지 NNG 54 고리 NNP 54 아디다스 NNP 54 @@ -18603,7 +18603,7 @@ LG화학 NNP 82 유쾌 NNG 50 덜덜 MAG 50 비리 VA 50 -쪼금 NNG 50 +쪼금 NNG 50 =조금 유나이티드 NNP 50 직공 NNG 50 불발 NNG 50 @@ -19166,7 +19166,7 @@ AP통신 NNP 50 얼개 NNG 48 밑줄 NNG 48 포도당 NNG 48 -한-일 NNP 48 +한-일 NNP 48 =한·일 월터 NNP 48 볼티모어 NNP 48 방중 NNG 48 @@ -19536,7 +19536,7 @@ AP통신 NNP 50 어려워하 VV 46 complex 어렵/VA-I 어/EC 하/VX 032334 패밀리 NNG 46 이북 NNG 46 -쪼끄맣 VA-I 46 +쪼끄맣 VA-I 46 =조그맣 쭈꾸미 NNG 46 묵히 VV 46 처칠 NNP 46 @@ -19796,7 +19796,7 @@ AP통신 NNP 50 칸딘스키 NNP 46 박상필 NNP 46 타우너 NNP 46 -하아 IC 45 +하아 IC 45 =허어 껑충 MAG 45 소쿠리 NNG 45 여 NP 45 @@ -20081,7 +20081,7 @@ AP통신 NNP 50 흘금 MAG 45 쇼 EF 45 vocalic 으쇼 EF 45 non_vowel ==쇼 -맑스주의 NNG 45 +맑스주의 NNG 45 =마르크스주의 경제주의 NNG 45 서하총 NNP 45 상호 NNP 45 @@ -20551,7 +20551,7 @@ AP통신 NNP 50 기자단 NNG 43 백두 NNP 43 골리앗 NNP 43 -쪼이 VV 43 +쪼이 VV 43 =쬐 고관 NNG 43 한군데 NNG 43 연극제 NNG 43 @@ -20560,7 +20560,7 @@ AP통신 NNP 50 기형아 NNG 43 법문 NNG 43 미담 NNG 43 -귀걸이 NNG 43 +귀걸이 NNG 43 =귀고리 거뜬히 MAG 43 complex 거뜬/XR 히/XSM 0223 노라니 EC 43 동호인 NNG 43 @@ -20619,7 +20619,7 @@ AP통신 NNP 50 대밭 NNG 43 맡아보 VV 43 complex 맡/VV 어/EC 보/VX 011223 는데도 EC 43 non_adj -발갛 VA-I 43 +발갛 VA-I 43 =벌겋 선량 XR 43 말뜻 NNG 43 고야 EC 43 @@ -21228,7 +21228,7 @@ AP통신 NNP 50 열한 NR 41 안사람 NNG 41 래 EC 41 vocalic -길다랗 VA-I 41 +길다랗 VA-I 41 =기다랗 던들 EC 41 덧술 NNG 41 영롱 XR 41 @@ -21632,7 +21632,7 @@ AP통신 NNP 50 원체 MAG 39 시급 NNG 39 설전 NNG 39 -라뇨 EF 39 +라뇨 EF 39 ==라니요 수필집 NNG 39 봉합 NNG 39 다음 NNP 39 @@ -22199,7 +22199,7 @@ GS칼텍스 NNP 39 시발 NNG 38 답하 VV 38 complex 답/NNG 하/XSV 0112 토플러 NNP 38 -금새 MAG 38 +금새 MAG 38 =금세 섶 NNG 38 저곳 NP 38 남극 NNP 38 @@ -22438,7 +22438,7 @@ GS칼텍스 NNP 39 늑장 NNG 37 화성행궁 NNP 37 격하 NNG 37 -한-중 NNP 37 +한-중 NNP 37 =한·중 실익 NNG 37 뒷좌석 NNG 37 흩날리 VV 37 @@ -22560,8 +22560,9 @@ GS칼텍스 NNP 39 달래 NNP 37 나나니 NNG 37 뻔하 VX 37 -대메 EF 37 +대메 EF 37 =다며 량 XSN 37 +양 XSN 37 =량 박연차 NNP 36 다자 NNG 36 따루 MAG 36 =따로 @@ -23049,7 +23050,7 @@ GS칼텍스 NNP 39 웃음거리 NNG 35 낙오 NNG 35 손재주 NNG 35 -스탠다드 NNG 35 +스탠다드 NNG 35 =스탠더드 당권 NNG 35 인류사 NNG 35 패드 NNG 35 @@ -23697,7 +23698,7 @@ GS칼텍스 NNP 39 MBC애드컴 NNP 34 보충식 NNG 34 필요없 VA 34 complex 필요/NNG 없/VA 0223 -근까는 MAJ 34 +근까는 MAJ 34 =그러니까는 라면은 EC 34 싸운드 NNG 34 익스플로어 NNP 34 @@ -24373,7 +24374,7 @@ SK그룹 NNP 33 발상지 NNG 32 마이어 NNP 32 옛적 NNG 32 -서둘 VV 32 +서둘 VV 32 =서두르 되받 VV-R 32 체르노빌 NNP 32 풍랑 NNG 32 @@ -24453,7 +24454,7 @@ SK그룹 NNP 33 하교 NNG 32 구구 XR 32 베트콩 NNP 32 -케인즈 NNP 32 +케인즈 NNP 32 =케인스 여인상 NNG 32 쪽마루 NNG 32 눅눅 XR 32 @@ -24701,6 +24702,7 @@ SK그룹 NNP 33 울트라 NNG 31 방위비 NNG 31 라스베이거스 NNP 31 +라스베가스 NNP 5 ==라스베이거스 환심 NNG 31 술판 NNG 31 수원지법 NNP 31 @@ -25509,7 +25511,7 @@ SK그룹 NNP 33 바디 NNG 29 임플란트 NNG 29 음음음 IC 29 -컨퍼런스 NNG 29 +컨퍼런스 NNG 29 =콘퍼런스 상치 NNG 29 몽매 NNG 29 개략 NNG 29 @@ -27485,14 +27487,14 @@ CJ그룹 NNP 29 소인 NP 26 도방 NNG 26 시다 EF 26 -여보시오 IC 26 +여보시오 IC 26 =여보세요 후후 MAG 26 깡다구 NNG 26 광수 NNP 26 이시호우 NNP 26 치마꼬리 NNG 26 혜정이 NNP 26 -가엽 VA-I 26 +가엽 VA-I 26 =가엾/VA 백하 NNP 26 로버릭 NNP 26 혁진이 NNP 26 @@ -27600,7 +27602,7 @@ CJ그룹 NNP 29 봉천동 NNP 25 긴팔 NNG 25 깜짝깜짝 MAG 25 -삼가하 VV 25 complex 삼가/MAG 하/XSV 0223 +삼가하 VV 25 complex 삼가/MAG 하/XSV 0223 =삼가 맛집 NNG 25 둘째 NNG 25 학원비 NNG 25 @@ -29318,7 +29320,7 @@ LG유플러스 NNP 24 윗물 NNG 23 흑룡강 NNP 23 이운영 NNP 23 -구먼 EC 23 +구먼 EC 23 =구만 미토콘드리아 NNG 23 전지전능 NNG 23 분화구 NNG 23 @@ -29386,7 +29388,7 @@ LG유플러스 NNP 24 촌사람 NNG 23 아랫방 NNG 23 팥고물 NNG 23 -깜깜 XR 23 +깜깜 XR 23 =캄캄 어둠침침 XR 23 옥양목 NNG 23 상거 NNG 23 @@ -29530,7 +29532,7 @@ LG유플러스 NNP 24 폭포수 NNG 22 ᆫ다라는 ETM 22 vocalic 티벳 NNP 22 =티베트 -하하하하 IC 22 +하하하하 IC 22 =하하 성차별 NNG 22 비제이 NNG 22 동일본 NNP 22 @@ -29638,7 +29640,7 @@ LG유플러스 NNP 24 덧나 VV 22 데이타베이스 NNG 22 당명 NNG 22 -되려 MAG 22 +되려 MAG 22 =되레 백골 NNG 22 창작물 NNG 22 목간 NNG 22 @@ -29814,7 +29816,7 @@ LG유플러스 NNP 24 여 NNP 22 우윤근 NNP 22 노조법 NNG 22 -뒷얘기 NNG 22 +뒷얘기 NNG 22 =뒷이야기 전승국 NNG 22 시벨리우스 NNP 22 엉덩방아 NNG 22 @@ -30219,7 +30221,7 @@ LG유플러스 NNP 24 면서두 EC 22 뭐뭐 NP 22 씨그마 NNG 22 -그든요 EF 22 complex 그든/EF 요/JX 0223 +그든요 EF 22 complex 그든/EF 요/JX 0223 =거든요 탐크루즈 NNP 22 언어병리학 NNG 22 이퀄라이저 NNG 22 @@ -30236,7 +30238,7 @@ LG유플러스 NNP 24 이십칠 NR 21 정사각형 NNG 21 함 MAG 21 -쪼끔 NNG 21 +쪼끔 NNG 21 =조금 불한당 NNG 21 다산초당 NNP 21 정호성 NNP 21 @@ -30662,7 +30664,8 @@ LG유플러스 NNP 24 장쩌민 NNP 21 까막눈 NNG 21 친북 NNG 21 -메탄 NNG 21 +메테인 NNG 21 +메탄 NNG 5 =메테인 자금법 NNG 21 해머 NNG 21 라덴 NNP 21 @@ -30935,7 +30938,7 @@ LG유플러스 NNP 24 마한 NNP 21 룸펜 NNG 21 승방 NNG 21 -다뇨 EF 21 +다뇨 EF 21 ==다니요 그러나저러나 MAG 21 용의주도 XR 21 장시 NNG 21 @@ -31018,7 +31021,7 @@ LG유플러스 NNP 24 구명완 NNP 21 태극전사 NNG 21 슴슴하 VA 20 -여쭙 VV-I 20 +여쭙 VV-I 20 =여쭈/VV 던지 JX 20 동경대학 NNP 20 이천칠 NR 20 @@ -31311,7 +31314,7 @@ LG유플러스 NNP 24 약혼녀 NNG 20 회항 NNG 20 톈진 NNP 20 -버마 NNP 20 +버마 NNP 20 =미얀마 수도자 NNG 20 미주 NNG 20 카운트다운 NNG 20 @@ -32017,7 +32020,7 @@ LG디스플레이 NNP 20 김하늘 NNP 19 알로에 NNG 19 병가 NNG 19 -리모콘 NNG 19 +리모콘 NNG 19 =리모컨 권사 NNG 19 기름값 NNG 19 정범모 NNP 19 @@ -33804,7 +33807,7 @@ SH공사 NNP 19 찜기 NNG 17 육이오 NNG 17 이천오 NR 17 -그르면 MAJ 17 +그르면 MAJ 17 =그러면 구십오 NR 17 지저분하 VA 17 complex 지저분/XR 하/XSA 0334 끄내 VV 17 @@ -38395,7 +38398,7 @@ D사 NNG 14 땅강아지 NNG 14 뒤러 NNP 14 사해 NNG 14 -쬐그맣 VA-I 14 +쬐그맣 VA-I 14 =조그맣 상두 NNG 14 줌 NNB 14 건장 NNG 14 @@ -38676,7 +38679,7 @@ X선 NNG 14 널널 XR 14 당 NNB 14 빳데리 NNG 14 -왜냐하면은 MAJ 14 +왜냐하면은 MAJ 14 =왜냐하면 명현 NNP 14 목표어 NNG 14 왕주 NNP 14 @@ -41019,7 +41022,7 @@ SC제일은행 NNP 12 감소증 NNG 12 총포 NNG 12 중죄 NNG 12 -웨이트리스 NNG 12 +웨이트리스 NNG 12 ==웨이터 민주파 NNG 12 영구불변 NNG 12 더빙 NNG 12 @@ -42043,8 +42046,8 @@ NG NNP 12 유스호스텔 NNP 12 비평자 NNG 12 대놓 VV 12 -그까는 MAJ 12 -ᆫ대메 EF 12 vocalic +그까는 MAJ 12 =그러니까는 +ᆫ대메 EF 12 vocalic =ᆫ다며 얼마 MAG 12 블 VX 12 집어늫 VV 12 @@ -42191,7 +42194,7 @@ NG NNP 12 화사하 VA 11 complex 화사/XR 하/XSA 0223 채 XSN 11 비등 NNG 11 -떨어트리 VV 11 complex 떨/VV 어/EC 트리/XSV 011224 +떨어트리 VV 11 complex 떨/VV 어/EC 트리/XSV 011224 =떨어뜨리 김종대 NNP 11 김관영 NNP 11 문세광 NNP 11 @@ -42313,7 +42316,7 @@ NG NNP 12 장의사 NNG 11 수료식 NNG 11 접붙이 VV 11 -더구먼 EF 11 +더구먼 EF 11 =더구만 완판 NNG 11 엑스 NNP 11 폴리티컬 NNG 11 @@ -43992,7 +43995,7 @@ IMF NNP 11 걸립 NNG 11 안악 NNP 11 차지 NNP 11 -오르가즘 NNG 11 +오르가즘 NNG 11 =오르가슴 모르그 NNP 11 아이벨 NNP 11 페넬 NNP 11 @@ -44351,7 +44354,7 @@ IMF NNP 11 한적하 VA 10 complex 한적/XR 하/XSA 0223 들썩들썩 MAG 10 톡 NNG 10 -쬐끔 MAG 10 +쬐끔 MAG 10 =조금 돼지띠 NNG 10 여직 MAG 10 떨이 NNG 10 @@ -44647,7 +44650,7 @@ IMF NNP 11 영업직 NNG 10 레디 NNG 10 이정우 NNP 10 -얘기꽃 NNG 10 +얘기꽃 NNG 10 =이야기꽃 비디오물 NNG 10 발전법 NNG 10 아라미드 NNG 10 @@ -45959,7 +45962,7 @@ JP모건 NNP 10 한국교육원 NNP 10 중구 NNG 10 프랙탈 NNG 10 -노르스름 XR 10 +노르스름 XR 10 =누르스름 땟국물 NNG 10 복숭아나무 NNG 10 강평 NNG 10 @@ -48268,7 +48271,7 @@ SK케미칼 NNP 9 액세스 NNG 9 리골레토 NNP 9 개량종 NNG 9 -이야기꽃 NNG 9 +이야기꽃 NNG 10 여운계 NNP 9 다슬이 NNP 9 성현아 NNP 9 @@ -48346,7 +48349,7 @@ SK케미칼 NNP 9 스크럼 NNG 9 한국일보사 NNP 9 삼인칭 NNG 9 -쬐끄맣 VA-I 9 +쬐끄맣 VA-I 9 =조그맣 시정신 NNG 9 몰아대 VV 9 complex 몰/VV 어/EC 대/VX 011223 비음 NNG 9 @@ -48765,7 +48768,7 @@ ABC NNP 9 상머리 NNG 9 하이칼라 NNG 9 야물 VA 9 -껌껌 XR 9 +껌껌 XR 9 =컴컴 노동판 NNG 9 조용조용히 MAG 9 complex 조용조용/MAG 히/XSM 0445 장작더미 NNG 9 @@ -49731,7 +49734,7 @@ ABC NNP 9 바이오산업 NNG 8 법고창신 NNG 8 스테이 NNG 8 -콩쿨 NNG 8 +콩쿨 NNG 8 =콩쿠르 수궁가 NNP 8 헌화가 NNP 8 맵핑 NNG 8 @@ -51568,7 +51571,8 @@ T맵 NNP 8 망극 XR 8 다발 NNB 8 옴쭉 MAG 8 -로구먼 EF 8 +로구만 EF 10 +로구먼 EF 8 =로구만 품관 NNG 8 기어들어가 VV 8 complex 기어들/VV 어/EC 가/VX 033445 곤룡포 NNG 8 @@ -51944,7 +51948,7 @@ YMCA NNP 8 배면 NNG 8 헤벌리 VV 8 complex 헤/MAG 벌리/VV 0113 씹 NNG 8 -딴딴 XR 8 +딴딴 XR 8 =탄탄 완행버스 NNG 8 남방셔츠 NNG 8 써레 NNG 8 @@ -52027,7 +52031,7 @@ YMCA NNP 8 치솔 NNG 8 동경전기 NNP 8 포기 NNB 8 -맑스주의자 NNG 8 +맑스주의자 NNG 8 =마르크스주의자 마이푸즈 NNP 8 고종 NNG 8 서동근 NNP 8 @@ -52839,7 +52843,7 @@ YMCA NNP 8 홍종호 NNP 7 죽전 NNP 7 루저 NNG 7 -그라믄 MAJ 7 +그라믄 MAJ 7 =그러면 ᆯ런지 EF 10 vocalic 나랏님 NNG 7 쏘크라테스 NNP 7 @@ -52952,7 +52956,7 @@ YMCA NNP 8 사성 NNG 7 미다 NNB 7 예방약 NNG 7 -퍼트리 VV 7 +퍼트리 VV 7 =퍼뜨리 손경식 NNP 7 알짜배기 NNG 7 쫄바지 NNG 7 @@ -54761,7 +54765,7 @@ OB맥주 NNP 7 홍채학 NNG 7 산도스사 NNP 7 아침잠 NNG 7 -에딘버러 NNP 7 +에딘버러 NNP 7 =에든버러 여과기 NNG 7 소화액 NNG 7 지루 NNG 7 @@ -57172,7 +57176,7 @@ address2 NNP 6 끝자락 NNG 6 젤라틴 NNG 6 매캐하 VA 6 complex 매캐/XR 하/XSA 0223 -쬐끔 NNG 6 +쬐끔 NNG 6 =조금 토코페롤 NNG 6 햐아 IC 6 고등어조림 NNG 6 @@ -59729,7 +59733,7 @@ ISL코리아 NNP 6 징글 NNG 6 갱스터 NNG 6 구석차기 NNG 6 -쭈그러들 VV 6 +쭈그러들 VV 6 =쪼그라들 불명자 NNG 6 부교 NNG 6 홍재형 NNP 6 @@ -65486,7 +65490,7 @@ SK바이오팜 NNP 5 김만수 NNP 5 로티 NNP 5 팬레터 NNG 5 -힐즈 NNP 5 +힐즈 NNP 5 =힐스 임순례 NNP 5 오키 NNP 5 정혼 NNG 5 @@ -66689,7 +66693,7 @@ KBS NNP 5 지달리 VV 5 독허 VA 5 =독하 수연 NNG 5 -쬐금 NNG 5 +쬐금 NNG 5 =조금 멧비둘기 NNG 5 ᆫ규 EF 5 vocalic 남섬부주 NNG 5 @@ -66827,7 +66831,7 @@ KBS NNP 5 떡살 NNG 5 카폰 NNG 5 래프트 NNP 5 -다는구만 EF 5 +다는구만 EF 10 플로트 NNG 5 헌칠 XR 5 레오폴드 NNP 5 @@ -66999,7 +67003,7 @@ KBS NNP 5 무심중 NNG 5 조명진 NNP 5 피래미 NNG 5 -다는구먼 EF 5 +다는구먼 EF 5 =다는구만 넘차 IC 5 성숙이 NNP 5 쫄랑쫄랑 MAG 5 @@ -68574,7 +68578,7 @@ CNN NNP 5 경솔히 MAG 5 complex 경솔/NNG 히/XSM 0223 극렬히 MAG 5 complex 극렬/NNG 히/XSM 0223 능숙히 MAG 5 complex 능숙/XR 히/XSM 0223 -뚜렷히 MAG 5 complex 뚜렷/XR 히/XSM 0223 +뚜렷히 MAG 5 complex 뚜렷/XR 히/XSM 0223 =뚜렷이 어정쩡히 MAG 5 complex 어정쩡/XR 히/XSM 0334 자상히 MAG 5 complex 자상/XR 히/XSM 0223 틈틈히 MAG 5 =틈틈이 From 1d1d6bc3fbda43e0f198f317f4ab3a3cbf6e6ea4 Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 30 Sep 2024 00:50:31 +0900 Subject: [PATCH 11/12] Update test cases --- test/test_cpp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_cpp.cpp b/test/test_cpp.cpp index b78e3f79..b47d4ba9 100644 --- a/test/test_cpp.cpp +++ b/test/test_cpp.cpp @@ -440,7 +440,7 @@ TEST(KiwiCpp, SentenceBoundaryErrors) u8"어떻게 보면 신제품에 대한 기대 이런 모멘텀들이 국내 증시의 적감의 수세를 촉발시킬 수도 있는 요인이 될 수도 있다라고 보시면 될 것 같습니다.", u8"관련 법령 이전에 만들어져 경사로 설치 의무 대상은 아닙니다.", u8"적법절차의 실질적인 내용을 침해하였는지 여부 등에 관하여 충분히 심리하지", - u8"2023. 5. 10 주식회사 키위(이하 '회사'라 한다) 대표이사 XXX는 저녁을 직원들에게 사주었다.", + u8"2023. 5. 10 주식회사 키위(이하 '회사'라 한다)의 대표이사 XXX는 저녁을 직원들에게 사주었다.", u8"실패할까봐", u8"집에 갈까 봐요", u8"너무 낮지 싶어요", From d8740fc0c4412b476d7f2abff46f47fadc2e68f8 Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 30 Sep 2024 00:51:24 +0900 Subject: [PATCH 12/12] Add `ltypo` arg to evaluator --- tools/evaluator_main.cpp | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/tools/evaluator_main.cpp b/tools/evaluator_main.cpp index 84a049f8..ea4ee1c3 100644 --- a/tools/evaluator_main.cpp +++ b/tools/evaluator_main.cpp @@ -13,12 +13,12 @@ using namespace kiwi; int doEvaluate(const string& modelPath, const string& output, const vector& input, bool normCoda, bool zCoda, bool multiDict, bool useSBG, - float typoCostWeight, bool bTypo, bool cTypo, + float typoCostWeight, bool bTypo, bool cTypo, bool lTypo, int repeat) { try { - if (typoCostWeight > 0 && !bTypo && !cTypo) + if (typoCostWeight > 0 && !bTypo && !cTypo && !lTypo) { bTypo = true; } @@ -26,14 +26,30 @@ int doEvaluate(const string& modelPath, const string& output, const vector 0) kw.setTypoCostWeight(typoCostWeight); @@ -118,6 +134,7 @@ int main(int argc, const char* argv[]) ValueArg typoWeight{ "", "typo", "typo weight", false, 0.f, "float"}; SwitchArg bTypo{ "", "btypo", "make basic-typo-tolerant model", false }; SwitchArg cTypo{ "", "ctypo", "make continual-typo-tolerant model", false }; + SwitchArg lTypo{ "", "ltypo", "make lengthening-typo-tolerant model", false }; ValueArg repeat{ "", "repeat", "repeat evaluation for benchmark", false, 1, "int" }; UnlabeledMultiArg files{ "files", "evaluation set files", true, "string" }; @@ -131,6 +148,7 @@ int main(int argc, const char* argv[]) cmd.add(typoWeight); cmd.add(bTypo); cmd.add(cTypo); + cmd.add(lTypo); cmd.add(repeat); try @@ -143,6 +161,6 @@ int main(int argc, const char* argv[]) return -1; } return doEvaluate(model, output, files.getValue(), - !noNormCoda, !noZCoda, !noMulti, useSBG, typoWeight, bTypo, cTypo, repeat); + !noNormCoda, !noZCoda, !noMulti, useSBG, typoWeight, bTypo, cTypo, lTypo, repeat); }