Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix SB errors & Optimize path search #187

Merged
merged 12 commits into from
Oct 3, 2024
198 changes: 101 additions & 97 deletions ModelGenerator/morphemes.txt

Large diffs are not rendered by default.

45 changes: 45 additions & 0 deletions include/kiwi/SubstringExtractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,49 @@ namespace kiwi
size_t cluster(size_t i) const;
float score(size_t i) const;
};

class Kiwi;

// Interface of the n-gram extractor. Only declarations are visible in this
// header; every member function is defined elsewhere. Input is supplied via
// addText()/addTexts() and results are returned by extract() as a list of
// Candidate records. Notes that go beyond what the declarations literally
// state are marked as assumptions to be confirmed against the implementation.
class NgramExtractor
{
// Kiwi instance associated with this extractor; null when default-initialized.
// NOTE(review): looks like a non-owning reference (raw const pointer) — confirm
// in the destructor/copy operations.
const Kiwi* kiwi = nullptr;
// Defaults to true. Presumably stores the constructor argument of the same
// name and controls whether Candidate::lmScore is filled — TODO confirm.
bool gatherLmScore = true;
// presumably morpheme string -> integer id, with id2morph as the inverse
// table — verify against addTokens().
UnorderedMap<std::u16string, size_t> morph2id;
Vector<std::u16string> id2morph;
// Flat storage of 16-bit values; presumably the encoded token ids of all
// added texts — TODO confirm.
Vector<uint16_t> buf;
// Signed 16-bit values; presumably per-token scores stored in a quantized
// form — TODO confirm scale and meaning.
Vector<int16_t> scores;
// presumably offsets separating the added documents — TODO confirm.
Vector<size_t> docBoundaries;
// presumably token positions within the original text — TODO confirm.
Vector<uint32_t> positions;
// presumably the original texts as passed to addText()/addTexts() — TODO confirm.
Vector<std::u16string> rawDocs;

// Private helper that consumes a list of TokenInfo. The meaning of the
// returned size_t is not visible from this header.
size_t addTokens(const std::vector<TokenInfo>& tokens);

public:
// One extracted result. All numeric fields start at zero.
struct Candidate
{
// Candidate as a single string and as its individual tokens.
std::u16string text;
std::vector<std::u16string> tokens;
// presumably one score per entry of `tokens` — TODO confirm.
std::vector<float> tokenScores;
// presumably the occurrence count — TODO confirm.
size_t cnt = 0;
// presumably the document frequency — TODO confirm.
size_t df = 0;
// NOTE(review): how `score` is derived from the fields below is defined by
// the implementation, not here.
float score = 0;
// presumably normalized pointwise mutual information — TODO confirm.
float npmi = 0;
// presumably branching measures of the left/right context — TODO confirm.
float leftBranch = 0;
float rightBranch = 0;
// presumably a language-model score; see gatherLmScore — TODO confirm.
float lmScore = 0;
};

// Special members are user-declared and defined out of line. The move
// operations are declared noexcept.
NgramExtractor();
// Constructs from a Kiwi instance. Not marked explicit, so a `const Kiwi&`
// converts implicitly to NgramExtractor.
NgramExtractor(const Kiwi& kiwi, bool gatherLmScore = true);
NgramExtractor(const NgramExtractor&);
NgramExtractor(NgramExtractor&&) noexcept;
NgramExtractor& operator=(const NgramExtractor&);
NgramExtractor& operator=(NgramExtractor&&) noexcept;
~NgramExtractor();

// Add a single UTF-16 text, or texts obtained from a U16Reader. The meaning
// of the returned size_t is not visible from this header.
size_t addText(const std::u16string& text);
size_t addTexts(const U16Reader& reader);

// Returns the extracted candidates. Const-qualified. Defaults:
// maxCandidates = 1000, minCnt = 10, maxLength = 5, minScore = 1e-3,
// numWorkers = 1.
// NOTE(review): presumably these bound the result count, minimum Candidate::cnt,
// maximum n-gram length, minimum Candidate::score, and worker thread count
// respectively — confirm against the definition.
std::vector<Candidate> extract(size_t maxCandidates = 1000, size_t minCnt = 10, size_t maxLength = 5, float minScore = 1e-3, size_t numWorkers = 1) const;
};
}
12 changes: 1 addition & 11 deletions src/KTrie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -972,16 +972,6 @@ size_t kiwi::splitByTrie(
if (curNode->fail())
{
curNode = curNode->fail();
for (auto submatcher = curNode; submatcher; submatcher = submatcher->fail())
{
const Form* cand = submatcher->val(trie);
if (!cand) break;
else if (!trie.hasSubmatch(cand))
{
zCodaFollowable = zCodaFollowable || getZCodaAppendable<typoTolerant>(cand, formBase);
if (!insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces)) break;
}
}
nextNode = curNode->template nextOpt<arch>(trie, c);
}
else
Expand Down Expand Up @@ -1161,7 +1151,7 @@ size_t kiwi::splitByTrie(
const Form* cand = node.second->val(trie);
if (cand && !trie.hasSubmatch(cand))
{
insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces, 0, 0, lengtheningTypoCost * node.first, node.first);
insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces, 0, 0, lengtheningTypoCost * (3 + node.first), node.first);
}
}
}
Expand Down
21 changes: 19 additions & 2 deletions src/Kiwi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -775,8 +775,25 @@ namespace kiwi
spStateCnt[r.curState]++;
validTarget++;
}
ret.erase(ret.begin() + validTarget, ret.end());
spStatesByRet.erase(spStatesByRet.begin() + validTarget, spStatesByRet.end());
Vector<size_t> idx(validTarget);
iota(idx.begin(), idx.end(), 0);
sort(idx.begin(), idx.end(), [&](size_t a, size_t b) { return ret[a].second > ret[b].second; });

Vector<TokenResult> sortedRet;
Vector<SpecialState> sortedSpStatesByRet;
const size_t maxCands = min(topN * 2, validTarget);
for (size_t i = 0; i < maxCands; ++i)
{
sortedRet.emplace_back(move(ret[idx[i]]));
sortedSpStatesByRet.emplace_back(spStatesByRet[idx[i]]);
}
ret.clear();
spStatesByRet.clear();
for (size_t i = 0; i < maxCands; ++i)
{
ret.emplace_back(move(sortedRet[i]));
spStatesByRet.emplace_back(sortedSpStatesByRet[i]);
}
}

inline void makePretokenizedSpanGroup(
Expand Down
Loading
Loading