Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

사이시옷 분석 기능 보강 #199

Merged
merged 5 commits into from
Oct 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.12)

project(kiwi VERSION 0.19.1 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")
project(kiwi VERSION 0.20.0 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")

set ( CMAKE_CXX_STANDARD 14 )
set ( CMAKE_VERBOSE_MAKEFILE true )
Expand Down
8 changes: 4 additions & 4 deletions ModelGenerator/morphemes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6370,7 +6370,7 @@
와인 NNG 270
은혜 NNG 270
공평 NNG 270
횟수 NNG 270
횟수 NNG 270 complex 회/NNG ᆺ/Z_SIOT 수/NNG 010112
반짝이 VV 270 complex 반짝/MAG 이/XSV 0223
서랍 NNG 270
허무 NNG 270
Expand Down Expand Up @@ -14689,7 +14689,7 @@ LG화학 NNP 82
조흥은행 NNP 75
노라 EC 75
영양가 NNG 75
툇마루 NNG 75
툇마루 NNG 75 complex 퇴/NNG ᆺ/Z_SIOT 마루/NNG 010113
오묘 XR 75
의미심장 XR 75
주인집 NNG 75
Expand Down Expand Up @@ -16670,7 +16670,7 @@ LG화학 NNP 82
막중 XR 61
엄중 XR 61
경박 XR 61
셋방 NNG 61
셋방 NNG 61 complex 세/NNG ᆺ/Z_SIOT 방/NNG 010112
애무 NNG 61
천진 NNG 61
맞아들이 VV 61 complex 맞/VV 어/EC 들이/VV 011224
Expand Down Expand Up @@ -23962,7 +23962,7 @@ SK그룹 NNP 33
판매자 NNG 33
차두리 NNP 33
자필 NNG 33
곳간 NNG 33
곳간 NNG 33 complex 고/NNG ᆺ/Z_SIOT 간/NNB 010112
에베레스트 NNP 33
국전 NNG 33
온존 NNG 33
Expand Down
2 changes: 1 addition & 1 deletion bindings/java/kr/pe/bab2min/Kiwi.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

public class Kiwi implements AutoCloseable {
private long _inst;
final private static String _version = "0.19.1";
final private static String _version = "0.20.0";

public static class Match {
final static public int none = 0,
Expand Down
2 changes: 1 addition & 1 deletion include/kiwi/Form.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file Form.h
* @author bab2min ([email protected])
* @brief 형태 및 형태소에 관한 정보를 담는 구조체들이 선언된 헤더
* @version 0.19.0
* @version 0.20.0
* @date 2024-07-01
*
*
Expand Down
2 changes: 1 addition & 1 deletion include/kiwi/Kiwi.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file Kiwi.h
* @author bab2min ([email protected])
* @brief Kiwi C++ API를 담고 있는 헤더 파일
* @version 0.19.0
* @version 0.20.0
* @date 2024-07-01
*
*
Expand Down
4 changes: 2 additions & 2 deletions include/kiwi/Macro.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#define KIWI_STR(x) KIWI_STR_HELPER(x)

#define KIWI_VERSION_MAJOR 0
#define KIWI_VERSION_MINOR 19
#define KIWI_VERSION_PATCH 1
#define KIWI_VERSION_MINOR 20
#define KIWI_VERSION_PATCH 0

#define KIWI_VERSION_STRING KIWI_STR(KIWI_VERSION_MAJOR) "." KIWI_STR(KIWI_VERSION_MINOR) "." KIWI_STR(KIWI_VERSION_PATCH)
2 changes: 1 addition & 1 deletion include/kiwi/SwTokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file SwTokenizer.h
* @author bab2min ([email protected])
* @brief Subword Tokenizer
* @version 0.19.0
* @version 0.20.0
* @date 2024-07-01
*
*
Expand Down
2 changes: 1 addition & 1 deletion include/kiwi/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file Types.h
* @author bab2min ([email protected])
* @brief Kiwi C++ API에 쓰이는 주요 타입들을 모아놓은 헤더 파일
* @version 0.19.0
* @version 0.20.0
* @date 2024-07-01
*
*
Expand Down
2 changes: 1 addition & 1 deletion include/kiwi/TypoTransformer.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file TypoTransformer.h
* @author bab2min ([email protected])
* @brief 오타 교정에 사용되는 TypoTransformer 및 관련 클래스들을 정의합니다.
* @version 0.19.0
* @version 0.20.0
* @date 2024-09-15
*
*
Expand Down
2 changes: 1 addition & 1 deletion include/kiwi/capi.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file capi.h
* @author bab2min ([email protected])
* @brief Kiwi C API를 담고 있는 헤더 파일
* @version 0.19.0
* @version 0.20.0
* @date 2024-07-01
*
*
Expand Down
4 changes: 2 additions & 2 deletions models/base/sj.morph
Git LFS file not shown
1 change: 1 addition & 0 deletions src/Kiwi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1065,6 +1065,7 @@ namespace kiwi
false,
!!(matchOptions & Match::splitComplex),
!!(matchOptions & Match::splitSaisiot),
!!(matchOptions & Match::mergeSaisiot),
blocklist
);
insertPathIntoResults(ret, spStatesByRet, res, topN, matchOptions, integrateAllomorph, positionTable, wordPositions, pretokenizedGroup, nodeInWhichPretokenized);
Expand Down
46 changes: 29 additions & 17 deletions src/PathEvaluator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ namespace kiwi
bool openEnd,
bool splitComplex = false,
bool splitSaisiot = false,
bool mergeSaisiot = false,
const std::unordered_set<const Morpheme*>* blocklist = nullptr
);

Expand All @@ -136,6 +137,7 @@ namespace kiwi
const Vector<SpecialState>& prevSpStates,
bool splitComplex = false,
bool splitSaisiot = false,
bool mergeSaisiot = false,
const std::unordered_set<const Morpheme*>* blocklist = nullptr
);

Expand Down Expand Up @@ -525,7 +527,7 @@ namespace kiwi

// fill the rest information of resultOut
newPath.wid = lastSeqId;
if (curMorph->chunks.empty() || curMorph->complex)
if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)
{
newPath.combineSocket = curMorph->combineSocket;
newPath.ownFormId = ownFormId;
Expand Down Expand Up @@ -570,7 +572,7 @@ namespace kiwi

// fill the rest information of resultOut
newPath.wid = lastSeqId;
if (curMorph->chunks.empty() || curMorph->complex)
if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)
{
newPath.combineSocket = curMorph->combineSocket;
newPath.ownFormId = ownFormId;
Expand Down Expand Up @@ -622,7 +624,7 @@ namespace kiwi

// fill the rest information of resultOut
newPath.wid = lastSeqId;
if (curMorph->chunks.empty() || curMorph->complex)
if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)
{
newPath.combineSocket = curMorph->combineSocket;
newPath.ownFormId = ownFormId;
Expand Down Expand Up @@ -659,7 +661,7 @@ namespace kiwi

const Morpheme* lastMorph;
Wid firstWid;
if (curMorph->chunks.empty() || curMorph->complex)
if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)
{
lastMorph = curMorph->getCombined() ? curMorph->getCombined() : curMorph;
firstWid = curMorph->lmMorphemeId;
Expand Down Expand Up @@ -691,8 +693,10 @@ namespace kiwi
{
for (auto& prevPath : cache[prev - startNode])
{
// 사이시옷 뒤에 명사가 아닌 태그가 오는 경우 제외
if (prevPath.morpheme->tag == POSTag::z_siot && !isNNClass(curMorph->tag))
// 사이시옷 뒤에 명사가 아닌 태그가 오거나 공백이 있는 경우 제외
if (prevPath.morpheme->tag == POSTag::z_siot && (
!isNNClass(curMorph->tag) || prev->endPos < node->startPos
))
{
continue;
}
Expand All @@ -701,7 +705,7 @@ namespace kiwi
if (prevPath.combineSocket)
{
// merge <v> <chunk> with only the same socket
if (prevPath.combineSocket != curMorph->combineSocket || (curMorph->chunks.empty() || curMorph->complex))
if (prevPath.combineSocket != curMorph->combineSocket || (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot))
{
continue;
}
Expand Down Expand Up @@ -747,7 +751,7 @@ namespace kiwi
}

auto cLmState = prevPath.lmState;
if (curMorph->combineSocket && (curMorph->chunks.empty() || curMorph->complex))
if (curMorph->combineSocket && (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot))
{
// no-op
}
Expand All @@ -760,7 +764,7 @@ namespace kiwi
}
float ll = cLmState.next(langMdl, firstWid);
candScore += ll;
if (!(curMorph->chunks.empty() || curMorph->complex))
if (!(curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot))
{
for (size_t i = 1; i < curMorph->chunks.size(); ++i)
{
Expand Down Expand Up @@ -833,6 +837,7 @@ namespace kiwi
const Vector<SpecialState>& prevSpStates,
bool splitComplex,
bool splitSaisiot,
bool mergeSaisiot,
const std::unordered_set<const Morpheme*>* blocklist
)
{
Expand Down Expand Up @@ -893,6 +898,11 @@ namespace kiwi
// 사이시옷(zSiot)을 위한 지름길
if (curMorph->tag == POSTag::z_siot)
{
if (!(splitSaisiot || mergeSaisiot))
{
continue;
}

for (auto* prev = node->getPrev(); prev; prev = prev->getSibling())
{
for (auto& p : cache[prev - startNode])
Expand All @@ -912,7 +922,7 @@ namespace kiwi
}

// if the morpheme has chunk set
if (!(curMorph->chunks.empty()|| curMorph->complex))
if (!(curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot))
{
// '하다/하게/하지'가 '다/게/지'로 축약된 경우인데 앞에 공백이 있는 경우는 탐색후보에서 제외
if (node->prev && node[-(int)node->prev].endPos < node->startPos
Expand Down Expand Up @@ -1019,13 +1029,13 @@ namespace kiwi
float scoreDiff = cur->accScore - prev->accScore;
float typoCostDiff = cur->accTypoCost - prev->accTypoCost;
auto morpheme = cur->morpheme;
size_t numNewTokens = (morpheme->chunks.empty() || morpheme->complex) ? 1 : morpheme->chunks.size();
const size_t numNewTokens = (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot) ? 1 : morpheme->chunks.size();
auto& gNode = graph[csearcher(cur)];
scoreDiff += typoCostDiff * typoCostWeight;
scoreDiff /= numNewTokens;
typoCostDiff /= numNewTokens;

if (morpheme->chunks.empty() || morpheme->complex)
if (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot)
{
ret.emplace_back(
unifyMorpheme(morpheme),
Expand Down Expand Up @@ -1093,6 +1103,7 @@ namespace kiwi
bool openEnd,
bool splitComplex,
bool splitSaisiot,
bool mergeSaisiot,
const std::unordered_set<const Morpheme*>* blocklist
)
{
Expand Down Expand Up @@ -1148,24 +1159,24 @@ namespace kiwi
{
evalPath<LmState>(kw, startNode, node, topN, cache,
ownFormList, i, ownFormId, node->form->candidate,
false, uniqStates, splitComplex, splitSaisiot, blocklist);
false, uniqStates, splitComplex, splitSaisiot, mergeSaisiot, blocklist);
if (all_of(node->form->candidate.begin(), node->form->candidate.end(), [](const Morpheme* m)
{
return m->combineSocket || (!m->chunks.empty() && !m->complex);
return m->combineSocket || !(m->chunks.empty() || m->complex || m->saisiot);
}))
{
ownFormList.emplace_back(node->form->form);
ownFormId = ownFormList.size();
evalPath<LmState>(kw, startNode, node, topN, cache,
ownFormList, i, ownFormId, unknownNodeLCands,
true, uniqStates, splitComplex, splitSaisiot, blocklist);
true, uniqStates, splitComplex, splitSaisiot, mergeSaisiot, blocklist);
};
}
else
{
evalPath<LmState>(kw, startNode, node, topN, cache,
ownFormList, i, ownFormId, unknownNodeCands,
true, uniqStates, splitComplex, splitSaisiot, blocklist);
true, uniqStates, splitComplex, splitSaisiot, mergeSaisiot, blocklist);
}

#ifdef DEBUG_PRINT
Expand All @@ -1186,13 +1197,14 @@ namespace kiwi
for (auto& p : cache[prev - startNode])
{
if (p.combineSocket) continue;
if (!p.morpheme->chunks.empty() && !p.morpheme->complex)
if (!(p.morpheme->chunks.empty() || p.morpheme->complex || p.morpheme->saisiot))
{
if (p.morpheme->chunks.size() <= (p.morpheme->combineSocket ? 2 : 1))
{
if (!FeatureTestor::isMatched(nullptr, p.morpheme->vowel)) continue;
}
}
if (p.morpheme->tag == POSTag::z_siot) continue;

float c = p.accScore + (openEnd ? 0 : p.lmState.next(kw->langMdl, eosId));
if (p.spState.singleQuote) c -= 2;
Expand Down
11 changes: 11 additions & 0 deletions test/test_cpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -994,13 +994,24 @@ TEST(KiwiCpp, ZSiot)
auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
EXPECT_FALSE(std::any_of(resNone.first.begin(), resNone.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; }));
EXPECT_EQ(resSplit.first.size(), 3);
EXPECT_EQ(resSplit.first[0].tag, POSTag::nng);
EXPECT_EQ(resSplit.first[1].tag, POSTag::z_siot);
EXPECT_EQ(resSplit.first[2].tag, POSTag::nng);
EXPECT_EQ(resMerge.first.size(), 1);
EXPECT_EQ(resMerge.first[0].tag, POSTag::nng);
}

for (auto s : {u"발렛 파킹", u"미닛"})
{
auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
EXPECT_EQ(resNone.second, resSplit.second);
EXPECT_EQ(resNone.second, resMerge.second);
EXPECT_FALSE(std::any_of(resSplit.first.begin(), resSplit.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; }));
}
}

TEST(KiwiCpp, AnalyzeWithWordPosition)
Expand Down
Loading