From ecfda1b76b33567690d4fec5cd34eeb369559798 Mon Sep 17 00:00:00 2001 From: groverlynn Date: Fri, 2 Feb 2024 22:51:08 +0100 Subject: [PATCH] allow interpuncts in phrases --- src/rime/algo/encoder.cc | 70 ++++++++++++++++++++++++++++++++++++---- src/rime/algo/encoder.h | 5 +++ 2 files changed, 68 insertions(+), 7 deletions(-) diff --git a/src/rime/algo/encoder.cc b/src/rime/algo/encoder.cc index 64290424a8..8eb59773d8 100644 --- a/src/rime/algo/encoder.cc +++ b/src/rime/algo/encoder.cc @@ -15,6 +15,13 @@ namespace rime { static const int kEncoderDfsLimit = 32; static const int kMaxPhraseLength = 32; +// U+00B7 MIDDLE DOT, U+2027 HYPHENATION POINT, U+2010 HYPHEN, +// U+FF0D FULLWIDTH HYPHEN-MINUS, U+FF0C FULLWIDTH COMMA +// U+FF08 FULLWIDTH LEFT PARENTHESIS, U+FF09 FULLWIDTH RIGHT PARENTHESIS +static const string& kDefaultFreePuncts = + "\xc2\xb7\xe2\x80\xa7\xe2\x80\x90\xef\xbc\x8d\xef\xbc\x8c" + "\xef\xbc\x88\xef\xbc\x89"; + string RawCode::ToString() const { return strings::join(*this, " "); } @@ -24,6 +31,28 @@ void RawCode::FromString(const string& code_str) { strings::split(code_str, " ", strings::SplitBehavior::SkipToken); } +// strip certain "free punctuations" from the phrase, whose encoding is then +// used in lieu of that of the original phrase; users get these punctuations +// "for free" +string Encoder::StripPuncts(const string& phrase) { + // in case phrase is one of the "free puncts", do not strip one off itself + if (free_puncts_.find(phrase) != string::npos) + return phrase; + string stripped_phrase; + size_t start_pos = 0; + while (start_pos < phrase.length()) { + const char* grapheme_start = phrase.c_str() + start_pos; + const char* grapheme_end = grapheme_start; + utf8::unchecked::next(grapheme_end); + size_t grapheme_len = grapheme_end - grapheme_start; + string grapheme(grapheme_start, grapheme_len); + if (free_puncts_.find(grapheme) == string::npos) + stripped_phrase += grapheme; + start_pos += grapheme_len; + } + return stripped_phrase; +} + TableEncoder::TableEncoder(PhraseCollector* collector) : Encoder(collector), loaded_(false), max_phrase_length_(0) {} @@ -48,6 +77,7 @@ bool TableEncoder::LoadSettings(Config* config) { encoding_rules_.clear(); exclude_patterns_.clear(); tail_anchor_.clear(); + free_puncts_.clear(); if (!config) return false; @@ -101,6 +131,13 @@ bool TableEncoder::LoadSettings(Config* config) { } config->GetString("encoder/tail_anchor", &tail_anchor_); + // an empty string means no free puncts, or all characters must be encoded + // however, if no setting is found, adopts the default puncts + if (!config->GetString("encoder/free_puncts", &free_puncts_) || + free_puncts_ == "default" || free_puncts_ == "preset") { + free_puncts_ = kDefaultFreePuncts; + } + loaded_ = !encoding_rules_.empty(); return loaded_; } @@ -234,8 +271,10 @@ int TableEncoder::CalculateCodeIndex(const string& code, int index, int start) { } bool TableEncoder::EncodePhrase(const string& phrase, const string& value) { + string stripped_phrase = TableEncoder::StripPuncts(phrase); size_t phrase_length = utf8::unchecked::distance( - phrase.c_str(), phrase.c_str() + phrase.length()); + stripped_phrase.c_str(), + stripped_phrase.c_str() + stripped_phrase.length()); if (static_cast(phrase_length) > max_phrase_length_) return false; @@ -249,7 +288,8 @@ bool TableEncoder::DfsEncode(const string& phrase, size_t start_pos, RawCode* code, int* limit) { - if (start_pos == phrase.length()) { + string stripped_phrase = TableEncoder::StripPuncts(phrase); + if (start_pos == stripped_phrase.length()) { if (limit) { --*limit; } @@ -265,7 +305,7 @@ bool TableEncoder::DfsEncode(const string& phrase, return false; } } - const char* word_start = phrase.c_str() + start_pos; + const char* word_start = stripped_phrase.c_str() + start_pos; const char* word_end = word_start; utf8::unchecked::next(word_end); size_t word_len = word_end - word_start; @@ -291,9 +331,24 @@ bool TableEncoder::DfsEncode(const string& phrase, ScriptEncoder::ScriptEncoder(PhraseCollector* collector) : Encoder(collector) {} +bool ScriptEncoder::LoadSettings(Config* config) { + free_puncts_.clear(); + + if (!config) + return false; + + if (!config->GetString("encoder/free_puncts", &free_puncts_) || + free_puncts_ == "default" || free_puncts_ == "preset") { + free_puncts_ = kDefaultFreePuncts; + } + return true; +} + bool ScriptEncoder::EncodePhrase(const string& phrase, const string& value) { + string stripped_phrase = ScriptEncoder::StripPuncts(phrase); size_t phrase_length = utf8::unchecked::distance( - phrase.c_str(), phrase.c_str() + phrase.length()); + stripped_phrase.c_str(), + stripped_phrase.c_str() + stripped_phrase.length()); if (static_cast(phrase_length) > kMaxPhraseLength) return false; @@ -307,7 +362,8 @@ bool ScriptEncoder::DfsEncode(const string& phrase, size_t start_pos, RawCode* code, int* limit) { - if (start_pos == phrase.length()) { + string stripped_phrase = ScriptEncoder::StripPuncts(phrase); + if (start_pos == stripped_phrase.length()) { if (limit) { --*limit; } @@ -315,8 +371,8 @@ bool ScriptEncoder::DfsEncode(const string& phrase, return true; } bool ret = false; - for (size_t k = phrase.length() - start_pos; k > 0; --k) { - string word(phrase.substr(start_pos, k)); + for (size_t k = stripped_phrase.length() - start_pos; k > 0; --k) { + string word(stripped_phrase.substr(start_pos, k)); vector translations; if (collector_->TranslateWord(word, &translations)) { for (const string& x : translations) { diff --git a/src/rime/algo/encoder.h b/src/rime/algo/encoder.h index 00290fe488..9d40567059 100644 --- a/src/rime/algo/encoder.h +++ b/src/rime/algo/encoder.h @@ -46,6 +46,9 @@ class Encoder { protected: PhraseCollector* collector_; + + string StripPuncts(const string& phrase); + string free_puncts_; }; // Aa : code at index 0 for character at index 0 @@ -103,6 +106,8 @@ class ScriptEncoder : public Encoder { public: ScriptEncoder(PhraseCollector* collector); + bool LoadSettings(Config* config); + bool EncodePhrase(const string& phrase, const string& value); private: