From c6658fec1afe23dc9b376e6e82959544a2731f0d Mon Sep 17 00:00:00 2001 From: "Hamster.Xu" <449211678@qq.com> Date: Tue, 9 Jan 2024 12:07:32 +0800 Subject: [PATCH] Update EdgeNgramTokenizer.php Fix the EdgeNgramTokenizer split word count for multibyte text: bound the n-gram loop with mb_strlen() (character count) instead of strlen() (byte count), so it is consistent with the mb_substr() call that builds each n-gram. --- src/Support/EdgeNgramTokenizer.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Support/EdgeNgramTokenizer.php b/src/Support/EdgeNgramTokenizer.php index 2cd6858..c29ee7f 100644 --- a/src/Support/EdgeNgramTokenizer.php +++ b/src/Support/EdgeNgramTokenizer.php @@ -13,7 +13,7 @@ public function tokenize($text, $stopwords = []) $splits = preg_split($this->getPattern(), $text, -1, PREG_SPLIT_NO_EMPTY); foreach ($splits as $split) { - for ($i = 2; $i <= strlen($split); $i++) { + for ($i = 2; $i <= mb_strlen($split); $i++) { $ngrams[] = mb_substr($split, 0, $i); } }