merge code from main

matrixorigin · Nov 30, 2024 · 49b0a36 · 49b0a36
2 parents dc8c8a8 + a50a3fd
commit 49b0a36
Show file tree

Hide file tree

Showing 58 changed files with 127,407 additions and 534 deletions.
diff --git a/README.md b/README.md
@@ -60,7 +60,7 @@ Contents
 MatrixOne is a hyper-converged cloud & edge native distributed database with a structure that separates storage, computation, and transactions to form a consolidated HSTAP data engine. This engine enables a single database system to accommodate diverse business loads such as OLTP, OLAP, and stream computing. It also supports deployment and utilization across public, private, and edge clouds, ensuring compatibility with diverse infrastructures.
 
 <p align="center">
-  <img alt="MatrixOne" height="450" src="https://github.com/matrixorigin/artwork/blob/main/docs/overview/architecture/archi-en-1.png?raw=true">
+  <img alt="MatrixOne" height="450" src="https://github.com/matrixorigin/artwork/blob/main/docs/overview/architecture/architeture241113_en.png?raw=true">
 </p>
 
 ## 🎯 <a id="key-features">Key Features</a>

diff --git a/README_CN.md b/README_CN.md
@@ -58,7 +58,7 @@
 
 MatrixOne 是一款超融合异构分布式数据库，通过云原生化和存储、计算、事务分离的架构构建 HSTAP 超融合数据引擎，实现单一数据库系统支持 OLTP、OLAP、流计算等多种业务负载，并且支持公有云、私有云、边缘云部署和使用，实现异构基础设施的兼容。
 <p align="center">
-  <img alt="MatrixOne" height="500" src="https://community-shared-data-1308875761.cos.ap-beijing.myqcloud.com/artwork/docs/overview/mo-new-arch.png?raw=true">
+  <img alt="MatrixOne" height="500" src="https://community-shared-data-1308875761.cos.ap-beijing.myqcloud.com/artwork/docs/overview/architecture.png?raw=true">
 </p>
 
 ## 🎯 <a id="key-features">核心特性</a>

diff --git a/go.mod b/go.mod
@@ -53,7 +53,6 @@ require (
 	github.com/lni/dragonboat/v4 v4.0.0-20220815145555-6f622e8bcbef
 	github.com/lni/goutils v1.3.1-0.20220604063047-388d67b4dbc4
 	github.com/lni/vfs v0.2.1-0.20220616104132-8852fd867376
-	github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78
 	github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d
 	github.com/matrixorigin/simdcsv v0.0.0-20230210060146-09b8e45209dd
 	github.com/minio/minio-go/v7 v7.0.78

diff --git a/go.sum b/go.sum
@@ -503,8 +503,6 @@ github.com/matrixorigin/goutils v1.3.1-0.20220604063047-388d67b4dbc4 h1:+SmZP2bG
 github.com/matrixorigin/goutils v1.3.1-0.20220604063047-388d67b4dbc4/go.mod h1:LIHvF0fflR+zyXUQFQOiHPpKANf3UIr7DFIv5CBPOoU=
 github.com/matrixorigin/memberlist v0.5.1-0.20230322082342-95015c95ee76 h1:MpmqMPooJ0Ea7W4ldIGbQV4D3z+sEiCu6C6aTibiwiQ=
 github.com/matrixorigin/memberlist v0.5.1-0.20230322082342-95015c95ee76/go.mod h1:yvyXLpo0QaGE59Y7hDTsTzDD25JYBZ4mHgHUZ8lrOI0=
-github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78 h1:1NvZ4SBw0lH7h38VhCVxYEa61K8N+0DBv9JQhAwU48Q=
-github.com/matrixorigin/monlp v0.0.0-20240825091235-be436dc30e78/go.mod h1:RQQhaM4xSocKuNi0ZvKZZAiErpINJgZrPB+vZDvBkeU=
 github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d h1:27vD3JGbrFmaQtDYQT/W1jFFr0xvipdwH5R4bZPGQdE=
 github.com/matrixorigin/mysql v1.8.2-0.20241106110439-6ac9ee94770d/go.mod h1:RJNMd/LBgWRCpGanqXvqjVaoYXeYBS+i0MSeoN3hBMo=
 github.com/matrixorigin/simdcsv v0.0.0-20230210060146-09b8e45209dd h1:DvqhuH3kOpsE6vXZA5WEaRNAUUUcf44S1p5VInbjdfU=

diff --git a/pkg/catalog/secondary_index_utils.go b/pkg/catalog/secondary_index_utils.go
@@ -178,7 +178,7 @@ func fullTextIndexParamsToMap(def *tree.FullTextIndex) (map[string]string, error
 	// fulltext index here
 	if def.IndexOption != nil {
 		parsername := strings.ToLower(def.IndexOption.ParserName)
-		if parsername != "ngram" && parsername != "default" && parsername != "json" {
+		if parsername != "ngram" && parsername != "default" && parsername != "json" && parsername != "json_value" {
 			return nil, moerr.NewInternalErrorNoCtx(fmt.Sprintf("invalid parser %s", parsername))
 		}
 		res["parser"] = parsername

diff --git a/pkg/container/bytejson/fttokenizer.go b/pkg/container/bytejson/fttokenizer.go
@@ -17,31 +17,39 @@ package bytejson
 import (
 	"iter"
 	"strconv"
+)
 
-	"github.com/matrixorigin/monlp/tokenizer"
+const (
+	MAX_TOKEN_SIZE = 127 // max token bytes kept by fillToken; longer input is truncated
 )
 
+type Token struct {
+	TokenBytes [1 + MAX_TOKEN_SIZE]byte // [0] holds the stored length, [1:] the token bytes (see fillToken)
+	TokenPos   int32                    // position value passed in by the tokenizer (fillToken's pos)
+	BytePos    int32                    // NOTE(review): not set by fillToken in this diff; meaning not visible here
+}
+
 // TokenizeValue tokenizes the values of the ByteJson object
 // note that we do not break word with space, do not normalize
-// case, 3-gram, etc etc, only truncate the string to 23 bytes.
-func (bj ByteJson) TokenizeValue(includeKey bool) iter.Seq[tokenizer.Token] {
-	return func(yield func(tokenizer.Token) bool) {
+// case, or build 3-grams; strings are only truncated to 127 bytes (MAX_TOKEN_SIZE).
+func (bj ByteJson) TokenizeValue(includeKey bool) iter.Seq[Token] {
+	return func(yield func(Token) bool) {
 		tokenizeOne(bj, 1, includeKey, yield)
 	}
 }
 
-func fillToken(t *tokenizer.Token, s []byte, pos int32) {
+func fillToken(t *Token, s []byte, pos int32) { // stores s length-prefixed in t.TokenBytes and records pos
 	copy(t.TokenBytes[1:], s)
-	if len(s) > tokenizer.MAX_TOKEN_SIZE {
-		t.TokenBytes[0] = tokenizer.MAX_TOKEN_SIZE
+	if len(s) > MAX_TOKEN_SIZE { // copy above kept only the first MAX_TOKEN_SIZE bytes of s
+		t.TokenBytes[0] = MAX_TOKEN_SIZE // so the recorded length is capped to match
 	} else {
 		t.TokenBytes[0] = byte(len(s))
 	}
 	t.TokenPos = pos
 }
 
-func tokenizeOne(bj ByteJson, pos int32, includeKey bool, yield func(tokenizer.Token) bool) int32 {
-	var t tokenizer.Token
+func tokenizeOne(bj ByteJson, pos int32, includeKey bool, yield func(Token) bool) int32 {
+	var t Token
 
 	switch bj.Type {
 	case TpCodeObject:

diff --git a/pkg/container/bytejson/fttokenizer_test.go b/pkg/container/bytejson/fttokenizer_test.go
@@ -16,9 +16,10 @@ package bytejson
 
 import (
 	"encoding/json"
+	"fmt"
 	"testing"
 
-	"github.com/matrixorigin/monlp/tokenizer"
+	"github.com/stretchr/testify/require"
 )
 
 type tokenTestCase struct {
@@ -27,15 +28,15 @@ type tokenTestCase struct {
 	tokensWithKey []string
 }
 
-func checkTokens(t *testing.T, tokens []tokenizer.Token, expected []string) {
+func checkTokens(t *testing.T, tokens []Token, expected []string) {
 	if len(tokens) != len(expected) {
 		t.Fatalf("expected %d tokens, got %d", len(expected), len(tokens))
 	}
 
 	for i := range tokens {
-		var tk tokenizer.Token
-		if len(expected[i]) > tokenizer.MAX_TOKEN_SIZE {
-			tk.TokenBytes[0] = byte(tokenizer.MAX_TOKEN_SIZE)
+		var tk Token
+		if len(expected[i]) > MAX_TOKEN_SIZE {
+			tk.TokenBytes[0] = byte(MAX_TOKEN_SIZE)
 		} else {
 			tk.TokenBytes[0] = byte(len(expected[i]))
 		}
@@ -61,8 +62,8 @@ func TestByteJson(t *testing.T) {
 		},
 		{
 			input:         `{"a": [1.2, 2.0], "b": [3, true, "hello"], "c": "abcdefghijklmnopqrstuvwxyz"}`,
-			tokens:        []string{"1.2", "2", "3", "hello", "abcdefghijklmnopqrstuvw"},
-			tokensWithKey: []string{"a", "1.2", "2", "b", "3", "hello", "c", "abcdefghijklmnopqrstuvw"},
+			tokens:        []string{"1.2", "2", "3", "hello", "abcdefghijklmnopqrstuvwxyz"},
+			tokensWithKey: []string{"a", "1.2", "2", "b", "3", "hello", "c", "abcdefghijklmnopqrstuvwxyz"},
 		},
 		{
 			input:         `{"a": "相见时难别亦难", "b": "I come, I see, I 征服", "c": "相见时难别亦难，东风无力百花残。 春蚕到死丝方尽，蜡炬成灰泪始干。"}`,
@@ -72,7 +73,7 @@ func TestByteJson(t *testing.T) {
 		{
 			input:         `{"a bcdefghijklmnopqrstuvwxyz": 1, "学而时习之，不亦说乎": "说什么说， 就你话多"}`,
 			tokens:        []string{"1", "说什么说， 就你话多"},
-			tokensWithKey: []string{"a bcdefghijklmnopqrstuv", "1", "学而时习之，不亦说乎", "说什么说， 就你话多"},
+			tokensWithKey: []string{"a bcdefghijklmnopqrstuvwxyz", "1", "学而时习之，不亦说乎", "说什么说， 就你话多"},
 		},
 	}
 
@@ -82,16 +83,31 @@ func TestByteJson(t *testing.T) {
 			t.Fatal(err)
 		}
 
-		var tokens []tokenizer.Token
+		var tokens []Token
 		for tk := range bj.TokenizeValue(false) {
 			tokens = append(tokens, tk)
 		}
 		checkTokens(t, tokens, tc.tokens)
 
-		var tokensWithKey []tokenizer.Token
+		var tokensWithKey []Token
 		for tk := range bj.TokenizeValue(true) {
 			tokensWithKey = append(tokensWithKey, tk)
 		}
 		checkTokens(t, tokensWithKey, tc.tokensWithKey)
 	}
 }
+
+func TestFillToken(t *testing.T) {
+	var tok Token
+	lv := "1234567890"
+	fmt.Printf("%s %d\n", lv, len(lv))
+
+	fillToken(&tok, []byte(lv), 0) // short input: length byte should equal len(lv)
+	require.Equal(t, 10, int(tok.TokenBytes[0]))
+
+	for i := 0; i < 20; i++ { // double lv 20 times so it far exceeds MAX_TOKEN_SIZE
+		lv += lv
+	}
+	fillToken(&tok, []byte(lv), 0) // oversized input: length byte should be capped at 127
+	require.Equal(t, 127, int(tok.TokenBytes[0]))
+}