Skip to content

Commit

Permalink
util/collate: implement utf8mb4_0900_ai_ci collation (#45650)
Browse files Browse the repository at this point in the history
close #37566
  • Loading branch information
YangKeao authored Aug 15, 2023
1 parent d426bcd commit 6fd1924
Show file tree
Hide file tree
Showing 36 changed files with 64,000 additions and 115 deletions.
3 changes: 3 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@

# Declare files that will always have LF line endings on checkout.
*.y text eol=lf

util/collate/unicode_0*_ci.go linguist-generated=true
util/collate/ucadata/unicode_*_data.go linguist-generated=true
39 changes: 39 additions & 0 deletions LICENSES/Unicode-DFS-2016-LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE

COPYRIGHT AND PERMISSION NOTICE

Copyright © 1991-2023 Unicode, Inc.

NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.

Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.

THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.

IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.

Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
5 changes: 5 additions & 0 deletions build/nogo_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -1238,20 +1238,23 @@
"exclude_files": {
"parser/parser.go": "parser/parser.go code",
".*_test.go": "ignore test code",
".*_generated\\.go$": "ignore generated code",
"external/": "no need to vet third party code"
}
},
"deferrecover": {
"exclude_files": {
"parser/parser.go": "parser/parser.go code",
".*_test.go": "ignore test code",
".*_generated\\.go$": "ignore generated code",
"external/": "no need to vet third party code"
}
},
"QF1002": {
"exclude_files": {
"parser/parser.go": "parser/parser.go code",
".*_test.go": "ignore test code",
".*_generated\\.go$": "ignore generated code",
"external/": "no need to vet third party code"
}
},
Expand All @@ -1260,6 +1263,7 @@
"parser/parser.go": "parser/parser.go code",
".*_test.go": "ignore test code",
"external/": "no need to vet third party code",
".*_generated\\.go$": "ignore generated code",
"/cgo/": "no need to vet third party code for cgo"
}
},
Expand All @@ -1268,6 +1272,7 @@
"parser/parser.go": "parser/parser.go code",
".*_test.go": "ignore test code",
"external/": "no need to vet third party code",
".*_generated\\.go$": "ignore generated code",
"/cgo/": "no need to vet third party code for cgo"
}
}
Expand Down
2 changes: 2 additions & 0 deletions cmd/explaintest/r/collation_misc_enabled.result
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ latin1 47 1
utf8 83 1
utf8 33 1
utf8 192 1
utf8mb4 255 1
utf8mb4 46 1
utf8mb4 45 1
utf8mb4 224 1
Expand All @@ -128,6 +129,7 @@ latin1_bin latin1 47 Yes Yes 1
utf8_bin utf8 83 Yes Yes 1
utf8_general_ci utf8 33 Yes 1
utf8_unicode_ci utf8 192 Yes 1
utf8mb4_0900_ai_ci utf8mb4 255 Yes 1
utf8mb4_bin utf8mb4 46 Yes Yes 1
utf8mb4_general_ci utf8mb4 45 Yes 1
utf8mb4_unicode_ci utf8mb4 224 Yes 1
Expand Down
2 changes: 2 additions & 0 deletions ddl/table_modify_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ func TestCreateTable(t *testing.T) {
tk.MustGetErrCode("create table t_enum (a enum('abc','Abc')) charset=utf8 collate=utf8_general_ci;", errno.ErrDuplicatedValueInType)
tk.MustGetErrCode("create table t_enum (a enum('e','E')) charset=utf8 collate=utf8_unicode_ci;", errno.ErrDuplicatedValueInType)
tk.MustGetErrCode("create table t_enum (a enum('ss','ß')) charset=utf8 collate=utf8_unicode_ci;", errno.ErrDuplicatedValueInType)
tk.MustGetErrCode("create table t_enum (a enum('æ','ae')) charset=utf8mb4 collate=utf8mb4_0900_ai_ci;", errno.ErrDuplicatedValueInType)
// test for set column
tk.MustGetErrCode("create table t_enum (a set('e','e'));", errno.ErrDuplicatedValueInType)
tk.MustGetErrCode("create table t_enum (a set('e','E')) charset=utf8 collate=utf8_general_ci;", errno.ErrDuplicatedValueInType)
Expand All @@ -92,6 +93,7 @@ func TestCreateTable(t *testing.T) {
tk.MustGetErrCode("create table t_enum (a set('e','E')) charset=utf8 collate=utf8_unicode_ci;", errno.ErrDuplicatedValueInType)
tk.MustGetErrCode("create table t_enum (a set('ss','ß')) charset=utf8 collate=utf8_unicode_ci;", errno.ErrDuplicatedValueInType)
tk.MustGetErrMsg("create table t_enum (a enum('ss','ß')) charset=utf8 collate=utf8_unicode_ci;", "[types:1291]Column 'a' has duplicated value 'ß' in ENUM")
tk.MustGetErrCode("create table t_enum (a set('æ','ae')) charset=utf8mb4 collate=utf8mb4_0900_ai_ci;", errno.ErrDuplicatedValueInType)

// test for table option "union" not supported
tk.MustExec("use test")
Expand Down
1 change: 1 addition & 0 deletions executor/test/seqtest/seq_executor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1202,6 +1202,7 @@ func TestShowForNewCollations(t *testing.T) {
"utf8_bin utf8 83 Yes Yes 1",
"utf8_general_ci utf8 33 Yes 1",
"utf8_unicode_ci utf8 192 Yes 1",
"utf8mb4_0900_ai_ci utf8mb4 255 Yes 1",
"utf8mb4_bin utf8mb4 46 Yes Yes 1",
"utf8mb4_general_ci utf8mb4 45 Yes 1",
"utf8mb4_unicode_ci utf8mb4 224 Yes 1",
Expand Down
69 changes: 42 additions & 27 deletions expression/builtin_like_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,37 +100,40 @@ func TestRegexp(t *testing.T) {
func TestCILike(t *testing.T) {
ctx := createContext(t)
tests := []struct {
input string
pattern string
generalMatch int
unicodeMatch int
input string
pattern string
generalMatch int
unicodeMatch int
unicode0900Match int
}{
{"a", "", 0, 0},
{"a", "a", 1, 1},
{"a", "á", 1, 1},
{"a", "b", 0, 0},
{"aA", "Aa", 1, 1},
{"áAb", `Aa%`, 1, 1},
{"áAb", `%ab%`, 1, 1},
{"áAb", `%ab`, 1, 1},
{"ÀAb", "aA_", 1, 1},
{"áééá", "a_%a", 1, 1},
{"áééá", "a%_a", 1, 1},
{"áéá", "a_%a", 1, 1},
{"áéá", "a%_a", 1, 1},
{"áá", "a_%a", 0, 0},
{"áá", "a%_a", 0, 0},
{"áééáííí", "a_%a%", 1, 1},
{"a", "", 0, 0, 0},
{"a", "a", 1, 1, 1},
{"a", "á", 1, 1, 1},
{"a", "b", 0, 0, 0},
{"aA", "Aa", 1, 1, 1},
{"áAb", `Aa%`, 1, 1, 1},
{"áAb", `%ab%`, 1, 1, 1},
{"áAb", `%ab`, 1, 1, 1},
{"ÀAb", "aA_", 1, 1, 1},
{"áééá", "a_%a", 1, 1, 1},
{"áééá", "a%_a", 1, 1, 1},
{"áéá", "a_%a", 1, 1, 1},
{"áéá", "a%_a", 1, 1, 1},
{"áá", "a_%a", 0, 0, 0},
{"áá", "a%_a", 0, 0, 0},
{"áééáííí", "a_%a%", 1, 1, 1},
{"数汉据字库", "数%据_库", 1, 1, 1},

// performs matching on a per-character basis
// https://dev.mysql.com/doc/refman/5.7/en/string-comparison-functions.html#operator_like
{"ß", "s%", 1, 0},
{"ß", "%s", 1, 0},
{"ß", "ss", 0, 0},
{"ß", "s", 1, 0},
{"ss", "%ß%", 1, 0},
{"ß", "_", 1, 1},
{"ß", "__", 0, 0},
{"ß", "s%", 1, 0, 0},
{"ß", "%s", 1, 0, 0},
{"ß", "ss", 0, 0, 0},
{"ß", "s", 1, 0, 0},
{"ss", "%ß%", 1, 0, 0},
{"ß", "_", 1, 1, 1},
{"ß", "__", 0, 0, 0},
{"Ⱕ", "ⱕ", 0, 0, 1},
}
for _, tt := range tests {
comment := fmt.Sprintf(`for input = "%s", pattern = "%s"`, tt.input, tt.pattern)
Expand All @@ -155,4 +158,16 @@ func TestCILike(t *testing.T) {
require.NoError(t, err, comment)
testutil.DatumEqual(t, types.NewDatum(tt.unicodeMatch), r, comment)
}

for _, tt := range tests {
comment := fmt.Sprintf(`for input = "%s", pattern = "%s"`, tt.input, tt.pattern)
fc := funcs[ast.Like]
inputs := datumsToConstants(types.MakeDatums(tt.input, tt.pattern, 0))
f, err := fc.getFunction(ctx, inputs)
require.NoError(t, err, comment)
f.setCollator(collate.GetCollator("utf8mb4_0900_ai_ci"))
r, err := evalBuiltinFunc(f, chunk.Row{})
require.NoError(t, err, comment)
testutil.DatumEqual(t, types.NewDatum(tt.unicode0900Match), r, comment)
}
}
20 changes: 19 additions & 1 deletion expression/builtin_string_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2708,7 +2708,7 @@ func TestCIWeightString(t *testing.T) {
}
res, err := result.ToString()
require.NoError(t, err)
require.Equal(t, test.expect, res)
require.Equal(t, test.expect, res, "test case: '%s' '%s' %d", test.str, test.padding, test.length)
}
}

Expand Down Expand Up @@ -2746,6 +2746,24 @@ func TestCIWeightString(t *testing.T) {
{"中", "BINARY", 5, "中\x00\x00"},
}

unicode0900Tests := []weightStringTest{
{"aAÁàãăâ", "NONE", 0, "\x1cG\x1cG\x1cG\x1cG\x1cG\x1cG\x1cG"},
{"中", "NONE", 0, "\xfb\x40\xce\x2d"},
{"a", "CHAR", 5, "\x1c\x47\x02\x09\x02\x09\x02\x09\x02\x09"},
{"a ", "CHAR", 5, "\x1c\x47\x02\x09\x02\x09\x02\x09\x02\x09"},
{"中", "CHAR", 5, "\xfb\x40\xce\x2d\x02\x09\x02\x09\x02\x09\x02\x09"},
{"中 ", "CHAR", 5, "\xfb\x40\xce\x2d\x02\x09\x02\x09\x02\x09\x02\x09"},
{"a", "BINARY", 1, "a"},
{"ab", "BINARY", 1, "a"},
{"a", "BINARY", 5, "a\x00\x00\x00\x00"},
{"a ", "BINARY", 5, "a \x00\x00\x00"},
{"中", "BINARY", 1, "\xe4"},
{"中", "BINARY", 2, "\xe4\xb8"},
{"中", "BINARY", 3, "中"},
{"中", "BINARY", 5, "中\x00\x00"},
}

checkResult("utf8mb4_general_ci", generalTests)
checkResult("utf8mb4_unicode_ci", unicodeTests)
checkResult("utf8mb4_0900_ai_ci", unicode0900Tests)
}
9 changes: 9 additions & 0 deletions expression/collation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,15 @@ func TestCompareString(t *testing.T) {
require.NotEqual(t, 0, types.CompareString("ß", "s", "utf8_unicode_ci"))
require.Equal(t, 0, types.CompareString("ß", "ss", "utf8_unicode_ci"))

require.Equal(t, 0, types.CompareString("a", "A", "utf8mb4_0900_ai_ci"))
require.Equal(t, 0, types.CompareString("À", "A", "utf8mb4_0900_ai_ci"))
require.NotEqual(t, 0, types.CompareString("😜", "😃", "utf8mb4_0900_ai_ci"))
require.NotEqual(t, 0, types.CompareString("a ", "a ", "utf8mb4_0900_ai_ci"))
require.NotEqual(t, 0, types.CompareString("ß", "s", "utf8mb4_0900_ai_ci"))
require.Equal(t, 0, types.CompareString("ß", "ss", "utf8mb4_0900_ai_ci"))
require.NotEqual(t, 0, types.CompareString("\U000FFFFE", "\U000FFFFF", "utf8mb4_0900_ai_ci"))
require.Equal(t, 0, types.CompareString("æ", "ae", "utf8mb4_0900_ai_ci"))

require.NotEqual(t, 0, types.CompareString("a", "A", "binary"))
require.NotEqual(t, 0, types.CompareString("À", "A", "binary"))
require.NotEqual(t, 0, types.CompareString("😜", "😃", "binary"))
Expand Down
9 changes: 9 additions & 0 deletions expression/test/collation/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
load("@io_bazel_rules_go//go:def.bzl", "go_test")

# Go test target for this package; its only source file is uca_test.go and its
# only declared dependency is //testkit.
go_test(
name = "collation_test",
timeout = "short",
srcs = ["uca_test.go"],
# NOTE(review): flaky = True presumably asks Bazel to retry the test on failure — confirm this is still required.
flaky = True,
deps = ["//testkit"],
)
47 changes: 47 additions & 0 deletions expression/test/collation/uca_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collation

import (
"testing"

"github.com/pingcap/tidb/testkit"
)

// TestUTF8MB40900AICIOrder exercises GROUP BY / ORDER BY on a VARCHAR column
// declared with the utf8mb4_0900_ai_ci collation.
//
// Twenty strings are inserted into an auto_increment table (ids presumably run
// 1..20 in insertion order — verify against the mock store). The query is
// expected to yield eight groups, each identified by its smallest id, in the
// order listed in the final check.
func TestUTF8MB40900AICIOrder(t *testing.T) {
	tk := testkit.NewTestKit(t, testkit.CreateMockStore(t))
	tk.MustExec("USE test;")

	const (
		createTable = "create table t (id int primary key auto_increment, str VARCHAR(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci)"
		insertRows  = "insert into t(str) values ('カ'), ('カ'), ('abc'), ('abuFFFEc'), ('abⓒ'), ('𝒶bc'), ('𝕒bc'), ('ガ'), ('が'), ('abç'), ('äbc'), ('ヵ'), ('か'), ('Abc'), ('abC'), ('File-3'), ('file-12'), ('filé-110'), ('🍣'), ('🍺')"
	)
	// Set up the schema first, then the data, in that order.
	for _, stmt := range []string{createTable, insertRows} {
		tk.MustExec(stmt)
	}

	// One row per group: the minimum id of the strings grouped together.
	grouped := tk.MustQuery("select min(id) from t group by str order by str")
	grouped.Check(testkit.Rows("19", "20", "3", "4", "18", "17", "16", "1"))
}

// TestUTF8MB40900AICIStrFunc runs string functions over operands that are
// explicitly collated with utf8mb4_0900_ai_ci and checks the single-row result
// of each query.
func TestUTF8MB40900AICIStrFunc(t *testing.T) {
	tk := testkit.NewTestKit(t, testkit.CreateMockStore(t))
	tk.MustExec("USE test;")

	cases := []struct {
		query string // SQL to run
		want  string // expected value of the only row
	}{
		// test locate
		{
			query: "select LOCATE('bar' collate utf8mb4_0900_ai_ci, 'FOOBAR' collate utf8mb4_0900_ai_ci)",
			want:  "4",
		},
		// test regexp
		{
			query: "select 'FOOBAR' collate utf8mb4_0900_ai_ci REGEXP 'foo.*' collate utf8mb4_0900_ai_ci",
			want:  "1",
		},
	}
	for _, c := range cases {
		tk.MustQuery(c.query).Check(testkit.Rows(c.want))
	}
}
2 changes: 1 addition & 1 deletion server/tests/tidb_serial_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ func TestDefaultCharacterAndCollation(t *testing.T) {
variable string
except string
}{
{"collation_connection", "utf8mb4_bin"},
{"collation_connection", "utf8mb4_0900_ai_ci"},
{"character_set_connection", "utf8mb4"},
{"character_set_client", "utf8mb4"},
}
Expand Down
7 changes: 5 additions & 2 deletions util/collate/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,18 @@ go_library(
"gbk_chinese_ci_data.go",
"general_ci.go",
"pinyin_tidb_as_cs.go",
"unicode_ci.go",
"unicode_ci_data.go",
"unicode_0400_ci_generated.go",
"unicode_0400_ci_impl.go",
"unicode_0900_ai_ci_generated.go",
"unicode_0900_ai_ci_impl.go",
],
importpath = "github.com/pingcap/tidb/util/collate",
visibility = ["//visibility:public"],
deps = [
"//parser/charset",
"//parser/mysql",
"//parser/terror",
"//util/collate/ucadata",
"//util/dbterror",
"//util/hack",
"//util/logutil",
Expand Down
7 changes: 6 additions & 1 deletion util/collate/collate.go
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,8 @@ func runeLen(b byte) int {
// IsCICollation reports whether collate names a case-insensitive collation.
//
// The name is compared by exact string match against the collations listed in
// the switch below; any other value (including a differently-cased name or the
// empty string) yields false.
func IsCICollation(collate string) bool {
	switch collate {
	case "utf8_general_ci", "utf8mb4_general_ci",
		"utf8_unicode_ci", "utf8mb4_unicode_ci",
		"utf8mb4_0900_ai_ci",
		"gbk_chinese_ci":
		return true
	default:
		return false
	}
}

// ConvertAndGetBinCollation converts collator to binary collator
Expand All @@ -339,6 +340,8 @@ func ConvertAndGetBinCollation(collate string) Collator {
return GetCollator("utf8mb4_bin")
case "utf8mb4_unicode_ci":
return GetCollator("utf8mb4_bin")
case "utf8mb4_0900_ai_ci":
return GetCollator("utf8mb4_bin")
case "gbk_chinese_ci":
return GetCollator("gbk_bin")
}
Expand Down Expand Up @@ -407,6 +410,8 @@ func init() {
newCollatorIDMap[CollationName2ID("utf8_general_ci")] = &generalCICollator{}
newCollatorMap["utf8mb4_unicode_ci"] = &unicodeCICollator{}
newCollatorIDMap[CollationName2ID("utf8mb4_unicode_ci")] = &unicodeCICollator{}
newCollatorMap["utf8mb4_0900_ai_ci"] = &unicode0900AICICollator{}
newCollatorIDMap[CollationName2ID("utf8mb4_0900_ai_ci")] = &unicode0900AICICollator{}
newCollatorMap["utf8_unicode_ci"] = &unicodeCICollator{}
newCollatorIDMap[CollationName2ID("utf8_unicode_ci")] = &unicodeCICollator{}
newCollatorMap["utf8mb4_zh_pinyin_tidb_as_cs"] = &zhPinyinTiDBASCSCollator{}
Expand Down
Loading

0 comments on commit 6fd1924

Please sign in to comment.