From 3d267514d57952d0e47dab9f48e02e823d146baf Mon Sep 17 00:00:00 2001 From: xiongjiwei Date: Wed, 24 Nov 2021 17:07:50 +0800 Subject: [PATCH] expression: cast charset according to the function's resulting charset (#29905) --- .../r/new_character_set_builtin.result | 204 ++++++++++++++++++ cmd/explaintest/r/select.result | 4 + .../t/new_character_set_builtin.test | 59 +++++ cmd/explaintest/t/select.test | 5 + errno/errcode.go | 1 + errno/errname.go | 1 + expression/builtin.go | 11 +- expression/builtin_convert_charset.go | 192 ++++++++++++++--- expression/builtin_string.go | 2 +- expression/collation.go | 3 +- expression/constant_test.go | 43 +++- expression/distsql_builtin.go | 4 + expression/expression.go | 1 + expression/integration_test.go | 2 +- expression/scalar_function.go | 4 + types/datum.go | 4 +- 16 files changed, 495 insertions(+), 45 deletions(-) diff --git a/cmd/explaintest/r/new_character_set_builtin.result b/cmd/explaintest/r/new_character_set_builtin.result index ec7732486ed8e..a1e547c4219eb 100644 --- a/cmd/explaintest/r/new_character_set_builtin.result +++ b/cmd/explaintest/r/new_character_set_builtin.result @@ -286,3 +286,207 @@ select ord(a), ord(b), ord(c) from t; ord(a) ord(b) ord(c) 14989485 54992 228 set @@tidb_enable_vectorized_expression = false; +drop table if exists t; +create table t (a char(20) charset utf8mb4, b char(20) charset gbk, c binary(20)); +insert into t values ('一', '一', 0xe4b880); +insert into t values ('一', '一', 0xd2bb); +insert into t values ('一', '一', 0xe4ba8c); +insert into t values ('一', '一', 0xb6fe); +set @@tidb_enable_vectorized_expression = true; +select hex(concat(a, c)), hex(concat(b, c)) from t; +hex(concat(a, c)) hex(concat(b, c)) +E4B880E4B8800000000000000000000000000000000000 D2BBE4B8800000000000000000000000000000000000 +E4B880D2BB000000000000000000000000000000000000 D2BBD2BB000000000000000000000000000000000000 +E4B880E4BA8C0000000000000000000000000000000000 D2BBE4BA8C0000000000000000000000000000000000 
+E4B880B6FE000000000000000000000000000000000000 D2BBB6FE000000000000000000000000000000000000 +select hex(concat(a, 0xe4b880)), hex(concat(b, 0xd2bb)) from t; +hex(concat(a, 0xe4b880)) hex(concat(b, 0xd2bb)) +E4B880E4B880 D2BBD2BB +E4B880E4B880 D2BBD2BB +E4B880E4B880 D2BBD2BB +E4B880E4B880 D2BBD2BB +select a = 0xe4b880, b = 0xd2bb from t; +a = 0xe4b880 b = 0xd2bb +1 1 +1 1 +1 1 +1 1 +select a = c, b = c from t; +a = c b = c +0 0 +0 0 +0 0 +0 0 +select hex(insert(a, 1, 2, 0xe4ba8c)), hex(insert(b, 1, 2, 0xb6fe)) from t; +hex(insert(a, 1, 2, 0xe4ba8c)) hex(insert(b, 1, 2, 0xb6fe)) +E4BA8C B6FE +E4BA8C B6FE +E4BA8C B6FE +E4BA8C B6FE +select hex(insert(a, 1, 2, c)), hex(insert(b, 1, 2, c)) from t; +hex(insert(a, 1, 2, c)) hex(insert(b, 1, 2, c)) +E4B880000000000000000000000000000000000080 E4B8800000000000000000000000000000000000 +D2BB00000000000000000000000000000000000080 D2BB000000000000000000000000000000000000 +E4BA8C000000000000000000000000000000000080 E4BA8C0000000000000000000000000000000000 +B6FE00000000000000000000000000000000000080 B6FE000000000000000000000000000000000000 +select hex(lpad(a, 5, 0xe4ba8c)), hex(lpad(b, 5, 0xb6fe)) from t; +hex(lpad(a, 5, 0xe4ba8c)) hex(lpad(b, 5, 0xb6fe)) +E4BA8CE4BA8CE4BA8CE4BA8CE4B880 B6FEB6FEB6FEB6FED2BB +E4BA8CE4BA8CE4BA8CE4BA8CE4B880 B6FEB6FEB6FEB6FED2BB +E4BA8CE4BA8CE4BA8CE4BA8CE4B880 B6FEB6FEB6FEB6FED2BB +E4BA8CE4BA8CE4BA8CE4BA8CE4B880 B6FEB6FEB6FEB6FED2BB +select hex(lpad(a, 5, c)), hex(lpad(b, 5, c)) from t; +hex(lpad(a, 5, c)) hex(lpad(b, 5, c)) +E4B8E4B880 E4B880D2BB +D2BBE4B880 D2BB00D2BB +E4BAE4B880 E4BA8CD2BB +B6FEE4B880 B6FE00D2BB +select hex(rpad(a, 5, 0xe4ba8c)), hex(rpad(b, 5, 0xb6fe)) from t; +hex(rpad(a, 5, 0xe4ba8c)) hex(rpad(b, 5, 0xb6fe)) +E4B880E4BA8CE4BA8CE4BA8CE4BA8C D2BBB6FEB6FEB6FEB6FE +E4B880E4BA8CE4BA8CE4BA8CE4BA8C D2BBB6FEB6FEB6FEB6FE +E4B880E4BA8CE4BA8CE4BA8CE4BA8C D2BBB6FEB6FEB6FEB6FE +E4B880E4BA8CE4BA8CE4BA8CE4BA8C D2BBB6FEB6FEB6FEB6FE +select hex(rpad(a, 5, c)), hex(rpad(b, 5, c)) from t; 
+hex(rpad(a, 5, c)) hex(rpad(b, 5, c)) +E4B880E4B8 D2BBE4B880 +E4B880D2BB D2BBD2BB00 +E4B880E4BA D2BBE4BA8C +E4B880B6FE D2BBB6FE00 +select hex(elt(2, a, 0xe4ba8c)), hex(elt(2, b, 0xb6fe)) from t; +hex(elt(2, a, 0xe4ba8c)) hex(elt(2, b, 0xb6fe)) +E4BA8C B6FE +E4BA8C B6FE +E4BA8C B6FE +E4BA8C B6FE +select hex(elt(2, a, c)), hex(elt(2, b, c)) from t; +hex(elt(2, a, c)) hex(elt(2, b, c)) +E4B8800000000000000000000000000000000000 E4B8800000000000000000000000000000000000 +D2BB000000000000000000000000000000000000 D2BB000000000000000000000000000000000000 +E4BA8C0000000000000000000000000000000000 E4BA8C0000000000000000000000000000000000 +B6FE000000000000000000000000000000000000 B6FE000000000000000000000000000000000000 +select hex(instr(a, 0xe4b880)), hex(instr(b, 0xd2bb)) from t; +hex(instr(a, 0xe4b880)) hex(instr(b, 0xd2bb)) +1 1 +1 1 +1 1 +1 1 +select hex(position(a in 0xe4b880)), hex(position(b in 0xd2bb)) from t; +hex(position(a in 0xe4b880)) hex(position(b in 0xd2bb)) +1 1 +1 1 +1 1 +1 1 +select a like 0xe4b880, b like 0xd2bb from t; +a like 0xe4b880 b like 0xd2bb +1 1 +1 1 +1 1 +1 1 +select a = 0xb6fe from t; +Error 3854: Cannot convert string 'B6FE' from binary to utf8mb4 +select b = 0xe4ba8c from t; +Error 3854: Cannot convert string 'E4BA8C' from binary to gbk +select concat(a, 0xb6fe) from t; +Error 3854: Cannot convert string 'B6FE' from binary to utf8mb4 +select concat(b, 0xe4ba8c) from t; +Error 3854: Cannot convert string 'E4BA8C' from binary to gbk +set @@tidb_enable_vectorized_expression = false; +select hex(concat(a, c)), hex(concat(b, c)) from t; +hex(concat(a, c)) hex(concat(b, c)) +E4B880E4B8800000000000000000000000000000000000 D2BBE4B8800000000000000000000000000000000000 +E4B880D2BB000000000000000000000000000000000000 D2BBD2BB000000000000000000000000000000000000 +E4B880E4BA8C0000000000000000000000000000000000 D2BBE4BA8C0000000000000000000000000000000000 +E4B880B6FE000000000000000000000000000000000000 D2BBB6FE000000000000000000000000000000000000 +select 
hex(concat(a, 0xe4b880)), hex(concat(b, 0xd2bb)) from t; +hex(concat(a, 0xe4b880)) hex(concat(b, 0xd2bb)) +E4B880E4B880 D2BBD2BB +E4B880E4B880 D2BBD2BB +E4B880E4B880 D2BBD2BB +E4B880E4B880 D2BBD2BB +select a = 0xe4b880, b = 0xd2bb from t; +a = 0xe4b880 b = 0xd2bb +1 1 +1 1 +1 1 +1 1 +select a = c, b = c from t; +a = c b = c +0 0 +0 0 +0 0 +0 0 +select hex(insert(a, 1, 2, 0xe4ba8c)), hex(insert(b, 1, 2, 0xb6fe)) from t; +hex(insert(a, 1, 2, 0xe4ba8c)) hex(insert(b, 1, 2, 0xb6fe)) +E4BA8C B6FE +E4BA8C B6FE +E4BA8C B6FE +E4BA8C B6FE +select hex(insert(a, 1, 2, c)), hex(insert(b, 1, 2, c)) from t; +hex(insert(a, 1, 2, c)) hex(insert(b, 1, 2, c)) +E4B880000000000000000000000000000000000080 E4B8800000000000000000000000000000000000 +D2BB00000000000000000000000000000000000080 D2BB000000000000000000000000000000000000 +E4BA8C000000000000000000000000000000000080 E4BA8C0000000000000000000000000000000000 +B6FE00000000000000000000000000000000000080 B6FE000000000000000000000000000000000000 +select hex(lpad(a, 5, 0xe4ba8c)), hex(lpad(b, 5, 0xb6fe)) from t; +hex(lpad(a, 5, 0xe4ba8c)) hex(lpad(b, 5, 0xb6fe)) +E4BA8CE4BA8CE4BA8CE4BA8CE4B880 B6FEB6FEB6FEB6FED2BB +E4BA8CE4BA8CE4BA8CE4BA8CE4B880 B6FEB6FEB6FEB6FED2BB +E4BA8CE4BA8CE4BA8CE4BA8CE4B880 B6FEB6FEB6FEB6FED2BB +E4BA8CE4BA8CE4BA8CE4BA8CE4B880 B6FEB6FEB6FEB6FED2BB +select hex(lpad(a, 5, c)), hex(lpad(b, 5, c)) from t; +hex(lpad(a, 5, c)) hex(lpad(b, 5, c)) +E4B8E4B880 E4B880D2BB +D2BBE4B880 D2BB00D2BB +E4BAE4B880 E4BA8CD2BB +B6FEE4B880 B6FE00D2BB +select hex(rpad(a, 5, 0xe4ba8c)), hex(rpad(b, 5, 0xb6fe)) from t; +hex(rpad(a, 5, 0xe4ba8c)) hex(rpad(b, 5, 0xb6fe)) +E4B880E4BA8CE4BA8CE4BA8CE4BA8C D2BBB6FEB6FEB6FEB6FE +E4B880E4BA8CE4BA8CE4BA8CE4BA8C D2BBB6FEB6FEB6FEB6FE +E4B880E4BA8CE4BA8CE4BA8CE4BA8C D2BBB6FEB6FEB6FEB6FE +E4B880E4BA8CE4BA8CE4BA8CE4BA8C D2BBB6FEB6FEB6FEB6FE +select hex(rpad(a, 5, c)), hex(rpad(b, 5, c)) from t; +hex(rpad(a, 5, c)) hex(rpad(b, 5, c)) +E4B880E4B8 D2BBE4B880 +E4B880D2BB D2BBD2BB00 +E4B880E4BA D2BBE4BA8C 
+E4B880B6FE D2BBB6FE00 +select hex(elt(2, a, 0xe4ba8c)), hex(elt(2, b, 0xb6fe)) from t; +hex(elt(2, a, 0xe4ba8c)) hex(elt(2, b, 0xb6fe)) +E4BA8C B6FE +E4BA8C B6FE +E4BA8C B6FE +E4BA8C B6FE +select hex(elt(2, a, c)), hex(elt(2, b, c)) from t; +hex(elt(2, a, c)) hex(elt(2, b, c)) +E4B8800000000000000000000000000000000000 E4B8800000000000000000000000000000000000 +D2BB000000000000000000000000000000000000 D2BB000000000000000000000000000000000000 +E4BA8C0000000000000000000000000000000000 E4BA8C0000000000000000000000000000000000 +B6FE000000000000000000000000000000000000 B6FE000000000000000000000000000000000000 +select hex(instr(a, 0xe4b880)), hex(instr(b, 0xd2bb)) from t; +hex(instr(a, 0xe4b880)) hex(instr(b, 0xd2bb)) +1 1 +1 1 +1 1 +1 1 +select hex(position(a in 0xe4b880)), hex(position(b in 0xd2bb)) from t; +hex(position(a in 0xe4b880)) hex(position(b in 0xd2bb)) +1 1 +1 1 +1 1 +1 1 +select a like 0xe4b880, b like 0xd2bb from t; +a like 0xe4b880 b like 0xd2bb +1 1 +1 1 +1 1 +1 1 +select a = 0xb6fe from t; +Error 3854: Cannot convert string 'B6FE' from binary to utf8mb4 +select b = 0xe4ba8c from t; +Error 3854: Cannot convert string 'E4BA8C' from binary to gbk +select concat(a, 0xb6fe) from t; +Error 3854: Cannot convert string 'B6FE' from binary to utf8mb4 +select concat(b, 0xe4ba8c) from t; +Error 3854: Cannot convert string 'E4BA8C' from binary to gbk diff --git a/cmd/explaintest/r/select.result b/cmd/explaintest/r/select.result index 15b71d0e65166..959e761aad086 100644 --- a/cmd/explaintest/r/select.result +++ b/cmd/explaintest/r/select.result @@ -495,3 +495,7 @@ insert into precise_types values ( SELECT a, b, c, d FROM precise_types; a b c d 18446744073709551614 -9223372036854775806 99999999999999999999.0 1.8446744073709552e19 +create table t3(a char(10), primary key (a)); +insert into t3 values ('a'); +select * from t3 where a > 0x80; +Error 1105: Cannot convert string '80' from binary to utf8mb4 diff --git a/cmd/explaintest/t/new_character_set_builtin.test 
b/cmd/explaintest/t/new_character_set_builtin.test index 855dd215711de..ff4c2a9354f74 100644 --- a/cmd/explaintest/t/new_character_set_builtin.test +++ b/cmd/explaintest/t/new_character_set_builtin.test @@ -161,3 +161,62 @@ select ord(a), ord(b), ord(c) from t; set @@tidb_enable_vectorized_expression = true; select ord(a), ord(b), ord(c) from t; set @@tidb_enable_vectorized_expression = false; + +drop table if exists t; +create table t (a char(20) charset utf8mb4, b char(20) charset gbk, c binary(20)); +insert into t values ('一', '一', 0xe4b880); +insert into t values ('一', '一', 0xd2bb); +insert into t values ('一', '一', 0xe4ba8c); +insert into t values ('一', '一', 0xb6fe); + +set @@tidb_enable_vectorized_expression = true; +select hex(concat(a, c)), hex(concat(b, c)) from t; +select hex(concat(a, 0xe4b880)), hex(concat(b, 0xd2bb)) from t; +select a = 0xe4b880, b = 0xd2bb from t; +select a = c, b = c from t; +select hex(insert(a, 1, 2, 0xe4ba8c)), hex(insert(b, 1, 2, 0xb6fe)) from t; +select hex(insert(a, 1, 2, c)), hex(insert(b, 1, 2, c)) from t; +select hex(lpad(a, 5, 0xe4ba8c)), hex(lpad(b, 5, 0xb6fe)) from t; +select hex(lpad(a, 5, c)), hex(lpad(b, 5, c)) from t; +select hex(rpad(a, 5, 0xe4ba8c)), hex(rpad(b, 5, 0xb6fe)) from t; +select hex(rpad(a, 5, c)), hex(rpad(b, 5, c)) from t; +select hex(elt(2, a, 0xe4ba8c)), hex(elt(2, b, 0xb6fe)) from t; +select hex(elt(2, a, c)), hex(elt(2, b, c)) from t; +select hex(instr(a, 0xe4b880)), hex(instr(b, 0xd2bb)) from t; +select hex(position(a in 0xe4b880)), hex(position(b in 0xd2bb)) from t; +select a like 0xe4b880, b like 0xd2bb from t; + +--error ER_CANNOT_CONVERT_STRING +select a = 0xb6fe from t; +--error ER_CANNOT_CONVERT_STRING +select b = 0xe4ba8c from t; +--error ER_CANNOT_CONVERT_STRING +select concat(a, 0xb6fe) from t; +--error ER_CANNOT_CONVERT_STRING +select concat(b, 0xe4ba8c) from t; + +set @@tidb_enable_vectorized_expression = false; +select hex(concat(a, c)), hex(concat(b, c)) from t; +select hex(concat(a, 
0xe4b880)), hex(concat(b, 0xd2bb)) from t; +select a = 0xe4b880, b = 0xd2bb from t; +select a = c, b = c from t; +select hex(insert(a, 1, 2, 0xe4ba8c)), hex(insert(b, 1, 2, 0xb6fe)) from t; +select hex(insert(a, 1, 2, c)), hex(insert(b, 1, 2, c)) from t; +select hex(lpad(a, 5, 0xe4ba8c)), hex(lpad(b, 5, 0xb6fe)) from t; +select hex(lpad(a, 5, c)), hex(lpad(b, 5, c)) from t; +select hex(rpad(a, 5, 0xe4ba8c)), hex(rpad(b, 5, 0xb6fe)) from t; +select hex(rpad(a, 5, c)), hex(rpad(b, 5, c)) from t; +select hex(elt(2, a, 0xe4ba8c)), hex(elt(2, b, 0xb6fe)) from t; +select hex(elt(2, a, c)), hex(elt(2, b, c)) from t; +select hex(instr(a, 0xe4b880)), hex(instr(b, 0xd2bb)) from t; +select hex(position(a in 0xe4b880)), hex(position(b in 0xd2bb)) from t; +select a like 0xe4b880, b like 0xd2bb from t; + +--error ER_CANNOT_CONVERT_STRING +select a = 0xb6fe from t; +--error ER_CANNOT_CONVERT_STRING +select b = 0xe4ba8c from t; +--error ER_CANNOT_CONVERT_STRING +select concat(a, 0xb6fe) from t; +--error ER_CANNOT_CONVERT_STRING +select concat(b, 0xe4ba8c) from t; diff --git a/cmd/explaintest/t/select.test b/cmd/explaintest/t/select.test index 77ace4a6d3512..c53aca600e2b9 100644 --- a/cmd/explaintest/t/select.test +++ b/cmd/explaintest/t/select.test @@ -244,3 +244,8 @@ insert into precise_types values ( 18446744073709551614 ); SELECT a, b, c, d FROM precise_types; + +create table t3(a char(10), primary key (a)); +insert into t3 values ('a'); +--error ER_CANNOT_CONVERT_STRING +select * from t3 where a > 0x80; diff --git a/errno/errcode.go b/errno/errcode.go index 288ce56181dbc..e0c5f7cccb0a7 100644 --- a/errno/errcode.go +++ b/errno/errcode.go @@ -901,6 +901,7 @@ const ( ErrFKIncompatibleColumns = 3780 ErrFunctionalIndexRowValueIsNotAllowed = 3800 ErrDependentByFunctionalIndex = 3837 + ErrCannotConvertString = 3854 ErrInvalidJSONValueForFuncIndex = 3903 ErrJSONValueOutOfRangeForFuncIndex = 3904 ErrFunctionalIndexDataIsTooLong = 3907 diff --git a/errno/errname.go b/errno/errname.go 
index 27431bebf8b91..607672e19b816 100644 --- a/errno/errname.go +++ b/errno/errname.go @@ -896,6 +896,7 @@ var MySQLErrName = map[uint16]*mysql.ErrMessage{ ErrFKIncompatibleColumns: mysql.Message("Referencing column '%s' in foreign key constraint '%s' are incompatible", nil), ErrFunctionalIndexRowValueIsNotAllowed: mysql.Message("Expression of expression index '%s' cannot refer to a row value", nil), ErrDependentByFunctionalIndex: mysql.Message("Column '%s' has an expression index dependency and cannot be dropped or renamed", nil), + ErrCannotConvertString: mysql.Message("Cannot convert string '%.64s' from %s to %s", nil), ErrInvalidJSONValueForFuncIndex: mysql.Message("Invalid JSON value for CAST for expression index '%s'", nil), ErrJSONValueOutOfRangeForFuncIndex: mysql.Message("Out of range JSON value for CAST for expression index '%s'", nil), ErrFunctionalIndexDataIsTooLong: mysql.Message("Data too long for expression index '%s'", nil), diff --git a/expression/builtin.go b/expression/builtin.go index a7e9537e84a52..316e3a2ecb462 100644 --- a/expression/builtin.go +++ b/expression/builtin.go @@ -140,7 +140,7 @@ func newBaseBuiltinFuncWithTp(ctx sessionctx.Context, funcName string, args []Ex args[i] = WrapWithCastAsDecimal(ctx, args[i]) case types.ETString: args[i] = WrapWithCastAsString(ctx, args[i]) - args[i] = WrapWithToBinary(ctx, args[i], funcName) + args[i] = HandleBinaryLiteral(ctx, args[i], ec, funcName) case types.ETDatetime: args[i] = WrapWithCastAsTime(ctx, args[i], types.NewFieldType(mysql.TypeDatetime)) case types.ETTimestamp: @@ -880,9 +880,6 @@ var funcs = map[string]functionClass{ ast.NextVal: &nextValFunctionClass{baseFunctionClass{ast.NextVal, 1, 1}}, ast.LastVal: &lastValFunctionClass{baseFunctionClass{ast.LastVal, 1, 1}}, ast.SetVal: &setValFunctionClass{baseFunctionClass{ast.SetVal, 2, 2}}, - - // TiDB implicit internal functions. 
- InternalFuncToBinary: &tidbConvertCharsetFunctionClass{baseFunctionClass{InternalFuncToBinary, 1, 1}}, } // IsFunctionSupported check if given function name is a builtin sql function. @@ -906,7 +903,6 @@ func GetDisplayName(name string) string { func GetBuiltinList() []string { res := make([]string, 0, len(funcs)) notImplementedFunctions := []string{ast.RowFunc, ast.IsTruthWithNull} - implicitFunctions := []string{InternalFuncToBinary} for funcName := range funcs { skipFunc := false // Skip not implemented functions @@ -915,11 +911,6 @@ func GetBuiltinList() []string { skipFunc = true } } - for _, implicitFunc := range implicitFunctions { - if funcName == implicitFunc { - skipFunc = true - } - } // Skip literal functions // (their names are not readable: 'tidb`.(dateliteral, for example) // See: https://github.com/pingcap/parser/pull/591 diff --git a/expression/builtin_convert_charset.go b/expression/builtin_convert_charset.go index b2d6e3f19f60e..aac9d1680ddda 100644 --- a/expression/builtin_convert_charset.go +++ b/expression/builtin_convert_charset.go @@ -16,24 +16,43 @@ package expression import ( "fmt" + "unicode/utf8" + "github.com/pingcap/tidb/errno" "github.com/pingcap/tidb/parser/ast" "github.com/pingcap/tidb/parser/charset" "github.com/pingcap/tidb/parser/model" "github.com/pingcap/tidb/sessionctx" "github.com/pingcap/tidb/types" "github.com/pingcap/tidb/util/chunk" + "github.com/pingcap/tidb/util/dbterror" "github.com/pingcap/tipb/go-tipb" ) +var ( + _ functionClass = &tidbToBinaryFunctionClass{} + _ functionClass = &tidbFromBinaryFunctionClass{} + + _ builtinFunc = &builtinInternalToBinarySig{} + _ builtinFunc = &builtinInternalFromBinarySig{} +) + +var ( + // errCannotConvertString is returned when a string cannot be converted to another charset. + errCannotConvertString = dbterror.ClassExpression.NewStd(errno.ErrCannotConvertString) +) + // InternalFuncToBinary accepts a string and returns another string encoded in a given charset. 
const InternalFuncToBinary = "to_binary" -type tidbConvertCharsetFunctionClass struct { +// InternalFuncFromBinary accepts a string and returns another string decoded in a given charset. +const InternalFuncFromBinary = "from_binary" + +type tidbToBinaryFunctionClass struct { baseFunctionClass } -func (c *tidbConvertCharsetFunctionClass) getFunction(ctx sessionctx.Context, args []Expression) (builtinFunc, error) { +func (c *tidbToBinaryFunctionClass) getFunction(ctx sessionctx.Context, args []Expression) (builtinFunc, error) { if err := c.verifyArgs(args); err != nil { return nil, c.verifyArgs(args) } @@ -45,6 +64,8 @@ func (c *tidbConvertCharsetFunctionClass) getFunction(ctx sessionctx.Context, ar if err != nil { return nil, err } + bf.tp = args[0].GetType().Clone() + bf.tp.Charset, bf.tp.Collate = charset.CharsetBin, charset.CollationBin sig = &builtinInternalToBinarySig{bf} sig.setPbCode(tipb.ScalarFuncSig_ToBinary) default: @@ -53,8 +74,6 @@ func (c *tidbConvertCharsetFunctionClass) getFunction(ctx sessionctx.Context, ar return sig, nil } -var _ builtinFunc = &builtinInternalToBinarySig{} - type builtinInternalToBinarySig struct { baseBuiltinFunc } @@ -107,30 +126,155 @@ func (b *builtinInternalToBinarySig) vecEvalString(input *chunk.Chunk, result *c return nil } -// toBinaryMap contains the builtin functions which arguments need to be converted to the correct charset. -var toBinaryMap = map[string]struct{}{ - ast.Hex: {}, ast.Length: {}, ast.OctetLength: {}, ast.ASCII: {}, - ast.ToBase64: {}, ast.AesDecrypt: {}, ast.Decode: {}, ast.Encode: {}, - ast.PasswordFunc: {}, ast.MD5: {}, ast.SHA: {}, ast.SHA1: {}, - ast.SHA2: {}, ast.Compress: {}, +type tidbFromBinaryFunctionClass struct { + baseFunctionClass + + tp *types.FieldType } -// WrapWithToBinary wraps `expr` with to_binary sig. 
-func WrapWithToBinary(ctx sessionctx.Context, expr Expression, funcName string) Expression { - exprTp := expr.GetType() - if _, err := charset.GetDefaultCollationLegacy(exprTp.Charset); err != nil { - if _, ok := toBinaryMap[funcName]; ok { - fc := funcs[InternalFuncToBinary] - sig, err := fc.getFunction(ctx, []Expression{expr}) - if err != nil { - return expr +func (c *tidbFromBinaryFunctionClass) getFunction(ctx sessionctx.Context, args []Expression) (builtinFunc, error) { + if err := c.verifyArgs(args); err != nil { + return nil, c.verifyArgs(args) + } + argTp := args[0].GetType().EvalType() + var sig builtinFunc + switch argTp { + case types.ETString: + bf, err := newBaseBuiltinFuncWithTp(ctx, c.funcName, args, types.ETString, types.ETString) + if err != nil { + return nil, err + } + bf.tp = c.tp + sig = &builtinInternalFromBinarySig{bf} + sig.setPbCode(tipb.ScalarFuncSig_FromBinary) + default: + return nil, fmt.Errorf("unexpected argTp: %d", argTp) + } + return sig, nil +} + +type builtinInternalFromBinarySig struct { + baseBuiltinFunc +} + +func (b *builtinInternalFromBinarySig) Clone() builtinFunc { + newSig := &builtinInternalFromBinarySig{} + newSig.cloneFrom(&b.baseBuiltinFunc) + return newSig +} + +func (b *builtinInternalFromBinarySig) evalString(row chunk.Row) (res string, isNull bool, err error) { + val, isNull, err := b.args[0].EvalString(b.ctx, row) + if isNull || err != nil { + return val, isNull, err + } + transferString := b.getTransferFunc() + tBytes, err := transferString([]byte(val)) + return string(tBytes), false, err +} + +func (b *builtinInternalFromBinarySig) vectorized() bool { + return true +} + +func (b *builtinInternalFromBinarySig) vecEvalString(input *chunk.Chunk, result *chunk.Column) error { + n := input.NumRows() + buf, err := b.bufAllocator.get() + if err != nil { + return err + } + defer b.bufAllocator.put(buf) + if err := b.args[0].VecEvalString(b.ctx, input, buf); err != nil { + return err + } + transferString := 
b.getTransferFunc() + result.ReserveString(n) + for i := 0; i < n; i++ { + if buf.IsNull(i) { + result.AppendNull() + continue + } + str, err := transferString(buf.GetBytes(i)) + if err != nil { + return err + } + result.AppendBytes(str) + } + return nil +} + +func (b *builtinInternalFromBinarySig) getTransferFunc() func([]byte) ([]byte, error) { + var transferString func([]byte) ([]byte, error) + if b.tp.Charset == charset.CharsetUTF8MB4 || b.tp.Charset == charset.CharsetUTF8 { + transferString = func(s []byte) ([]byte, error) { + if !utf8.Valid(s) { + return nil, errCannotConvertString.GenWithStackByArgs(fmt.Sprintf("%X", s), charset.CharsetBin, b.tp.Charset) } - sf := &ScalarFunction{ - FuncName: model.NewCIStr(InternalFuncToBinary), - RetType: exprTp, - Function: sig, + return s, nil + } + } else { + enc := charset.NewEncoding(b.tp.Charset) + var buf []byte + transferString = func(s []byte) ([]byte, error) { + str, err := enc.Decode(buf, s) + if err != nil { + return nil, errCannotConvertString.GenWithStackByArgs(fmt.Sprintf("%X", s), charset.CharsetBin, b.tp.Charset) } - return FoldConstant(sf) + return str, nil + } + } + return transferString +} + +// BuildToBinaryFunction builds to_binary function. +func BuildToBinaryFunction(ctx sessionctx.Context, expr Expression) (res Expression) { + fc := &tidbToBinaryFunctionClass{baseFunctionClass{InternalFuncToBinary, 1, 1}} + f, err := fc.getFunction(ctx, []Expression{expr}) + if err != nil { + return expr + } + res = &ScalarFunction{ + FuncName: model.NewCIStr(InternalFuncToBinary), + RetType: f.getRetTp(), + Function: f, + } + return FoldConstant(res) +} + +// BuildFromBinaryFunction builds from_binary function. 
+func BuildFromBinaryFunction(ctx sessionctx.Context, expr Expression, tp *types.FieldType) (res Expression) { + fc := &tidbFromBinaryFunctionClass{baseFunctionClass{InternalFuncFromBinary, 1, 1}, tp} + f, err := fc.getFunction(ctx, []Expression{expr}) + if err != nil { + return expr + } + res = &ScalarFunction{ + FuncName: model.NewCIStr(InternalFuncFromBinary), + RetType: tp, + Function: f, + } + return FoldConstant(res) +} + +// HandleBinaryLiteral wraps `expr` with to_binary or from_binary sig. +func HandleBinaryLiteral(ctx sessionctx.Context, expr Expression, ec *ExprCollation, funcName string) Expression { + switch funcName { + case ast.Concat, ast.ConcatWS, ast.Lower, ast.Lcase, ast.Reverse, ast.Upper, ast.Ucase, ast.Quote, ast.Coalesce, + ast.Left, ast.Right, ast.Repeat, ast.Trim, ast.LTrim, ast.RTrim, ast.Substr, ast.SubstringIndex, ast.Replace, + ast.Substring, ast.Mid, ast.Translate, ast.InsertFunc, ast.Lpad, ast.Rpad, ast.Elt, ast.ExportSet, ast.MakeSet, + ast.FindInSet, ast.Regexp, ast.Field, ast.Locate, ast.Instr, ast.Position, ast.GE, ast.LE, ast.GT, ast.LT, ast.EQ, + ast.NE, ast.NullEQ, ast.Strcmp, ast.If, ast.Ifnull, ast.Like, ast.In, ast.DateFormat, ast.TimeFormat: + if ec.Charset == charset.CharsetBin && expr.GetType().Charset != charset.CharsetBin { + return BuildToBinaryFunction(ctx, expr) + } else if ec.Charset != charset.CharsetBin && expr.GetType().Charset == charset.CharsetBin { + ft := expr.GetType().Clone() + ft.Charset, ft.Collate = ec.Charset, ec.Collation + return BuildFromBinaryFunction(ctx, expr, ft) + } + case ast.Hex, ast.Length, ast.OctetLength, ast.ASCII, ast.ToBase64, ast.AesDecrypt, ast.Decode, ast.Encode, + ast.PasswordFunc, ast.MD5, ast.SHA, ast.SHA1, ast.SHA2, ast.Compress: + if _, err := charset.GetDefaultCollationLegacy(expr.GetType().Charset); err != nil { + return BuildToBinaryFunction(ctx, expr) } } return expr diff --git a/expression/builtin_string.go b/expression/builtin_string.go index da2d8d3fe298e..38a5647d49fac 
100644 --- a/expression/builtin_string.go +++ b/expression/builtin_string.go @@ -3672,7 +3672,7 @@ func (c *insertFunctionClass) getFunction(ctx sessionctx.Context, args []Express return nil, errors.Trace(err) } - if types.IsBinaryStr(args[0].GetType()) { + if types.IsBinaryStr(bf.tp) { sig = &builtinInsertSig{bf, maxAllowedPacket} sig.setPbCode(tipb.ScalarFuncSig_Insert) } else { diff --git a/expression/collation.go b/expression/collation.go index 7312e8f4c8413..80a2720c8cfe4 100644 --- a/expression/collation.go +++ b/expression/collation.go @@ -301,7 +301,8 @@ func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression) continue } - if arg.Repertoire() == ASCII { + // If value has ASCII repertoire, or it is binary string, just skip it. + if arg.Repertoire() == ASCII || types.IsBinaryStr(arg.GetType()) { continue } diff --git a/expression/constant_test.go b/expression/constant_test.go index 2158b1e4f5d66..fc46c40f1072d 100644 --- a/expression/constant_test.go +++ b/expression/constant_test.go @@ -57,12 +57,11 @@ func newString(value string, collation string) *Constant { } func newFunction(funcName string, args ...Expression) Expression { - return newFunctionWithType(funcName, mysql.TypeLonglong, args...) + return newFunctionWithType(funcName, types.NewFieldType(mysql.TypeLonglong), args...) } -func newFunctionWithType(funcName string, tp byte, args ...Expression) Expression { - ft := types.NewFieldType(tp) - return NewFunctionInternal(mock.NewContext(), funcName, ft, args...) +func newFunctionWithType(funcName string, tp *types.FieldType, args ...Expression) Expression { + return NewFunctionInternal(mock.NewContext(), funcName, tp, args...) 
} func TestConstantPropagation(t *testing.T) { @@ -239,16 +238,48 @@ func TestConstantFoldingCharsetConvert(t *testing.T) { }{ { condition: newFunction(ast.Length, newFunctionWithType( - InternalFuncToBinary, mysql.TypeVarchar, + InternalFuncToBinary, types.NewFieldType(mysql.TypeVarchar), newString("中文", "gbk_bin"))), result: "4", }, { condition: newFunction(ast.Length, newFunctionWithType( - InternalFuncToBinary, mysql.TypeVarchar, + InternalFuncToBinary, types.NewFieldType(mysql.TypeVarchar), newString("中文", "utf8mb4_bin"))), result: "6", }, + { + condition: newFunction(ast.Concat, newFunctionWithType( + InternalFuncFromBinary, types.NewFieldType(mysql.TypeVarchar), + newString("中文", "binary"))), + result: "中文", + }, + { + condition: newFunction(ast.Concat, + newFunctionWithType( + InternalFuncFromBinary, types.NewFieldTypeWithCollation(mysql.TypeVarchar, "gbk_bin", -1), + newString("\xd2\xbb", "binary")), + newString("中文", "gbk_bin"), + ), + result: "一中文", + }, + { + condition: newFunction(ast.Concat, + newString("中文", "gbk_bin"), + newFunctionWithType( + InternalFuncFromBinary, types.NewFieldTypeWithCollation(mysql.TypeVarchar, "gbk_bin", -1), + newString("\xd2\xbb", "binary")), + ), + result: "中文一", + }, + // The result is binary charset, so gbk constant will convert to binary which is \xd6\xd0\xce\xc4. 
+ { + condition: newFunction(ast.Concat, + newString("中文", "gbk_bin"), + newString("\xd2\xbb", "binary"), + ), + result: "\xd6\xd0\xce\xc4\xd2\xbb", + }, } for _, tt := range tests { newConds := FoldConstant(tt.condition) diff --git a/expression/distsql_builtin.go b/expression/distsql_builtin.go index db9b39a2db010..1763f71d2fb05 100644 --- a/expression/distsql_builtin.go +++ b/expression/distsql_builtin.go @@ -1049,6 +1049,10 @@ func getSignatureByPB(ctx sessionctx.Context, sigCode tipb.ScalarFuncSig, tp *ti f = &builtinUnHexSig{base} case tipb.ScalarFuncSig_Upper: f = &builtinUpperSig{base} + case tipb.ScalarFuncSig_ToBinary: + f = &builtinInternalToBinarySig{base} + case tipb.ScalarFuncSig_FromBinary: + f = &builtinInternalFromBinarySig{base} default: e = errFunctionNotExists.GenWithStackByArgs("FUNCTION", sigCode) diff --git a/expression/expression.go b/expression/expression.go index fda1615fd8b4e..e4d589f796194 100644 --- a/expression/expression.go +++ b/expression/expression.go @@ -967,6 +967,7 @@ func scalarExprSupportedByTiKV(sf *ScalarFunction) bool { // string functions. ast.Length, ast.BitLength, ast.Concat, ast.ConcatWS /*ast.Locate,*/, ast.Replace, ast.ASCII, ast.Hex, ast.Reverse, ast.LTrim, ast.RTrim /*ast.Left,*/, ast.Strcmp, ast.Space, ast.Elt, ast.Field, + InternalFuncFromBinary, InternalFuncToBinary, // json functions. 
ast.JSONType, ast.JSONExtract, ast.JSONObject, ast.JSONArray, ast.JSONMerge, ast.JSONSet, diff --git a/expression/integration_test.go b/expression/integration_test.go index 54e4eebedb79c..eca90299fd12b 100644 --- a/expression/integration_test.go +++ b/expression/integration_test.go @@ -1184,7 +1184,7 @@ func (s *testIntegrationSuite2) TestStringBuiltin(c *C) { // for insert result = tk.MustQuery(`select insert("中文", 1, 1, cast("aaa" as binary)), insert("ba", -1, 1, "aaa"), insert("ba", 1, 100, "aaa"), insert("ba", 100, 1, "aaa");`) - result.Check(testkit.Rows("aaa文 ba aaa ba")) + result.Check(testkit.Rows("aaa\xb8\xad文 ba aaa ba")) result = tk.MustQuery(`select insert("bb", NULL, 1, "aa"), insert("bb", 1, NULL, "aa"), insert(NULL, 1, 1, "aaa"), insert("bb", 1, 1, NULL);`) result.Check(testkit.Rows(" ")) result = tk.MustQuery(`SELECT INSERT("bb", 0, 1, NULL), INSERT("bb", 0, NULL, "aaa");`) diff --git a/expression/scalar_function.go b/expression/scalar_function.go index d024e3b62ff1e..2030186a53176 100644 --- a/expression/scalar_function.go +++ b/expression/scalar_function.go @@ -183,6 +183,10 @@ func newFunctionImpl(ctx sessionctx.Context, fold int, funcName string, retType return BuildCastFunction(ctx, args[0], retType), nil case ast.GetVar: return BuildGetVarFunction(ctx, args[0], retType) + case InternalFuncFromBinary: + return BuildFromBinaryFunction(ctx, args[0], retType), nil + case InternalFuncToBinary: + return BuildToBinaryFunction(ctx, args[0]), nil } fc, ok := funcs[funcName] if !ok { diff --git a/types/datum.go b/types/datum.go index dd0d34a0a5e1d..03874e1f02e35 100644 --- a/types/datum.go +++ b/types/datum.go @@ -1133,9 +1133,9 @@ func ProduceStrWithSpecifiedTp(s string, tp *FieldType, sc *stmtctx.StatementCon // overflowed part is all whitespaces var overflowed string var characterLen int - // Flen is the rune length, not binary length, for UTF8 charset, we need to calculate the + // Flen is the rune length, not binary length, for Non-binary charset, 
we need to calculate the // rune count and truncate to Flen runes if it is too long. - if chs == charset.CharsetUTF8 || chs == charset.CharsetUTF8MB4 { + if chs != charset.CharsetBinary { characterLen = utf8.RuneCountInString(s) if characterLen > flen { // 1. If len(s) is 0 and flen is 0, truncateLen will be 0, don't truncate s.