From 42bba991b9645a93696536851370af30c20db4f0 Mon Sep 17 00:00:00 2001 From: birdstorm Date: Tue, 31 Jul 2018 16:14:05 +0800 Subject: [PATCH] ranger: fix prefix index when charset is UTF-8 (#7194) --- expression/integration_test.go | 16 ++++++++++++++++ plan/physical_plan_test.go | 2 +- table/tables/index.go | 17 ++++++++--------- util/ranger/ranger.go | 35 ++++++++++++++++++++++++++-------- util/ranger/ranger_test.go | 16 +++++++++++++++- 5 files changed, 67 insertions(+), 19 deletions(-) diff --git a/expression/integration_test.go b/expression/integration_test.go index a3a224ac90359..c17d900929595 100644 --- a/expression/integration_test.go +++ b/expression/integration_test.go @@ -3428,4 +3428,20 @@ func (s *testIntegrationSuite) TestPrefixIndex(c *C) { tk.MustExec("insert into t1 values('借款策略集_网页');") res := tk.MustQuery("select * from t1 where name = '借款策略集_网页';") res.Check(testkit.Rows("借款策略集_网页")) + + tk.MustExec(`CREATE TABLE prefix ( + a int(11) NOT NULL, + b varchar(55) DEFAULT NULL, + c int(11) DEFAULT NULL, + PRIMARY KEY (a), + KEY prefix_index (b(2)), + KEY prefix_complex (a,b(2)) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;`) + + tk.MustExec("INSERT INTO prefix VALUES(0, 'b', 2), (1, 'bbb', 3), (2, 'bbc', 4), (3, 'bbb', 5), (4, 'abc', 6), (5, 'abc', 7), (6, 'abc', 7), (7, 'ÿÿ', 8), (8, 'ÿÿ0', 9), (9, 'ÿÿÿ', 10);") + res = tk.MustQuery("select c, b from prefix where b > 'ÿ' and b < 'ÿÿc'") + res.Check(testkit.Rows("8 ÿÿ", "9 ÿÿ0")) + + res = tk.MustQuery("select a, b from prefix where b LIKE 'ÿÿ%'") + res.Check(testkit.Rows("7 ÿÿ", "8 ÿÿ0", "9 ÿÿÿ")) } diff --git a/plan/physical_plan_test.go b/plan/physical_plan_test.go index f22800081db03..8fc193f08b381 100644 --- a/plan/physical_plan_test.go +++ b/plan/physical_plan_test.go @@ -178,7 +178,7 @@ func (s *testPlanSuite) TestDAGPlanBuilderSimpleCase(c *C) { // Test index filter condition push down. { sql: "select * from t use index(e_d_c_str_prefix) where t.c_str = 'abcdefghijk' and t.d_str = 'd' and t.e_str = 'e'", - best: "IndexLookUp(Index(t.e_d_c_str_prefix)[[\"e\" \"d\" \"[97 98 99 100 101 102 103 104 105 106]\",\"e\" \"d\" \"[97 98 99 100 101 102 103 104 105 106]\"]], Table(t)->Sel([eq(test.t.c_str, abcdefghijk)]))", + best: "IndexLookUp(Index(t.e_d_c_str_prefix)[[\"e\" \"d\" \"abcdefghij\",\"e\" \"d\" \"abcdefghij\"]], Table(t)->Sel([eq(test.t.c_str, abcdefghijk)]))", }, { sql: "select * from t use index(e_d_c_str_prefix) where t.e_str = b'1110000'", diff --git a/table/tables/index.go b/table/tables/index.go index 23255673b35e0..10dce5af0af6f 100644 --- a/table/tables/index.go +++ b/table/tables/index.go @@ -136,19 +136,18 @@ func (c *index) truncateIndexValuesIfNeeded(indexedValues []types.Datum) []types if v.Kind() == types.KindString || v.Kind() == types.KindBytes { ic := c.idxInfo.Columns[i] colCharset := c.tblInfo.Columns[ic.Offset].Charset - if colCharset == charset.CharsetUTF8 || colCharset == charset.CharsetUTF8MB4 { - val := v.GetBytes() - if ic.Length != types.UnspecifiedLength && utf8.RuneCount(val) > ic.Length { - rs := bytes.Runes(val) + colValue := v.GetBytes() + isUTF8Charset := colCharset == charset.CharsetUTF8 || colCharset == charset.CharsetUTF8MB4 + if isUTF8Charset { + if ic.Length != types.UnspecifiedLength && utf8.RuneCount(colValue) > ic.Length { + rs := bytes.Runes(colValue) truncateStr := string(rs[:ic.Length]) // truncate value and limit its length v.SetString(truncateStr) } - } else { - if ic.Length != types.UnspecifiedLength && len(v.GetBytes()) > ic.Length { - // truncate value and limit its length - v.SetBytes(v.GetBytes()[:ic.Length]) - } + } else if ic.Length != types.UnspecifiedLength && len(colValue) > ic.Length { + // truncate value and limit its length + v.SetBytes(colValue[:ic.Length]) } } } diff --git a/util/ranger/ranger.go b/util/ranger/ranger.go index 81344f3f8d10e..3f35efd83ee1f 100644 --- a/util/ranger/ranger.go +++ b/util/ranger/ranger.go @@ -17,6 +17,7 @@ import ( "bytes" "math" "sort" + "unicode/utf8" "github.com/juju/errors" "github.com/pingcap/tidb/ast" @@ -25,6 +26,7 @@ import ( "github.com/pingcap/tidb/mysql" "github.com/pingcap/tidb/sessionctx/stmtctx" "github.com/pingcap/tidb/types" + "github.com/pingcap/tidb/util/charset" "github.com/pingcap/tidb/util/codec" ) @@ -327,7 +329,7 @@ func buildCNFIndexRange(sc *stmtctx.StatementContext, cols []*expression.Column, // Take prefix index into consideration. if hasPrefix(lengths) { - fixPrefixColRange(ranges, lengths) + fixPrefixColRange(ranges, lengths, newTp) } if len(ranges) > 0 && len(ranges[0].LowVal) < len(cols) { @@ -410,23 +412,37 @@ func hasPrefix(lengths []int) bool { return false } -func fixPrefixColRange(ranges []*Range, lengths []int) { +func fixPrefixColRange(ranges []*Range, lengths []int, tp []*types.FieldType) { for _, ran := range ranges { for i := 0; i < len(ran.LowVal); i++ { - fixRangeDatum(&ran.LowVal[i], lengths[i]) + fixRangeDatum(&ran.LowVal[i], lengths[i], tp[i]) } ran.LowExclude = false for i := 0; i < len(ran.HighVal); i++ { - fixRangeDatum(&ran.HighVal[i], lengths[i]) + fixRangeDatum(&ran.HighVal[i], lengths[i], tp[i]) } ran.HighExclude = false } } -func fixRangeDatum(v *types.Datum, length int) { +func fixRangeDatum(v *types.Datum, length int, tp *types.FieldType) { // If this column is prefix and the prefix length is smaller than the range, cut it. - if length != types.UnspecifiedLength && length < len(v.GetBytes()) { - v.SetBytes(v.GetBytes()[:length]) + // In case of UTF8, prefix should be cut by characters rather than bytes + if v.Kind() == types.KindString || v.Kind() == types.KindBytes { + colCharset := tp.Charset + colValue := v.GetBytes() + isUTF8Charset := colCharset == charset.CharsetUTF8 || colCharset == charset.CharsetUTF8MB4 + if isUTF8Charset { + if length != types.UnspecifiedLength && utf8.RuneCount(colValue) > length { + rs := bytes.Runes(colValue) + truncateStr := string(rs[:length]) + // truncate value and limit its length + v.SetString(truncateStr) + } + } else if length != types.UnspecifiedLength && len(colValue) > length { + // truncate value and limit its length + v.SetBytes(colValue[:length]) + } } } @@ -438,11 +454,14 @@ func newFieldType(tp *types.FieldType) *types.FieldType { case mysql.TypeTiny, mysql.TypeShort, mysql.TypeInt24, mysql.TypeLong, mysql.TypeLonglong: newTp := types.NewFieldType(mysql.TypeLonglong) newTp.Flag = tp.Flag + newTp.Charset = tp.Charset return newTp // To avoid data truncate error. case mysql.TypeFloat, mysql.TypeDouble, mysql.TypeBlob, mysql.TypeTinyBlob, mysql.TypeMediumBlob, mysql.TypeLongBlob, mysql.TypeString, mysql.TypeVarchar, mysql.TypeVarString: - return types.NewFieldType(tp.Tp) + newTp := types.NewFieldType(tp.Tp) + newTp.Charset = tp.Charset + return newTp default: return tp } diff --git a/util/ranger/ranger_test.go b/util/ranger/ranger_test.go index 78da0a36c2dac..62eeeb39e62fb 100644 --- a/util/ranger/ranger_test.go +++ b/util/ranger/ranger_test.go @@ -332,7 +332,7 @@ func (s *testRangerSuite) TestIndexRange(c *C) { testKit := testkit.NewTestKit(c, store) testKit.MustExec("use test") testKit.MustExec("drop table if exists t") - testKit.MustExec("create table t(a varchar(50), b int, c double, index idx_ab(a(50), b), index idx_cb(c, a))") + testKit.MustExec("create table t(a varchar(50), b int, c double, d varchar(10), e binary(10), index idx_ab(a(50), b), index idx_cb(c, a), index idx_d(d(2)), index idx_e(e(2)))") tests := []struct { indexPos int @@ -516,6 +516,20 @@ func (s *testRangerSuite) TestIndexRange(c *C) { filterConds: "[or(gt(test.t.a, a), gt(test.t.c, 1))]", resultStr: "[[NULL,+inf]]", }, + { + indexPos: 2, + exprStr: `d = "你好啊"`, + accessConds: "[eq(test.t.d, 你好啊)]", + filterConds: "[eq(test.t.d, 你好啊)]", + resultStr: "[[\"你好\",\"你好\"]]", + }, + { + indexPos: 3, + exprStr: `e = "你好啊"`, + accessConds: "[eq(test.t.e, 你好啊)]", + filterConds: "[eq(test.t.e, 你好啊)]", + resultStr: "[[\"[228 189]\",\"[228 189]\"]]", + }, } for _, tt := range tests {