diff --git a/statistics/handle/update.go b/statistics/handle/update.go index 5cb1cee0181ae..2014b1284cd61 100644 --- a/statistics/handle/update.go +++ b/statistics/handle/update.go @@ -1137,7 +1137,7 @@ func (h *Handle) RecalculateExpectCount(q *statistics.QueryFeedback) error { expected := 0.0 if isIndex { idx := t.Indices[id] - expected, err = idx.GetRowCount(sc, ranges, t.ModifyCount) + expected, err = idx.GetRowCount(sc, nil, ranges, t.ModifyCount) expected *= idx.GetIncreaseFactor(t.Count) } else { c := t.Columns[id] diff --git a/statistics/histogram.go b/statistics/histogram.go index 202d46ee72a4f..d5a1a22742e1d 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -1189,7 +1189,7 @@ func (idx *Index) QueryBytes(d []byte) uint64 { // GetRowCount returns the row count of the given ranges. // It uses the modifyCount to adjust the influence of modifications on the table. -func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.Range, modifyCount int64) (float64, error) { +func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, indexRanges []*ranger.Range, modifyCount int64) (float64, error) { totalCount := float64(0) isSingleCol := len(idx.Info.Columns) == 1 for _, indexRange := range indexRanges { @@ -1226,7 +1226,6 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, indexRanges []*range } l := types.NewBytesDatum(lb) r := types.NewBytesDatum(rb) - totalCount += idx.BetweenRowCount(l, r) lowIsNull := bytes.Equal(lb, nullKeyBytes) if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) { totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount() @@ -1234,6 +1233,17 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, indexRanges []*range if isSingleCol && lowIsNull { totalCount += float64(idx.NullCount) } + // Due to the limitation of calcFraction and convertDatumToScalar, the histogram actually 
won't estimate anything + // if the first column's range is a point. + if rangePosition := GetOrdinalOfRangeCond(sc, indexRange); rangePosition > 0 && idx.StatsVer == Version2 && coll != nil { + expBackoffSel, err := idx.expBackoffEstimation(sc, coll, indexRange) + if err != nil { + return 0, err + } + totalCount += expBackoffSel * idx.TotalRowCount() + } else { + totalCount += idx.BetweenRowCount(l, r) + } } if totalCount > idx.TotalRowCount() { totalCount = idx.TotalRowCount() @@ -1241,6 +1251,64 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, indexRanges []*range return totalCount, nil } +// expBackoffEstimation estimates the multi-col cases following the Exponential Backoff. See comment below for details. +func (idx *Index) expBackoffEstimation(sc *stmtctx.StatementContext, coll *HistColl, indexRange *ranger.Range) (float64, error) { + tmpRan := []*ranger.Range{ + { + LowVal: make([]types.Datum, 1), + HighVal: make([]types.Datum, 1), + }, + } + colsIDs := coll.Idx2ColumnIDs[idx.ID] + singleColumnEstResults := make([]float64, 0, len(indexRange.LowVal)) + // The following code uses Exponential Backoff to reduce the impact of the independence assumption. It works like: + // 1. Calc the selectivity of each column. + // 2. Sort them and choose the first 4 most selective filters, whose corresponding selectivities are sel_1, sel_2, sel_3, sel_4 where i < j => sel_i < sel_j. + // 3. The final selectivity would be sel_1 * sel_2^{1/2} * sel_3^{1/4} * sel_4^{1/8}. + // This calculation reduces the impact of the independence assumption and can work better than it. 
+ for i := 0; i < len(indexRange.LowVal); i++ { + tmpRan[0].LowVal[0] = indexRange.LowVal[i] + tmpRan[0].HighVal[0] = indexRange.HighVal[i] + if i == len(indexRange.LowVal)-1 { + tmpRan[0].LowExclude = indexRange.LowExclude + tmpRan[0].HighExclude = indexRange.HighExclude + } + colID := colsIDs[i] + var ( + count float64 + err error + ) + if anotherIdxID, ok := coll.ColID2IdxID[colID]; ok && anotherIdxID != idx.ID { + count, err = coll.GetRowCountByIndexRanges(sc, anotherIdxID, tmpRan) + } else if col, ok := coll.Columns[colID]; ok && !col.IsInvalid(sc, coll.Pseudo) { + count, err = coll.GetRowCountByColumnRanges(sc, colID, tmpRan) + } else { + continue + } + if err != nil { + return 0, err + } + singleColumnEstResults = append(singleColumnEstResults, count) + } + // Sort them. + sort.Slice(singleColumnEstResults, func(i, j int) bool { + return singleColumnEstResults[i] < singleColumnEstResults[j] + }) + l := len(singleColumnEstResults) + // Convert the first 4 to selectivity results. + for i := 0; i < l && i < 4; i++ { + singleColumnEstResults[i] = singleColumnEstResults[i] / float64(coll.Count) + } + if l == 1 { + return singleColumnEstResults[0], nil + } else if l == 2 { + return singleColumnEstResults[0] * math.Sqrt(singleColumnEstResults[1]), nil + } else if l == 3 { + return singleColumnEstResults[0] * math.Sqrt(singleColumnEstResults[1]) * math.Sqrt(math.Sqrt(singleColumnEstResults[2])), nil + } + return singleColumnEstResults[0] * math.Sqrt(singleColumnEstResults[1]) * math.Sqrt(math.Sqrt(singleColumnEstResults[2])) * math.Sqrt(math.Sqrt(math.Sqrt(singleColumnEstResults[3]))), nil +} + type countByRangeFunc = func(*stmtctx.StatementContext, int64, []*ranger.Range) (float64, error) // newHistogramBySelectivity fulfills the content of new histogram by the given selectivity result. 
diff --git a/statistics/integration_test.go b/statistics/integration_test.go index c2a8dd904c1d1..4d75569cdf41d 100644 --- a/statistics/integration_test.go +++ b/statistics/integration_test.go @@ -19,27 +19,31 @@ import ( "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/util/testkit" "github.com/pingcap/tidb/util/testleak" + "github.com/pingcap/tidb/util/testutil" ) var _ = Suite(&testIntegrationSuite{}) type testIntegrationSuite struct { - store kv.Storage - do *domain.Domain + store kv.Storage + do *domain.Domain + testData testutil.TestData } func (s *testIntegrationSuite) SetUpSuite(c *C) { testleak.BeforeTest() - // Add the hook here to avoid data race. var err error s.store, s.do, err = newStoreWithBootstrap() c.Assert(err, IsNil) + s.testData, err = testutil.LoadTestSuiteData("testdata", "integration_suite") + c.Assert(err, IsNil) } func (s *testIntegrationSuite) TearDownSuite(c *C) { s.do.Close() c.Assert(s.store.Close(), IsNil) testleak.AfterTest(c)() + c.Assert(s.testData.GenerateOutputIfNeeded(), IsNil) } func (s *testIntegrationSuite) TestChangeVerTo2Behavior(c *C) { @@ -185,3 +189,28 @@ func (s *testIntegrationSuite) TestIncAnalyzeOnVer2(c *C) { "test t idx 1 4 4", )) } + +func (s *testIntegrationSuite) TestExpBackoffEstimation(c *C) { + defer cleanEnv(c, s.store, s.do) + tk := testkit.NewTestKit(c, s.store) + tk.MustExec("use test") + tk.MustExec("create table exp_backoff(a int, b int, c int, d int, index idx(a, b, c, d))") + tk.MustExec("insert into exp_backoff values(1, 1, 1, 1), (1, 1, 1, 2), (1, 1, 2, 3), (1, 2, 2, 4), (1, 2, 3, 5)") + tk.MustExec("set @@session.tidb_analyze_version=2") + tk.MustExec("analyze table exp_backoff") + var ( + input []string + output [][]string + ) + s.testData.GetTestCases(c, &input, &output) + // The test cases are: + // Query a = 1, b = 1, c = 1, d >= 3 and d <= 5 separately. We got 5, 3, 2, 3. + // And then query a = 1 and b = 1 and c = 1 and d >= 3 and d <= 5. 
Its result should follow the exp backoff, + // which is 2/5 * (3/5)^{1/2} * (3/5)^{1/4} * 1^{1/8} * 5 = 1.3634. + for i := 0; i < len(input); i++ { + s.testData.OnRecord(func() { + output[i] = s.testData.ConvertRowsToStrings(tk.MustQuery(input[i]).Rows()) + }) + tk.MustQuery(input[i]).Check(testkit.Rows(output[i]...)) + } +} diff --git a/statistics/table.go b/statistics/table.go index 293899c286511..94f41879f06f0 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -339,7 +339,7 @@ func (coll *HistColl) GetRowCountByIndexRanges(sc *stmtctx.StatementContext, idx if idx.CMSketch != nil && idx.StatsVer == Version1 { result, err = coll.getIndexRowCount(sc, idxID, indexRanges) } else { - result, err = idx.GetRowCount(sc, indexRanges, coll.ModifyCount) + result, err = idx.GetRowCount(sc, coll, indexRanges, coll.ModifyCount) } result *= idx.GetIncreaseFactor(coll.Count) return result, errors.Trace(err) @@ -575,7 +575,7 @@ func (coll *HistColl) getIndexRowCount(sc *stmtctx.StatementContext, idxID int64 // on single-column index, use previous way as well, because CMSketch does not contain null // values in this case. 
if rangePosition == 0 || isSingleColIdxNullRange(idx, ran) { - count, err := idx.GetRowCount(sc, []*ranger.Range{ran}, coll.ModifyCount) + count, err := idx.GetRowCount(sc, nil, []*ranger.Range{ran}, coll.ModifyCount) if err != nil { return 0, errors.Trace(err) } diff --git a/statistics/testdata/integration_suite_in.json b/statistics/testdata/integration_suite_in.json new file mode 100644 index 0000000000000..733a1203f0c7c --- /dev/null +++ b/statistics/testdata/integration_suite_in.json @@ -0,0 +1,12 @@ +[ + { + "name": "TestExpBackoffEstimation", + "cases": [ + "explain select * from exp_backoff where a = 1", + "explain select * from exp_backoff where b = 1", + "explain select * from exp_backoff where c = 1", + "explain select * from exp_backoff where d >= 3 and d <= 5", + "explain select * from exp_backoff where a = 1 and b = 1 and c = 1 and d >= 3 and d<= 5" + ] + } +] diff --git a/statistics/testdata/integration_suite_out.json b/statistics/testdata/integration_suite_out.json new file mode 100644 index 0000000000000..f8b3d60714869 --- /dev/null +++ b/statistics/testdata/integration_suite_out.json @@ -0,0 +1,30 @@ +[ + { + "Name": "TestExpBackoffEstimation", + "Cases": [ + [ + "IndexReader_6 5.00 root index:IndexRangeScan_5", + "└─IndexRangeScan_5 5.00 cop[tikv] table:exp_backoff, index:idx(a, b, c, d) range:[1,1], keep order:false" + ], + [ + "TableReader_7 3.00 root data:Selection_6", + "└─Selection_6 3.00 cop[tikv] eq(test.exp_backoff.b, 1)", + " └─TableFullScan_5 5.00 cop[tikv] table:exp_backoff keep order:false" + ], + [ + "TableReader_7 2.00 root data:Selection_6", + "└─Selection_6 2.00 cop[tikv] eq(test.exp_backoff.c, 1)", + " └─TableFullScan_5 5.00 cop[tikv] table:exp_backoff keep order:false" + ], + [ + "TableReader_7 3.00 root data:Selection_6", + "└─Selection_6 3.00 cop[tikv] ge(test.exp_backoff.d, 3), le(test.exp_backoff.d, 5)", + " └─TableFullScan_5 5.00 cop[tikv] table:exp_backoff keep order:false" + ], + [ + "IndexReader_6 1.36 root 
index:IndexRangeScan_5", + "└─IndexRangeScan_5 1.36 cop[tikv] table:exp_backoff, index:idx(a, b, c, d) range:[1 1 1 3,1 1 1 5], keep order:false" + ] + ] + } +] diff --git a/statistics/testdata/stats_suite_out.json b/statistics/testdata/stats_suite_out.json index 6252ec6f4c446..8206784fd779f 100644 --- a/statistics/testdata/stats_suite_out.json +++ b/statistics/testdata/stats_suite_out.json @@ -318,8 +318,8 @@ " └─TableFullScan_5 4.00 cop[tikv] table:tdatetime keep order:false" ], [ - "TableReader_7 1.33 root data:Selection_6", - "└─Selection_6 1.33 cop[tikv] eq(test.tint.b, 1)", + "TableReader_7 1.00 root data:Selection_6", + "└─Selection_6 1.00 cop[tikv] eq(test.tint.b, 1)", " └─TableFullScan_5 8.00 cop[tikv] table:tint keep order:false" ], [ @@ -328,8 +328,8 @@ " └─TableFullScan_5 8.00 cop[tikv] table:tint keep order:false" ], [ - "TableReader_7 1.01 root data:Selection_6", - "└─Selection_6 1.01 cop[tikv] eq(test.tint.b, 8)", + "TableReader_7 1.00 root data:Selection_6", + "└─Selection_6 1.00 cop[tikv] eq(test.tint.b, 8)", " └─TableFullScan_5 8.00 cop[tikv] table:tint keep order:false" ], [ @@ -405,8 +405,8 @@ "└─TableRangeScan_5 3.00 cop[tikv] table:ct1 range:[\"6\",\"8\"], keep order:false" ], [ - "TableReader_6 0.00 root data:TableRangeScan_5", - "└─TableRangeScan_5 0.00 cop[tikv] table:ct2 range:[1 1,1 8], keep order:false" + "TableReader_6 1.00 root data:TableRangeScan_5", + "└─TableRangeScan_5 1.00 cop[tikv] table:ct2 range:[1 1,1 8], keep order:false" ], [ "TableReader_6 1.00 root data:TableRangeScan_5", diff --git a/util/sqlexec/utils_test.go b/util/sqlexec/utils_test.go index d5dd0542b6bd5..a8a912a33978f 100644 --- a/util/sqlexec/utils_test.go +++ b/util/sqlexec/utils_test.go @@ -276,8 +276,8 @@ func (s *testUtilsSuite) TestEscapeSQL(c *C) { { name: "time 3", input: "select %?", - params: []interface{}{time.Unix(0, 888888888)}, - output: "select '1970-01-01 08:00:00.888888'", + params: []interface{}{time.Unix(0, 888888888).UTC()}, + output: "select 
'1970-01-01 00:00:00.888888'", err: "", }, {