Merge branch 'master' into snapshot-fail-test
sticnarf authored Feb 24, 2021
2 parents ba9706e + 6d6c833 commit 16581f0
Showing 8 changed files with 155 additions and 16 deletions.
2 changes: 1 addition & 1 deletion statistics/handle/update.go
@@ -1137,7 +1137,7 @@ func (h *Handle) RecalculateExpectCount(q *statistics.QueryFeedback) error {
expected := 0.0
if isIndex {
idx := t.Indices[id]
expected, err = idx.GetRowCount(sc, ranges, t.ModifyCount)
expected, err = idx.GetRowCount(sc, nil, ranges, t.ModifyCount)
expected *= idx.GetIncreaseFactor(t.Count)
} else {
c := t.Columns[id]
72 changes: 70 additions & 2 deletions statistics/histogram.go
@@ -1189,7 +1189,7 @@ func (idx *Index) QueryBytes(d []byte) uint64 {

// GetRowCount returns the row count of the given ranges.
// It uses the modifyCount to adjust the influence of modifications on the table.
func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.Range, modifyCount int64) (float64, error) {
func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, coll *HistColl, indexRanges []*ranger.Range, modifyCount int64) (float64, error) {
totalCount := float64(0)
isSingleCol := len(idx.Info.Columns) == 1
for _, indexRange := range indexRanges {
@@ -1226,21 +1226,89 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, indexRanges []*range
}
l := types.NewBytesDatum(lb)
r := types.NewBytesDatum(rb)
totalCount += idx.BetweenRowCount(l, r)
lowIsNull := bytes.Equal(lb, nullKeyBytes)
if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) {
totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount()
}
if isSingleCol && lowIsNull {
totalCount += float64(idx.NullCount)
}
// Due to the limitation of calcFraction and convertDatumToScalar, the histogram actually won't estimate anything
// if the first column's range is a point.
if rangePosition := GetOrdinalOfRangeCond(sc, indexRange); rangePosition > 0 && idx.StatsVer == Version2 && coll != nil {
expBackoffSel, err := idx.expBackoffEstimation(sc, coll, indexRange)
if err != nil {
return 0, err
}
totalCount += expBackoffSel * idx.TotalRowCount()
} else {
totalCount += idx.BetweenRowCount(l, r)
}
}
if totalCount > idx.TotalRowCount() {
totalCount = idx.TotalRowCount()
}
return totalCount, nil
}

// expBackoffEstimation estimates the multi-column cases following the exponential backoff approach. See the comment below for details.
func (idx *Index) expBackoffEstimation(sc *stmtctx.StatementContext, coll *HistColl, indexRange *ranger.Range) (float64, error) {
tmpRan := []*ranger.Range{
{
LowVal: make([]types.Datum, 1),
HighVal: make([]types.Datum, 1),
},
}
colsIDs := coll.Idx2ColumnIDs[idx.ID]
singleColumnEstResults := make([]float64, 0, len(indexRange.LowVal))
// The following code uses exponential backoff to reduce the impact of the independence assumption. It works as follows:
// 1. Calculate the selectivity of each column.
// 2. Sort them and pick the 4 most selective filters, with selectivities sel_1, sel_2, sel_3, sel_4 where i < j => sel_i <= sel_j.
// 3. The final selectivity would be sel_1 * sel_2^{1/2} * sel_3^{1/4} * sel_4^{1/8}.
// This calculation reduces the reliance on the independence assumption and usually works better than it.
for i := 0; i < len(indexRange.LowVal); i++ {
tmpRan[0].LowVal[0] = indexRange.LowVal[i]
tmpRan[0].HighVal[0] = indexRange.HighVal[i]
if i == len(indexRange.LowVal)-1 {
tmpRan[0].LowExclude = indexRange.LowExclude
tmpRan[0].HighExclude = indexRange.HighExclude
}
colID := colsIDs[i]
var (
count float64
err error
)
if anotherIdxID, ok := coll.ColID2IdxID[colID]; ok && anotherIdxID != idx.ID {
count, err = coll.GetRowCountByIndexRanges(sc, anotherIdxID, tmpRan)
} else if col, ok := coll.Columns[colID]; ok && !col.IsInvalid(sc, coll.Pseudo) {
count, err = coll.GetRowCountByColumnRanges(sc, colID, tmpRan)
} else {
continue
}
if err != nil {
return 0, err
}
singleColumnEstResults = append(singleColumnEstResults, count)
}
// Sort them.
sort.Slice(singleColumnEstResults, func(i, j int) bool {
return singleColumnEstResults[i] < singleColumnEstResults[j]
})
l := len(singleColumnEstResults)
// Convert the first 4 to selectivity results.
for i := 0; i < l && i < 4; i++ {
singleColumnEstResults[i] = singleColumnEstResults[i] / float64(coll.Count)
}
if l == 1 {
return singleColumnEstResults[0], nil
} else if l == 2 {
return singleColumnEstResults[0] * math.Sqrt(singleColumnEstResults[1]), nil
} else if l == 3 {
return singleColumnEstResults[0] * math.Sqrt(singleColumnEstResults[1]) * math.Sqrt(math.Sqrt(singleColumnEstResults[2])), nil
}
return singleColumnEstResults[0] * math.Sqrt(singleColumnEstResults[1]) * math.Sqrt(math.Sqrt(singleColumnEstResults[2])) * math.Sqrt(math.Sqrt(math.Sqrt(singleColumnEstResults[3]))), nil
}

type countByRangeFunc = func(*stmtctx.StatementContext, int64, []*ranger.Range) (float64, error)

// newHistogramBySelectivity fulfills the content of new histogram by the given selectivity result.
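To make the estimation above concrete, here is a minimal standalone sketch of the exponential-backoff combination that expBackoffEstimation performs. It is not TiDB code: the helper name and the inputs are hypothetical, and in the real implementation the per-column row counts come from GetRowCountByIndexRanges / GetRowCountByColumnRanges.

```go
package main

import (
	"fmt"
	"math"
	"sort"
)

// expBackoffSelectivity combines per-column row-count estimates into one
// selectivity using exponents 1, 1/2, 1/4, 1/8 on the (at most) four most
// selective columns. rowCounts and tableRows are hypothetical inputs.
func expBackoffSelectivity(rowCounts []float64, tableRows float64) float64 {
	sels := make([]float64, len(rowCounts))
	for i, c := range rowCounts {
		sels[i] = c / tableRows
	}
	sort.Float64s(sels) // smallest (most selective) first
	sel := 1.0
	for i := 0; i < len(sels) && i < 4; i++ {
		sel *= math.Pow(sels[i], 1/math.Exp2(float64(i))) // exponents 1, 1/2, 1/4, 1/8
	}
	return sel
}

func main() {
	// Three hypothetical column estimates on a 100-row table:
	// 10, 50 and 80 matching rows give roughly 6.7 expected rows after backoff.
	fmt.Printf("%.2f\n", expBackoffSelectivity([]float64{10, 50, 80}, 100)*100)
}
```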
35 changes: 32 additions & 3 deletions statistics/integration_test.go
@@ -19,27 +19,31 @@ import (
"github.com/pingcap/tidb/kv"
"github.com/pingcap/tidb/util/testkit"
"github.com/pingcap/tidb/util/testleak"
"github.com/pingcap/tidb/util/testutil"
)

var _ = Suite(&testIntegrationSuite{})

type testIntegrationSuite struct {
store kv.Storage
do *domain.Domain
store kv.Storage
do *domain.Domain
testData testutil.TestData
}

func (s *testIntegrationSuite) SetUpSuite(c *C) {
testleak.BeforeTest()
// Add the hook here to avoid a data race.
var err error
s.store, s.do, err = newStoreWithBootstrap()
c.Assert(err, IsNil)
s.testData, err = testutil.LoadTestSuiteData("testdata", "integration_suite")
c.Assert(err, IsNil)
}

func (s *testIntegrationSuite) TearDownSuite(c *C) {
s.do.Close()
c.Assert(s.store.Close(), IsNil)
testleak.AfterTest(c)()
c.Assert(s.testData.GenerateOutputIfNeeded(), IsNil)
}

func (s *testIntegrationSuite) TestChangeVerTo2Behavior(c *C) {
@@ -185,3 +189,28 @@ func (s *testIntegrationSuite) TestIncAnalyzeOnVer2(c *C) {
"test t idx 1 4 4",
))
}

func (s *testIntegrationSuite) TestExpBackoffEstimation(c *C) {
defer cleanEnv(c, s.store, s.do)
tk := testkit.NewTestKit(c, s.store)
tk.MustExec("use test")
tk.MustExec("create table exp_backoff(a int, b int, c int, d int, index idx(a, b, c, d))")
tk.MustExec("insert into exp_backoff values(1, 1, 1, 1), (1, 1, 1, 2), (1, 1, 2, 3), (1, 2, 2, 4), (1, 2, 3, 5)")
tk.MustExec("set @@session.tidb_analyze_version=2")
tk.MustExec("analyze table exp_backoff")
var (
input []string
output [][]string
)
s.testData.GetTestCases(c, &input, &output)
// The test cases are:
// Query a = 1, b = 1, c = 1, and d >= 3 and d <= 5 separately. We get row counts 5, 3, 2, 3.
// Then query a = 1 and b = 1 and c = 1 and d >= 3 and d <= 5. Its result should follow the exp backoff,
// which is 2/5 * (3/5)^{1/2} * (3/5)^{1/4} * 1^{1/8} * 5 ≈ 1.3634.
for i := 0; i < len(input); i++ {
s.testData.OnRecord(func() {
output[i] = s.testData.ConvertRowsToStrings(tk.MustQuery(input[i]).Rows())
})
tk.MustQuery(input[i]).Check(testkit.Rows(output[i]...))
}
}
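As a quick, hedged sanity check of the arithmetic in the comment above (not part of the test itself), the exp-backoff formula applied to the counts 5, 3, 2, 3 on the 5-row table reproduces the 1.36-row estimate recorded in integration_suite_out.json below:

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	// Selectivities sorted most selective first: 2/5, 3/5, 3/5, 5/5.
	sel := (2.0 / 5) * math.Sqrt(3.0/5) * math.Pow(3.0/5, 0.25) * math.Pow(5.0/5, 0.125)
	fmt.Printf("%.2f\n", sel*5) // prints 1.36, matching the IndexRangeScan row estimate
}
```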
4 changes: 2 additions & 2 deletions statistics/table.go
@@ -339,7 +339,7 @@ func (coll *HistColl) GetRowCountByIndexRanges(sc *stmtctx.StatementContext, idx
if idx.CMSketch != nil && idx.StatsVer == Version1 {
result, err = coll.getIndexRowCount(sc, idxID, indexRanges)
} else {
result, err = idx.GetRowCount(sc, indexRanges, coll.ModifyCount)
result, err = idx.GetRowCount(sc, coll, indexRanges, coll.ModifyCount)
}
result *= idx.GetIncreaseFactor(coll.Count)
return result, errors.Trace(err)
@@ -575,7 +575,7 @@ func (coll *HistColl) getIndexRowCount(sc *stmtctx.StatementContext, idxID int64
// on single-column index, use previous way as well, because CMSketch does not contain null
// values in this case.
if rangePosition == 0 || isSingleColIdxNullRange(idx, ran) {
count, err := idx.GetRowCount(sc, []*ranger.Range{ran}, coll.ModifyCount)
count, err := idx.GetRowCount(sc, nil, []*ranger.Range{ran}, coll.ModifyCount)
if err != nil {
return 0, errors.Trace(err)
}
12 changes: 12 additions & 0 deletions statistics/testdata/integration_suite_in.json
@@ -0,0 +1,12 @@
[
{
"name": "TestExpBackoffEstimation",
"cases": [
"explain select * from exp_backoff where a = 1",
"explain select * from exp_backoff where b = 1",
"explain select * from exp_backoff where c = 1",
"explain select * from exp_backoff where d >= 3 and d <= 5",
"explain select * from exp_backoff where a = 1 and b = 1 and c = 1 and d >= 3 and d<= 5"
]
}
]
30 changes: 30 additions & 0 deletions statistics/testdata/integration_suite_out.json
@@ -0,0 +1,30 @@
[
{
"Name": "TestExpBackoffEstimation",
"Cases": [
[
"IndexReader_6 5.00 root index:IndexRangeScan_5",
"└─IndexRangeScan_5 5.00 cop[tikv] table:exp_backoff, index:idx(a, b, c, d) range:[1,1], keep order:false"
],
[
"TableReader_7 3.00 root data:Selection_6",
"└─Selection_6 3.00 cop[tikv] eq(test.exp_backoff.b, 1)",
" └─TableFullScan_5 5.00 cop[tikv] table:exp_backoff keep order:false"
],
[
"TableReader_7 2.00 root data:Selection_6",
"└─Selection_6 2.00 cop[tikv] eq(test.exp_backoff.c, 1)",
" └─TableFullScan_5 5.00 cop[tikv] table:exp_backoff keep order:false"
],
[
"TableReader_7 3.00 root data:Selection_6",
"└─Selection_6 3.00 cop[tikv] ge(test.exp_backoff.d, 3), le(test.exp_backoff.d, 5)",
" └─TableFullScan_5 5.00 cop[tikv] table:exp_backoff keep order:false"
],
[
"IndexReader_6 1.36 root index:IndexRangeScan_5",
"└─IndexRangeScan_5 1.36 cop[tikv] table:exp_backoff, index:idx(a, b, c, d) range:[1 1 1 3,1 1 1 5], keep order:false"
]
]
}
]
12 changes: 6 additions & 6 deletions statistics/testdata/stats_suite_out.json
@@ -318,8 +318,8 @@
" └─TableFullScan_5 4.00 cop[tikv] table:tdatetime keep order:false"
],
[
"TableReader_7 1.33 root data:Selection_6",
"└─Selection_6 1.33 cop[tikv] eq(test.tint.b, 1)",
"TableReader_7 1.00 root data:Selection_6",
"└─Selection_6 1.00 cop[tikv] eq(test.tint.b, 1)",
" └─TableFullScan_5 8.00 cop[tikv] table:tint keep order:false"
],
[
@@ -328,8 +328,8 @@
" └─TableFullScan_5 8.00 cop[tikv] table:tint keep order:false"
],
[
"TableReader_7 1.01 root data:Selection_6",
"└─Selection_6 1.01 cop[tikv] eq(test.tint.b, 8)",
"TableReader_7 1.00 root data:Selection_6",
"└─Selection_6 1.00 cop[tikv] eq(test.tint.b, 8)",
" └─TableFullScan_5 8.00 cop[tikv] table:tint keep order:false"
],
[
@@ -405,8 +405,8 @@
"└─TableRangeScan_5 3.00 cop[tikv] table:ct1 range:[\"6\",\"8\"], keep order:false"
],
[
"TableReader_6 0.00 root data:TableRangeScan_5",
"└─TableRangeScan_5 0.00 cop[tikv] table:ct2 range:[1 1,1 8], keep order:false"
"TableReader_6 1.00 root data:TableRangeScan_5",
"└─TableRangeScan_5 1.00 cop[tikv] table:ct2 range:[1 1,1 8], keep order:false"
],
[
"TableReader_6 1.00 root data:TableRangeScan_5",
4 changes: 2 additions & 2 deletions util/sqlexec/utils_test.go
@@ -276,8 +276,8 @@ func (s *testUtilsSuite) TestEscapeSQL(c *C) {
{
name: "time 3",
input: "select %?",
params: []interface{}{time.Unix(0, 888888888)},
output: "select '1970-01-01 08:00:00.888888'",
params: []interface{}{time.Unix(0, 888888888).UTC()},
output: "select '1970-01-01 00:00:00.888888'",
err: "",
},
{
