From 5ff6bc19538357c2e72d4ad667f31258754c19a3 Mon Sep 17 00:00:00 2001 From: Yuanjia Zhang Date: Mon, 13 Jul 2020 15:20:49 +0800 Subject: [PATCH 1/5] update a heuristic rule --- statistics/histogram.go | 8 ++++---- statistics/selectivity_test.go | 6 +++--- statistics/table.go | 23 +++++++++++++++++------ 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/statistics/histogram.go b/statistics/histogram.go index d0ca7c09c58f9..e5f56d4acf633 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -764,7 +764,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo return 0.0, nil } if c.NDV > 0 && c.outOfRange(val) { - return float64(modifyCount) / float64(c.NDV), nil + return outOfRangeEQSelectivity(c.NDV, modifyCount) * c.TotalRowCount(), nil } if c.CMSketch != nil { count, err := c.CMSketch.queryValue(sc, val) @@ -829,7 +829,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range // The interval case. cnt := c.BetweenRowCount(lowVal, highVal) if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) { - cnt += float64(modifyCount) / outOfRangeBetweenRate + cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount) * c.TotalRowCount() } // `betweenRowCount` returns count for [l, h) range, we adjust cnt for boudaries here. // Note that, `cnt` does not include null values, we need specially handle cases @@ -891,7 +891,7 @@ func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCo } val := types.NewBytesDatum(b) if idx.NDV > 0 && idx.outOfRange(val) { - return float64(modifyCount) / (float64(idx.NDV)), nil + return outOfRangeEQSelectivity(idx.NDV, modifyCount) * idx.TotalRowCount(), nil } if idx.CMSketch != nil { return float64(idx.CMSketch.QueryBytes(b)), nil @@ -943,7 +943,7 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, indexRanges []*range totalCount += idx.BetweenRowCount(l, r) lowIsNull := bytes.Equal(lb, nullKeyBytes) if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) { - totalCount += float64(modifyCount) / outOfRangeBetweenRate + totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount) * idx.TotalRowCount() } if isSingleCol && lowIsNull { totalCount += float64(idx.NullCount) diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go index 8a4e402f9c7f2..a2e32dc391cf0 100644 --- a/statistics/selectivity_test.go +++ b/statistics/selectivity_test.go @@ -396,15 +396,15 @@ func (s *testStatsSuite) TestEstimationForUnknownValues(c *C) { colID := table.Meta().Columns[0].ID count, err := statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(30, 30)) c.Assert(err, IsNil) - c.Assert(count, Equals, 2.0) + c.Assert(count, Equals, 0.2) count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, 30)) c.Assert(err, IsNil) - c.Assert(count, Equals, 4.2) + c.Assert(count, Equals, 2.4000000000000004) count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, math.MaxInt64)) c.Assert(err, IsNil) - c.Assert(count, Equals, 4.2) + c.Assert(count, Equals, 2.4000000000000004) idxID := table.Meta().Indices[0].ID count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(30, 30)) diff --git a/statistics/table.go b/statistics/table.go index d48a8206282b1..63335ce5b091a 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -392,6 +392,21 @@ func isSingleColIdxNullRange(idx *Index, ran *ranger.Range) bool { return false } +func outOfRangeEQSelectivity(ndv, modifyRows int64) float64 { + // It must be 0 since the histogram contains the whole data if modifyRows is 0. + if modifyRows == 0 { + return 0 + } + // We simply set its selectivity to 1/NDV, and the magic number outOfRangeBetweenRate + // is used to avoid wrong selectivity caused by small NDV. + if ndv < outOfRangeBetweenRate { + ndv = outOfRangeBetweenRate + } + // TODO: After extracting TopN from histograms, we can minus the TopN fraction here. + // Please see https://github.com/pingcap/tidb/issues/18461 for more details. + return 1 / float64(ndv) +} + // getEqualCondSelectivity gets the selectivity of the equal conditions. func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedColsLen int) float64 { coverAll := len(idx.Info.Columns) == usedColsLen @@ -404,8 +419,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols // When the value is out of range, we could not found this value in the CM Sketch, // so we use heuristic methods to estimate the selectivity. if idx.NDV > 0 && coverAll { - // for equality queries - return float64(coll.ModifyCount) / float64(idx.NDV) / idx.TotalRowCount() + return outOfRangeEQSelectivity(idx.NDV, coll.ModifyCount) } // The equal condition only uses prefix columns of the index. colIDs := coll.Idx2ColumnIDs[idx.ID] @@ -416,10 +430,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols } ndv = mathutil.MaxInt64(ndv, coll.Columns[colID].NDV) } - if ndv > 0 { - return float64(coll.ModifyCount) / float64(ndv) / idx.TotalRowCount() - } - return float64(coll.ModifyCount) / outOfRangeBetweenRate / idx.TotalRowCount() + return outOfRangeEQSelectivity(ndv, coll.ModifyCount) } return float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount()) } From 394f5f1f44f8a6c10cb6f8c4eb2677a0b25748e5 Mon Sep 17 00:00:00 2001 From: Yuanjia Zhang Date: Tue, 14 Jul 2020 16:00:48 +0800 Subject: [PATCH 2/5] fix CI --- planner/core/cbo_test.go | 4 ++-- planner/core/testdata/analyze_suite_out.json | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/planner/core/cbo_test.go b/planner/core/cbo_test.go index ac359deacc7ad..781952adf3e05 100644 --- a/planner/core/cbo_test.go +++ b/planner/core/cbo_test.go @@ -414,8 +414,8 @@ func (s *testAnalyzeSuite) TestOutdatedAnalyze(c *C) { c.Assert(h.Update(dom.InfoSchema()), IsNil) statistics.RatioOfPseudoEstimate.Store(10.0) testKit.MustQuery("explain select * from t where a <= 5 and b <= 5").Check(testkit.Rows( - "TableReader_7 35.91 root data:Selection_6", - "└─Selection_6 35.91 cop[tikv] le(test.t.a, 5), le(test.t.b, 5)", + "TableReader_7 29.77 root data:Selection_6", + "└─Selection_6 29.77 cop[tikv] le(test.t.a, 5), le(test.t.b, 5)", " └─TableFullScan_5 80.00 cop[tikv] table:t keep order:false", )) statistics.RatioOfPseudoEstimate.Store(0.7) diff --git a/planner/core/testdata/analyze_suite_out.json b/planner/core/testdata/analyze_suite_out.json index 38ee91b1a89ac..0cfa239b2f3e6 100644 --- a/planner/core/testdata/analyze_suite_out.json +++ b/planner/core/testdata/analyze_suite_out.json @@ -347,17 +347,18 @@ { "SQL": "explain select * from t where a = 7639902", "Plan": [ - "IndexReader_6 2.03 root index:IndexRangeScan_5", - "└─IndexRangeScan_5 2.03 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false" + "IndexReader_6 6.68 root index:IndexRangeScan_5", + "└─IndexRangeScan_5 6.68 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false" ] }, { "SQL": "explain select c, b from t where a = 7639902 order by b asc limit 6", "Plan": [ - "Projection_7 2.03 root test.t.c, test.t.b", - "└─TopN_10 2.03 root test.t.b, offset:0, count:6", - " └─IndexReader_18 2.03 root index:IndexRangeScan_17", - " └─IndexRangeScan_17 2.03 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false" + "Projection_7 6.00 root test.t.c, test.t.b", + "└─TopN_8 6.00 root test.t.b, offset:0, count:6", + " └─IndexReader_16 6.00 root index:TopN_15", + " └─TopN_15 6.00 cop[tikv] test.t.b, offset:0, count:6", + " └─IndexRangeScan_14 6.68 cop[tikv] table:t, index:PRIMARY(a, c, b) range:[7639902,7639902], keep order:false" ] } ] From b3b8bc7479db539c60f8cf9eb4d1aa42260b20bf Mon Sep 17 00:00:00 2001 From: Yuanjia Zhang Date: Tue, 14 Jul 2020 16:07:51 +0800 Subject: [PATCH 3/5] fix CI --- cmd/explaintest/r/explain_union_scan.result | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cmd/explaintest/r/explain_union_scan.result b/cmd/explaintest/r/explain_union_scan.result index 45f9ef1e4ab1a..087f0d62efa21 100644 --- a/cmd/explaintest/r/explain_union_scan.result +++ b/cmd/explaintest/r/explain_union_scan.result @@ -16,16 +16,16 @@ Limit_20 10.00 root offset:0, count:10 └─HashJoin_22 10.00 root left outer join, equal:[eq(test.city.province_id, test.city.province_id)] ├─Limit_25(Build) 10.00 root offset:0, count:10 │ └─IndexJoin_38 10.00 root inner join, inner:UnionScan_37, outer key:test.city.id, inner key:test.city.id - │ ├─UnionScan_47(Build) 10.33 root - │ │ └─TableReader_49 10.33 root data:TableFullScan_48 - │ │ └─TableFullScan_48 10.33 cop[tikv] table:t2 keep order:false - │ └─UnionScan_37(Probe) 0.97 root gt(test.city.province_id, 1), lt(test.city.province_id, 100) - │ └─IndexLookUp_36 0.97 root + │ ├─UnionScan_47(Build) 10.00 root + │ │ └─TableReader_49 10.00 root data:TableFullScan_48 + │ │ └─TableFullScan_48 10.00 cop[tikv] table:t2 keep order:false + │ └─UnionScan_37(Probe) 1.00 root gt(test.city.province_id, 1), lt(test.city.province_id, 100) + │ └─IndexLookUp_36 1.00 root │ ├─IndexRangeScan_33(Build) 1.00 cop[tikv] table:t1, index:PRIMARY(id) range: decided by [eq(test.city.id, test.city.id)], keep order:false - │ └─Selection_35(Probe) 0.97 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100) + │ └─Selection_35(Probe) 1.00 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100) │ └─TableRowIDScan_34 1.00 cop[tikv] table:t1 keep order:false - └─UnionScan_57(Probe) 519304.44 root gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id)) - └─TableReader_60 519304.44 root data:Selection_59 - └─Selection_59 519304.44 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id)) + └─UnionScan_57(Probe) 536284.00 root gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id)) + └─TableReader_60 536284.00 root data:Selection_59 + └─Selection_59 536284.00 cop[tikv] gt(test.city.province_id, 1), lt(test.city.province_id, 100), not(isnull(test.city.province_id)) └─TableFullScan_58 536284.00 cop[tikv] table:t3 keep order:false commit; From 413462a2d3caac50d1047e22e7cc014fdc8ce304 Mon Sep 17 00:00:00 2001 From: Yuanjia Zhang Date: Tue, 14 Jul 2020 16:16:39 +0800 Subject: [PATCH 4/5] fix CI --- statistics/handle/update_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/statistics/handle/update_test.go b/statistics/handle/update_test.go index 742d7f7f80fbe..06196cb9f226f 100644 --- a/statistics/handle/update_test.go +++ b/statistics/handle/update_test.go @@ -1524,8 +1524,8 @@ func (s *testStatsSuite) TestAbnormalIndexFeedback(c *C) { sql: "select * from t where a = 2 and b > 10", hist: "column:2 ndv:20 totColSize:20\n" + "num: 5 lower_bound: -9223372036854775808 upper_bound: 7 repeats: 0\n" + - "num: 6 lower_bound: 7 upper_bound: 14 repeats: 0\n" + - "num: 7 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0", + "num: 4 lower_bound: 7 upper_bound: 14 repeats: 0\n" + + "num: 5 lower_bound: 14 upper_bound: 9223372036854775807 repeats: 0", rangeID: tblInfo.Columns[1].ID, idxID: tblInfo.Indices[0].ID, eqCount: 3, From 9eecd96d6b42bb25d9e3fb72ca92bc7e92c1b91b Mon Sep 17 00:00:00 2001 From: Yuanjia Zhang Date: Mon, 27 Jul 2020 15:12:50 +0800 Subject: [PATCH 5/5] address comments --- statistics/histogram.go | 8 ++++---- statistics/selectivity_test.go | 28 ++++++++++++++++++++++++++++ statistics/table.go | 25 ++++++++++++++----------- 3 files changed, 46 insertions(+), 15 deletions(-) diff --git a/statistics/histogram.go b/statistics/histogram.go index e5f56d4acf633..07d040db82f76 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -764,7 +764,7 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum, mo return 0.0, nil } if c.NDV > 0 && c.outOfRange(val) { - return outOfRangeEQSelectivity(c.NDV, modifyCount) * c.TotalRowCount(), nil + return outOfRangeEQSelectivity(c.NDV, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil } if c.CMSketch != nil { count, err := c.CMSketch.queryValue(sc, val) @@ -829,7 +829,7 @@ func (c *Column) GetColumnRowCount(sc *stmtctx.StatementContext, ranges []*range // The interval case. cnt := c.BetweenRowCount(lowVal, highVal) if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) { - cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount) * c.TotalRowCount() + cnt += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(c.TotalRowCount())) * c.TotalRowCount() } // `betweenRowCount` returns count for [l, h) range, we adjust cnt for boudaries here. // Note that, `cnt` does not include null values, we need specially handle cases @@ -891,7 +891,7 @@ func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte, modifyCo } val := types.NewBytesDatum(b) if idx.NDV > 0 && idx.outOfRange(val) { - return outOfRangeEQSelectivity(idx.NDV, modifyCount) * idx.TotalRowCount(), nil + return outOfRangeEQSelectivity(idx.NDV, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount(), nil } if idx.CMSketch != nil { return float64(idx.CMSketch.QueryBytes(b)), nil @@ -943,7 +943,7 @@ func (idx *Index) GetRowCount(sc *stmtctx.StatementContext, indexRanges []*range totalCount += idx.BetweenRowCount(l, r) lowIsNull := bytes.Equal(lb, nullKeyBytes) if (idx.outOfRange(l) && !(isSingleCol && lowIsNull)) || idx.outOfRange(r) { - totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount) * idx.TotalRowCount() + totalCount += outOfRangeEQSelectivity(outOfRangeBetweenRate, modifyCount, int64(idx.TotalRowCount())) * idx.TotalRowCount() } if isSingleCol && lowIsNull { totalCount += float64(idx.NullCount) diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go index a2e32dc391cf0..46f9bbd6096c1 100644 --- a/statistics/selectivity_test.go +++ b/statistics/selectivity_test.go @@ -370,6 +370,34 @@ func getRange(start, end int64) []*ranger.Range { return []*ranger.Range{ran} } +func (s *testStatsSuite) TestOutOfRangeEQEstimation(c *C) { + defer cleanEnv(c, s.store, s.do) + testKit := testkit.NewTestKit(c, s.store) + testKit.MustExec("use test") + testKit.MustExec("drop table if exists t") + testKit.MustExec("create table t(a int)") + for i := 0; i < 1000; i++ { + testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/4)) // 0 ~ 249 + } + testKit.MustExec("analyze table t") + + h := s.do.StatsHandle() + table, err := s.do.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t")) + c.Assert(err, IsNil) + statsTbl := h.GetTableStats(table.Meta()) + sc := &stmtctx.StatementContext{} + col := statsTbl.Columns[table.Meta().Columns[0].ID] + count, err := col.GetColumnRowCount(sc, getRange(250, 250), 0, false) + c.Assert(err, IsNil) + c.Assert(count, Equals, float64(0)) + + for i := 0; i < 8; i++ { + count, err := col.GetColumnRowCount(sc, getRange(250, 250), int64(i+1), false) + c.Assert(err, IsNil) + c.Assert(count, Equals, math.Min(float64(i+1), 4)) // estRows must be less than modifyCnt + } +} + func (s *testStatsSuite) TestEstimationForUnknownValues(c *C) { defer cleanEnv(c, s.store, s.do) testKit := testkit.NewTestKit(c, s.store) diff --git a/statistics/table.go b/statistics/table.go index 63335ce5b091a..518da999b6695 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -392,19 +392,22 @@ func isSingleColIdxNullRange(idx *Index, ran *ranger.Range) bool { return false } -func outOfRangeEQSelectivity(ndv, modifyRows int64) float64 { - // It must be 0 since the histogram contains the whole data if modifyRows is 0. +// outOfRangeEQSelectivity estimates selectivities for out-of-range values. +// It assumes all modifications are insertions and all new-inserted rows are uniformly distributed +// and has the same distribution with analyzed rows, which means each unique value should have the +// same number of rows(Tot/NDV) of it. +func outOfRangeEQSelectivity(ndv, modifyRows, totalRows int64) float64 { if modifyRows == 0 { - return 0 + return 0 // it must be 0 since the histogram contains the whole data } - // We simply set its selectivity to 1/NDV, and the magic number outOfRangeBetweenRate - // is used to avoid wrong selectivity caused by small NDV. if ndv < outOfRangeBetweenRate { - ndv = outOfRangeBetweenRate + ndv = outOfRangeBetweenRate // avoid inaccurate selectivity caused by small NDV } - // TODO: After extracting TopN from histograms, we can minus the TopN fraction here. - // Please see https://github.com/pingcap/tidb/issues/18461 for more details. - return 1 / float64(ndv) + selectivity := 1 / float64(ndv) // TODO: After extracting TopN from histograms, we can minus the TopN fraction here. + if selectivity*float64(totalRows) > float64(modifyRows) { + selectivity = float64(modifyRows) / float64(totalRows) + } + return selectivity } // getEqualCondSelectivity gets the selectivity of the equal conditions. @@ -419,7 +422,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols // When the value is out of range, we could not found this value in the CM Sketch, // so we use heuristic methods to estimate the selectivity. if idx.NDV > 0 && coverAll { - return outOfRangeEQSelectivity(idx.NDV, coll.ModifyCount) + return outOfRangeEQSelectivity(idx.NDV, coll.ModifyCount, int64(idx.TotalRowCount())) } // The equal condition only uses prefix columns of the index. colIDs := coll.Idx2ColumnIDs[idx.ID] @@ -430,7 +433,7 @@ func (coll *HistColl) getEqualCondSelectivity(idx *Index, bytes []byte, usedCols } ndv = mathutil.MaxInt64(ndv, coll.Columns[colID].NDV) } - return outOfRangeEQSelectivity(ndv, coll.ModifyCount) + return outOfRangeEQSelectivity(ndv, coll.ModifyCount, int64(idx.TotalRowCount())) } return float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.TotalRowCount()) }