diff --git a/cmd/explaintest/r/explain_complex_stats.result b/cmd/explaintest/r/explain_complex_stats.result index 541b62b1dc595..c67d87b15af5b 100644 --- a/cmd/explaintest/r/explain_complex_stats.result +++ b/cmd/explaintest/r/explain_complex_stats.result @@ -158,11 +158,11 @@ Projection_5 39.28 root test.st.cm, test.st.p1, test.st.p2, test.st.p3, test.st. └─TableScan_14 160.23 cop table:st, keep order:false explain select dt.id as id, dt.aid as aid, dt.pt as pt, dt.dic as dic, dt.cm as cm, rr.gid as gid, rr.acd as acd, rr.t as t,dt.p1 as p1, dt.p2 as p2, dt.p3 as p3, dt.p4 as p4, dt.p5 as p5, dt.p6_md5 as p6, dt.p7_md5 as p7 from dt dt join rr rr on (rr.pt = 'ios' and rr.t > 1478185592 and dt.aid = rr.aid and dt.dic = rr.dic) where dt.pt = 'ios' and dt.t > 1478185592 and dt.bm = 0 limit 2000; id count task operator info -Projection_9 428.32 root dt.id, dt.aid, dt.pt, dt.dic, dt.cm, rr.gid, rr.acd, rr.t, dt.p1, dt.p2, dt.p3, dt.p4, dt.p5, dt.p6_md5, dt.p7_md5 -└─Limit_12 428.32 root offset:0, count:2000 - └─IndexJoin_18 428.32 root inner join, inner:IndexLookUp_17, outer key:dt.aid, dt.dic, inner key:rr.aid, rr.dic - ├─TableReader_42 428.32 root data:Selection_41 - │ └─Selection_41 428.32 cop eq(dt.pt, "ios"), gt(dt.t, 1478185592), eq(dt.bm, 0) +Projection_9 428.55 root dt.id, dt.aid, dt.pt, dt.dic, dt.cm, rr.gid, rr.acd, rr.t, dt.p1, dt.p2, dt.p3, dt.p4, dt.p5, dt.p6_md5, dt.p7_md5 +└─Limit_12 428.55 root offset:0, count:2000 + └─IndexJoin_18 428.55 root inner join, inner:IndexLookUp_17, outer key:dt.aid, dt.dic, inner key:rr.aid, rr.dic + ├─TableReader_42 428.55 root data:Selection_41 + │ └─Selection_41 428.55 cop eq(dt.pt, "ios"), gt(dt.t, 1478185592), eq(dt.bm, 0) │ └─TableScan_40 2000.00 cop table:dt, range:[0,+inf], keep order:false └─IndexLookUp_17 970.00 root ├─IndexScan_14 1.00 cop table:rr, index:aid, dic, range: decided by [dt.aid dt.dic], keep order:false diff --git a/cmd/explaintest/r/explain_easy_stats.result b/cmd/explaintest/r/explain_easy_stats.result index 5e90853e1903e..3e314cb6c1054 100644 --- a/cmd/explaintest/r/explain_easy_stats.result +++ b/cmd/explaintest/r/explain_easy_stats.result @@ -47,10 +47,10 @@ explain select * from t1 left join t2 on t1.c2 = t2.c1 where t1.c1 > 1; id count task operator info Projection_6 2481.25 root test.t1.c1, test.t1.c2, test.t1.c3, test.t2.c1, test.t2.c2 └─MergeJoin_7 2481.25 root left outer join, left key:test.t1.c2, right key:test.t2.c1 - ├─IndexLookUp_17 1998.00 root - │ ├─Selection_16 1998.00 cop gt(test.t1.c1, 1) + ├─IndexLookUp_17 1999.00 root + │ ├─Selection_16 1999.00 cop gt(test.t1.c1, 1) │ │ └─IndexScan_14 1999.00 cop table:t1, index:c2, range:[NULL,+inf], keep order:true - │ └─TableScan_15 1998.00 cop table:t1, keep order:false + │ └─TableScan_15 1999.00 cop table:t1, keep order:false └─IndexLookUp_21 1985.00 root ├─IndexScan_19 1985.00 cop table:t2, index:c1, range:[NULL,+inf], keep order:true └─TableScan_20 1985.00 cop table:t2, keep order:false diff --git a/plan/cbo_test.go b/plan/cbo_test.go index 077df9b317b0d..6e90a8a5ec863 100644 --- a/plan/cbo_test.go +++ b/plan/cbo_test.go @@ -496,8 +496,8 @@ func (s *testAnalyzeSuite) TestOutdatedAnalyze(c *C) { c.Assert(h.Update(dom.InfoSchema()), IsNil) statistics.RatioOfPseudoEstimate = 10.0 testKit.MustQuery("explain select * from t where a <= 5 and b <= 5").Check(testkit.Rows( - "TableReader_7 28.80 root data:Selection_6", - "└─Selection_6 28.80 cop le(test.t.a, 5), le(test.t.b, 5)", + "TableReader_7 35.91 root data:Selection_6", + "└─Selection_6 35.91 cop le(test.t.a, 5), le(test.t.b, 5)", " └─TableScan_5 80.00 cop table:t, range:[-inf,+inf], keep order:false", )) statistics.RatioOfPseudoEstimate = 0.7 diff --git a/statistics/boostrap.go b/statistics/boostrap.go index 9f80e14a17d4e..170754c9214a7 100644 --- a/statistics/boostrap.go +++ b/statistics/boostrap.go @@ -40,15 +40,15 @@ func initStatsMeta4Chunk(is infoschema.InfoSchema, tables statsCache, iter *chun TableID: tableInfo.ID, HaveTblID: true, Count: row.GetInt64(3), + ModifyCount: row.GetInt64(2), Columns: make(map[int64]*Column, len(tableInfo.Columns)), Indices: make(map[int64]*Index, len(tableInfo.Indices)), colName2Idx: make(map[string]int64, len(tableInfo.Columns)), colName2ID: make(map[string]int64, len(tableInfo.Columns)), } tbl := &Table{ - HistColl: newHistColl, - ModifyCount: row.GetInt64(2), - Version: row.GetUint64(0), + HistColl: newHistColl, + Version: row.GetUint64(0), } tables[tableID] = tbl } diff --git a/statistics/ddl_test.go b/statistics/ddl_test.go index 92317973de2a2..3e060e2cf8c1e 100644 --- a/statistics/ddl_test.go +++ b/statistics/ddl_test.go @@ -126,7 +126,7 @@ func (s *testStatsCacheSuite) TestDDLHistogram(c *C) { c.Assert(count, Equals, float64(2)) count, err = statsTbl.ColumnEqualRowCount(sc, types.NewIntDatum(1), tableInfo.Columns[3].ID) c.Assert(err, IsNil) - c.Assert(count, Equals, float64(0)) + c.Assert(count, Equals, float64(2)) testKit.MustExec("alter table t add column c4 datetime NOT NULL default CURRENT_TIMESTAMP") err = h.HandleDDLEvent(<-h.DDLEventCh()) diff --git a/statistics/dump.go b/statistics/dump.go index a3b08c9499b63..74aa854336cf9 100644 --- a/statistics/dump.go +++ b/statistics/dump.go @@ -122,15 +122,15 @@ func (h *Handle) LoadStatsFromJSON(is infoschema.InfoSchema, jsonTbl *JSONTable) // LoadStatsFromJSONToTable load statistic from JSONTable and return the Table of statistic. func (h *Handle) LoadStatsFromJSONToTable(tableInfo *model.TableInfo, jsonTbl *JSONTable) (*Table, error) { newHistColl := HistColl{ - TableID: tableInfo.ID, - HaveTblID: true, - Count: jsonTbl.Count, - Columns: make(map[int64]*Column, len(jsonTbl.Columns)), - Indices: make(map[int64]*Index, len(jsonTbl.Indices)), + TableID: tableInfo.ID, + HaveTblID: true, + Count: jsonTbl.Count, + ModifyCount: jsonTbl.ModifyCount, + Columns: make(map[int64]*Column, len(jsonTbl.Columns)), + Indices: make(map[int64]*Index, len(jsonTbl.Indices)), } tbl := &Table{ - HistColl: newHistColl, - ModifyCount: jsonTbl.ModifyCount, + HistColl: newHistColl, } for id, jsonIdx := range jsonTbl.Indices { for _, idxInfo := range tableInfo.Indices { diff --git a/statistics/feedback.go b/statistics/feedback.go index e3bbb1e924727..d663c29e945aa 100644 --- a/statistics/feedback.go +++ b/statistics/feedback.go @@ -714,11 +714,11 @@ func (q *QueryFeedback) recalculateExpectCount(h *Handle) error { expected := 0.0 if isIndex { idx := t.Indices[id] - expected, err = idx.getRowCount(sc, ranges) + expected, err = idx.getRowCount(sc, ranges, t.ModifyCount) expected *= idx.getIncreaseFactor(t.Count) } else { c := t.Columns[id] - expected, err = c.getColumnRowCount(sc, ranges) + expected, err = c.getColumnRowCount(sc, ranges, t.ModifyCount) expected *= c.getIncreaseFactor(t.Count) } if err != nil { diff --git a/statistics/histogram.go b/statistics/histogram.go index 70aa356e9ecbb..13195dc15fd89 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -669,6 +669,15 @@ func (hg *Histogram) AvgCountPerValue(totalCount int64) float64 { return float64(totalCount) / curNDV } +func (hg *Histogram) outOfRange(val types.Datum) bool { + if hg.Bounds == nil { + return true + } + len := hg.Bounds.NumRows() + return chunk.Compare(hg.Bounds.GetRow(0), 0, &val) > 0 || + chunk.Compare(hg.Bounds.GetRow(len-1), 0, &val) < 0 +} + // ErrorRate is the error rate of estimate row count by bucket and cm sketch. type ErrorRate struct { ErrorTotal float64 @@ -716,19 +725,22 @@ func (c *Column) equalRowCount(sc *stmtctx.StatementContext, val types.Datum) (f if val.IsNull() { return float64(c.NullCount), nil } - if c.CMSketch != nil { - count, err := c.CMSketch.queryValue(sc, val) - return float64(count), errors.Trace(err) - } // all the values is null if c.Histogram.Bounds == nil { return 0.0, nil } + if c.NDV > 0 && c.outOfRange(val) { + return c.totalRowCount() / (float64(c.NDV)), nil + } + if c.CMSketch != nil { + count, err := c.CMSketch.queryValue(sc, val) + return float64(count), errors.Trace(err) + } return c.Histogram.equalRowCount(val), nil } // getColumnRowCount estimates the row count by a slice of Range. -func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*ranger.Range) (float64, error) { +func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*ranger.Range, modifyCount int64) (float64, error) { var rowCount float64 for _, rg := range ranges { cmp, err := rg.LowVal[0].CompareDatum(sc, &rg.HighVal[0]) @@ -749,6 +761,9 @@ func (c *Column) getColumnRowCount(sc *stmtctx.StatementContext, ranges []*range } // the interval case. cnt := c.betweenRowCount(rg.LowVal[0], rg.HighVal[0]) + if c.outOfRange(rg.LowVal[0]) || c.outOfRange(rg.HighVal[0]) { + cnt += float64(modifyCount) / outOfRangeBetweenRate + } if rg.LowExclude { lowCnt, err := c.equalRowCount(sc, rg.LowVal[0]) if err != nil { @@ -787,13 +802,17 @@ func (idx *Index) String() string { } func (idx *Index) equalRowCount(sc *stmtctx.StatementContext, b []byte) float64 { + val := types.NewBytesDatum(b) + if idx.NDV > 0 && idx.outOfRange(val) { + return idx.totalRowCount() / (float64(idx.NDV)) + } if idx.CMSketch != nil { return float64(idx.CMSketch.QueryBytes(b)) } - return idx.Histogram.equalRowCount(types.NewBytesDatum(b)) + return idx.Histogram.equalRowCount(val) } -func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.Range) (float64, error) { +func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*ranger.Range, modifyCount int64) (float64, error) { totalCount := float64(0) for _, indexRange := range indexRanges { lb, err := codec.EncodeKey(sc, nil, indexRange.LowVal...) @@ -820,6 +839,9 @@ func (idx *Index) getRowCount(sc *stmtctx.StatementContext, indexRanges []*range l := types.NewBytesDatum(lb) r := types.NewBytesDatum(rb) totalCount += idx.betweenRowCount(l, r) + if idx.outOfRange(l) || idx.outOfRange(r) { + totalCount += float64(modifyCount) / outOfRangeBetweenRate + } } if totalCount > idx.totalRowCount() { totalCount = idx.totalRowCount() diff --git a/statistics/selectivity_test.go b/statistics/selectivity_test.go index 5762896b7abf6..797544b222716 100644 --- a/statistics/selectivity_test.go +++ b/statistics/selectivity_test.go @@ -33,6 +33,7 @@ import ( "github.com/pingcap/tidb/statistics" "github.com/pingcap/tidb/types" "github.com/pingcap/tidb/util/codec" + "github.com/pingcap/tidb/util/ranger" "github.com/pingcap/tidb/util/testkit" ) @@ -157,7 +158,7 @@ func (s *testSelectivitySuite) TestSelectivity(c *C) { }, { exprs: "a >= 1 and b > 1 and a < 2", - selectivity: 0.01783264746, + selectivity: 0.01817558299, }, { exprs: "a >= 1 and c > 1 and a < 2", @@ -173,7 +174,7 @@ func (s *testSelectivitySuite) TestSelectivity(c *C) { }, { exprs: "b > 1", - selectivity: 0.96296296296, + selectivity: 0.98148148148, }, { exprs: "a > 1 and b < 2 and c > 3 and d < 4 and e > 5", @@ -238,6 +239,59 @@ func (s *testSelectivitySuite) TestDiscreteDistribution(c *C) { "└─IndexScan_8 0.00 cop table:t, index:a, b, range:[\"tw\" -inf,\"tw\" 0), keep order:false")) } +func getRange(start, end int64) []*ranger.Range { + ran := &ranger.Range{ + LowVal: []types.Datum{types.NewIntDatum(start)}, + HighVal: []types.Datum{types.NewIntDatum(end)}, + } + return []*ranger.Range{ran} +} + +func (s *testSelectivitySuite) TestEstimationForUnknownValues(c *C) { + testKit := testkit.NewTestKit(c, s.store) + testKit.MustExec("use test") + testKit.MustExec("drop table if exists t") + testKit.MustExec("create table t(a int, b int, key idx(a, b))") + testKit.MustExec("analyze table t") + for i := 0; i < 10; i++ { + testKit.MustExec(fmt.Sprintf("insert into t values (%d, %d)", i, i)) + } + h := s.dom.StatsHandle() + h.DumpStatsDeltaToKV(statistics.DumpAll) + testKit.MustExec("analyze table t") + for i := 0; i < 10; i++ { + testKit.MustExec(fmt.Sprintf("insert into t values (%d, %d)", i+10, i+10)) + } + h.DumpStatsDeltaToKV(statistics.DumpAll) + c.Assert(h.Update(s.dom.InfoSchema()), IsNil) + table, err := s.dom.InfoSchema().TableByName(model.NewCIStr("test"), model.NewCIStr("t")) + c.Assert(err, IsNil) + statsTbl := h.GetTableStats(table.Meta()) + + sc := &stmtctx.StatementContext{} + colID := table.Meta().Columns[0].ID + count, err := statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(30, 30)) + c.Assert(err, IsNil) + c.Assert(count, Equals, 2.0) + + count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, 30)) + c.Assert(err, IsNil) + c.Assert(count, Equals, 4.2) + + count, err = statsTbl.GetRowCountByColumnRanges(sc, colID, getRange(9, math.MaxInt64)) + c.Assert(err, IsNil) + c.Assert(count, Equals, 4.2) + + idxID := table.Meta().Indices[0].ID + count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(30, 30)) + c.Assert(err, IsNil) + c.Assert(count, Equals, 0.2) + + count, err = statsTbl.GetRowCountByIndexRanges(sc, idxID, getRange(9, 30)) + c.Assert(err, IsNil) + c.Assert(count, Equals, 2.2) +} + func BenchmarkSelectivity(b *testing.B) { c := &C{} s := &testSelectivitySuite{} diff --git a/statistics/table.go b/statistics/table.go index 65f8deea33c08..88e0d74cf1791 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -38,14 +38,15 @@ const ( pseudoEqualRate = 1000 pseudoLessRate = 3 pseudoBetweenRate = 40 + + outOfRangeBetweenRate = 100 ) // Table represents statistics for a table. type Table struct { HistColl - ModifyCount int64 // Total modify count in a table. - Version uint64 - PKIsHandle bool + Version uint64 + PKIsHandle bool } // HistColl is a collection of histogram. It collects enough information for plan to calculate the selectivity. @@ -58,6 +59,7 @@ type HistColl struct { colName2ID map[string]int64 // map column name to column id Pseudo bool Count int64 + ModifyCount int64 // Total modify count in a table. } func (t *Table) copy() *Table { @@ -70,6 +72,7 @@ func (t *Table) copy() *Table { colName2Idx: make(map[string]int64), colName2ID: make(map[string]int64), Pseudo: t.Pseudo, + ModifyCount: t.ModifyCount, } for id, col := range t.Columns { newHistColl.Columns[id] = col @@ -84,9 +87,8 @@ func (t *Table) copy() *Table { newHistColl.colName2ID[name] = id } nt := &Table{ - HistColl: newHistColl, - ModifyCount: t.ModifyCount, - Version: t.Version, + HistColl: newHistColl, + Version: t.Version, } return nt } @@ -406,7 +408,7 @@ func (coll *HistColl) GetRowCountByIntColumnRanges(sc *stmtctx.StatementContext, return getPseudoRowCountByUnsignedIntRanges(intRanges, float64(coll.Count)), nil } c := coll.Columns[colID] - result, err := c.getColumnRowCount(sc, intRanges) + result, err := c.getColumnRowCount(sc, intRanges, coll.ModifyCount) result *= c.getIncreaseFactor(coll.Count) return result, errors.Trace(err) } @@ -418,7 +420,7 @@ func (coll *HistColl) GetRowCountByColumnRanges(sc *stmtctx.StatementContext, co return getPseudoRowCountByColumnRanges(sc, float64(coll.Count), colRanges, 0) } c := coll.Columns[colID] - result, err := c.getColumnRowCount(sc, colRanges) + result, err := c.getColumnRowCount(sc, colRanges, coll.ModifyCount) result *= c.getIncreaseFactor(coll.Count) return result, errors.Trace(err) } @@ -436,9 +438,9 @@ func (coll *HistColl) GetRowCountByIndexRanges(sc *stmtctx.StatementContext, idx var result float64 var err error if idx.CMSketch != nil && idx.statsVer == version1 { - result, err = coll.getIndexRowCount(sc, idx, indexRanges) + result, err = coll.getIndexRowCount(sc, idx, indexRanges, coll.ModifyCount) } else { - result, err = idx.getRowCount(sc, indexRanges) + result, err = idx.getRowCount(sc, indexRanges, coll.ModifyCount) } result *= idx.getIncreaseFactor(coll.Count) return result, errors.Trace(err) @@ -465,13 +467,13 @@ func getOrdinalOfRangeCond(sc *stmtctx.StatementContext, ran *ranger.Range) int return len(ran.LowVal) } -func (coll *HistColl) getIndexRowCount(sc *stmtctx.StatementContext, idx *Index, indexRanges []*ranger.Range) (float64, error) { +func (coll *HistColl) getIndexRowCount(sc *stmtctx.StatementContext, idx *Index, indexRanges []*ranger.Range, modifyCount int64) (float64, error) { totalCount := float64(0) for _, ran := range indexRanges { rangePosition := getOrdinalOfRangeCond(sc, ran) // first one is range, just use the previous way to estimate if rangePosition == 0 { - count, err := idx.getRowCount(sc, []*ranger.Range{ran}) + count, err := idx.getRowCount(sc, []*ranger.Range{ran}, modifyCount) if err != nil { return 0, errors.Trace(err) } @@ -484,7 +486,20 @@ func (coll *HistColl) getIndexRowCount(sc *stmtctx.StatementContext, idx *Index, if err != nil { return 0, errors.Trace(err) } - selectivity = float64(idx.CMSketch.QueryBytes(bytes)) / float64(coll.Count) + val := types.NewBytesDatum(bytes) + if idx.outOfRange(val) { + // When the value is out of range, we could not found this value in the CM Sketch, + // so we use heuristic methods to estimate the selectivity. + if idx.NDV > 0 && len(ran.LowVal) == len(idx.Info.Columns) && rangePosition == len(ran.LowVal) { + // for equality queries + selectivity = 1.0 / float64(idx.NDV) + } else { + // for range queries + selectivity = float64(modifyCount) / outOfRangeBetweenRate / idx.totalRowCount() + } + } else { + selectivity = float64(idx.CMSketch.QueryBytes(bytes)) / float64(idx.totalRowCount()) + } // use histogram to estimate the range condition if rangePosition != len(ran.LowVal) { rang := ranger.Range{ @@ -505,9 +520,9 @@ func (coll *HistColl) getIndexRowCount(sc *stmtctx.StatementContext, idx *Index, if err != nil { return 0, errors.Trace(err) } - selectivity = selectivity * count / float64(coll.Count) + selectivity = selectivity * count / float64(idx.totalRowCount()) } - totalCount += selectivity * float64(coll.Count) + totalCount += selectivity * float64(idx.totalRowCount()) } if totalCount > idx.totalRowCount() { totalCount = idx.totalRowCount() diff --git a/statistics/update_test.go b/statistics/update_test.go index 8a8d7ea0c3d57..79c4a07b49db3 100644 --- a/statistics/update_test.go +++ b/statistics/update_test.go @@ -596,7 +596,7 @@ func (s *testStatsUpdateSuite) TestQueryFeedback(c *C) { } // Feedback from limit executor may not be accurate. - testKit.MustQuery("select * from t where t.a <= 2 limit 1") + testKit.MustQuery("select * from t where t.a <= 5 limit 1") h.DumpStatsDeltaToKV(statistics.DumpAll) feedback := h.GetQueryFeedback() c.Assert(len(feedback), Equals, 0)