diff --git a/planner/cardinality/BUILD.bazel b/planner/cardinality/BUILD.bazel index 218d57ac26e1d..b0442e6c9e594 100644 --- a/planner/cardinality/BUILD.bazel +++ b/planner/cardinality/BUILD.bazel @@ -59,7 +59,7 @@ go_test( data = glob(["testdata/**"]), embed = [":cardinality"], flaky = True, - shard_count = 31, + shard_count = 32, deps = [ "//config", "//domain", diff --git a/planner/cardinality/row_count_column.go b/planner/cardinality/row_count_column.go index 137a3950724d0..32fbab9d922f8 100644 --- a/planner/cardinality/row_count_column.go +++ b/planner/cardinality/row_count_column.go @@ -282,9 +282,13 @@ func GetColumnRowCount(sctx sessionctx.Context, c *statistics.Column, ranges []* // If the current table row count has changed, we should scale the row count accordingly. cnt *= c.GetIncreaseFactor(realtimeRowCount) + histNDV := c.NDV + if c.StatsVer == statistics.Version2 { + histNDV = histNDV - int64(c.TopN.Num()) + } // handling the out-of-range part if (c.OutOfRange(lowVal) && !lowVal.IsNull()) || c.OutOfRange(highVal) { - cnt += c.Histogram.OutOfRangeRowCount(sctx, &lowVal, &highVal, modifyCount) + cnt += c.Histogram.OutOfRangeRowCount(sctx, &lowVal, &highVal, modifyCount, histNDV) } if debugTrace { diff --git a/planner/cardinality/row_count_index.go b/planner/cardinality/row_count_index.go index 2aa3045bad496..3b463f51fd1c8 100644 --- a/planner/cardinality/row_count_index.go +++ b/planner/cardinality/row_count_index.go @@ -320,9 +320,13 @@ func getIndexRowCountForStatsV2(sctx sessionctx.Context, idx *statistics.Index, // If the current table row count has changed, we should scale the row count accordingly. count *= idx.GetIncreaseFactor(realtimeRowCount) + histNDV := idx.NDV + if idx.StatsVer == statistics.Version2 { + histNDV = histNDV - int64(idx.TopN.Num()) + } // handling the out-of-range part if (outOfRangeOnIndex(idx, l) && !(isSingleCol && lowIsNull)) || outOfRangeOnIndex(idx, r) { - count += idx.Histogram.OutOfRangeRowCount(sctx, &l, &r, modifyCount) + count += idx.Histogram.OutOfRangeRowCount(sctx, &l, &r, modifyCount, histNDV) } if debugTrace { diff --git a/planner/cardinality/row_count_test.go b/planner/cardinality/row_count_test.go index a3d4745c80769..6cfeef7818d39 100644 --- a/planner/cardinality/row_count_test.go +++ b/planner/cardinality/row_count_test.go @@ -33,7 +33,7 @@ func TestPseudoTable(t *testing.T) { State: model.StatePublic, } ti.Columns = append(ti.Columns, colInfo) - tbl := statistics.PseudoTable(ti) + tbl := statistics.PseudoTable(ti, false) require.Len(t, tbl.Columns, 1) require.Greater(t, tbl.RealtimeCount, int64(0)) sctx := mock.NewContext() @@ -50,7 +50,7 @@ func TestPseudoTable(t *testing.T) { Hidden: true, State: model.StatePublic, }) - tbl = statistics.PseudoTable(ti) + tbl = statistics.PseudoTable(ti, false) // We added a hidden column. The pseudo table still only have one column. require.Equal(t, len(tbl.Columns), 1) } diff --git a/planner/cardinality/selectivity_test.go b/planner/cardinality/selectivity_test.go index 77f34a9a355aa..0f897386553f4 100644 --- a/planner/cardinality/selectivity_test.go +++ b/planner/cardinality/selectivity_test.go @@ -173,11 +173,15 @@ func TestOutOfRangeEstimationAfterDelete(t *testing.T) { testKit.MustExec("drop table if exists t") testKit.MustExec("create table t(a int unsigned)") require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh())) + // [300, 900) + // 5 rows for each value, 3000 rows in total. for i := 0; i < 3000; i++ { - testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/5+300)) // [300, 900) + testKit.MustExec(fmt.Sprintf("insert into t values (%v)", i/5+300)) } require.Nil(t, h.DumpStatsDeltaToKV(handle.DumpAll)) testKit.MustExec("analyze table t with 1 samplerate, 0 topn") + // Data in [300, 500), 1000 rows in total, are deleted. + // 2000 rows left. testKit.MustExec("delete from t where a < 500") require.Nil(t, h.DumpStatsDeltaToKV(handle.DumpAll)) require.Nil(t, h.Update(dom.InfoSchema())) @@ -193,9 +197,15 @@ func TestOutOfRangeEstimationAfterDelete(t *testing.T) { for i := range input { testdata.OnRecord(func() { output[i].SQL = input[i] - output[i].Result = testdata.ConvertRowsToStrings(testKit.MustQuery(input[i]).Rows()) }) - testKit.MustQuery(input[i]).Check(testkit.Rows(output[i].Result...)) + if strings.HasPrefix(input[i], "explain") { + testdata.OnRecord(func() { + output[i].Result = testdata.ConvertRowsToStrings(testKit.MustQuery(input[i]).Rows()) + }) + testKit.MustQuery(input[i]).Check(testkit.Rows(output[i].Result...)) + } else { + testKit.MustExec(input[i]) + } } } @@ -1321,3 +1331,71 @@ func TestCrossValidationSelectivity(t *testing.T) { "└─Selection 0.00 cop[tikv] gt(test.t.c, 1000)", " └─TableRangeScan 2.00 cop[tikv] table:t range:(1 0,1 1000), keep order:false")) } + +func TestIgnoreRealtimeStats(t *testing.T) { + store, dom := testkit.CreateMockStoreAndDomain(t) + testKit := testkit.NewTestKit(t, store) + testKit.MustExec("use test") + testKit.MustExec("drop table if exists t") + testKit.MustExec("create table t(a int, b int, index ib(b))") + h := dom.StatsHandle() + require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh())) + + // 1. Insert 11 rows of data without ANALYZE. + testKit.MustExec("insert into t values(1,1),(1,2),(1,3),(1,4),(1,5),(2,1),(2,2),(2,3),(2,4),(2,5),(3,1)") + require.Nil(t, h.DumpStatsDeltaToKV(handle.DumpAll)) + require.Nil(t, h.Update(dom.InfoSchema())) + + // 1-1. use real-time stats. + // From the real-time stats, we are able to know the total count is 11. + testKit.MustExec("set @@tidb_opt_objective = 'moderate'") + testKit.MustQuery("explain select * from t where a = 1 and b > 2").Check(testkit.Rows( + "TableReader_7 0.00 root data:Selection_6", + "└─Selection_6 0.00 cop[tikv] eq(test.t.a, 1), gt(test.t.b, 2)", + " └─TableFullScan_5 11.00 cop[tikv] table:t keep order:false, stats:pseudo", + )) + + // 1-2. ignore real-time stats. + // Use pseudo stats table. The total row count is 10000. + testKit.MustExec("set @@tidb_opt_objective = 'determinate'") + testKit.MustQuery("explain select * from t where a = 1 and b > 2").Check(testkit.Rows( + "TableReader_7 3.33 root data:Selection_6", + "└─Selection_6 3.33 cop[tikv] eq(test.t.a, 1), gt(test.t.b, 2)", + " └─TableFullScan_5 10000.00 cop[tikv] table:t keep order:false, stats:pseudo", + )) + + // 2. After ANALYZE. + testKit.MustExec("analyze table t with 1 samplerate") + require.Nil(t, h.Update(dom.InfoSchema())) + + // The execution plans are the same no matter we ignore the real-time stats or not. + analyzedPlan := []string{ + "TableReader_7 2.73 root data:Selection_6", + "└─Selection_6 2.73 cop[tikv] eq(test.t.a, 1), gt(test.t.b, 2)", + " └─TableFullScan_5 11.00 cop[tikv] table:t keep order:false", + } + testKit.MustExec("set @@tidb_opt_objective = 'moderate'") + testKit.MustQuery("explain select * from t where a = 1 and b > 2").Check(testkit.Rows(analyzedPlan...)) + testKit.MustExec("set @@tidb_opt_objective = 'determinate'") + testKit.MustQuery("explain select * from t where a = 1 and b > 2").Check(testkit.Rows(analyzedPlan...)) + + // 3. Insert another 4 rows of data. + testKit.MustExec("insert into t values(3,2),(3,3),(3,4),(3,5)") + require.Nil(t, h.DumpStatsDeltaToKV(handle.DumpAll)) + require.Nil(t, h.Update(dom.InfoSchema())) + + // 3-1. use real-time stats. + // From the real-time stats, we are able to know the total count is 15. + // Selectivity is not changed: 15 * (2.73 / 11) = 3.72 + testKit.MustExec("set @@tidb_opt_objective = 'moderate'") + testKit.MustQuery("explain select * from t where a = 1 and b > 2").Check(testkit.Rows( + "TableReader_7 3.72 root data:Selection_6", + "└─Selection_6 3.72 cop[tikv] eq(test.t.a, 1), gt(test.t.b, 2)", + " └─TableFullScan_5 15.00 cop[tikv] table:t keep order:false", + )) + + // 3-2. ignore real-time stats. + // The execution plan is the same as case 2. + testKit.MustExec("set @@tidb_opt_objective = 'determinate'") + testKit.MustQuery("explain select * from t where a = 1 and b > 2").Check(testkit.Rows(analyzedPlan...)) +} diff --git a/planner/cardinality/testdata/cardinality_suite_in.json b/planner/cardinality/testdata/cardinality_suite_in.json index 3673a1b93e360..9560538250e76 100644 --- a/planner/cardinality/testdata/cardinality_suite_in.json +++ b/planner/cardinality/testdata/cardinality_suite_in.json @@ -111,7 +111,12 @@ "explain format = 'brief' select * from t where a > 900 and a < 1000", "explain format = 'brief' select * from t where a > 900 and a < 1100", "explain format = 'brief' select * from t where a > 200 and a < 300", - "explain format = 'brief' select * from t where a > 100 and a < 300" + "explain format = 'brief' select * from t where a > 100 and a < 300", + "set @@tidb_opt_objective = 'determinate'", + "explain format = 'brief' select * from t where a <= 300", + "explain format = 'brief' select * from t where a <= 500", + "explain format = 'brief' select * from t where a > 900", + "explain format = 'brief' select * from t where a <= 900" ] }, { diff --git a/planner/cardinality/testdata/cardinality_suite_out.json b/planner/cardinality/testdata/cardinality_suite_out.json index cf05b47517f17..7568b66ea8ca4 100644 --- a/planner/cardinality/testdata/cardinality_suite_out.json +++ b/planner/cardinality/testdata/cardinality_suite_out.json @@ -234,6 +234,42 @@ "└─Selection 832.49 cop[tikv] gt(test.t.a, 100), lt(test.t.a, 300)", " └─TableFullScan 2000.00 cop[tikv] table:t keep order:false" ] + }, + { + "SQL": "set @@tidb_opt_objective = 'determinate'", + "Result": null + }, + { + "SQL": "explain format = 'brief' select * from t where a <= 300", + "Result": [ + "TableReader 10.00 root data:Selection", + "└─Selection 10.00 cop[tikv] le(test.t.a, 300)", + " └─TableFullScan 3000.00 cop[tikv] table:t keep order:false" + ] + }, + { + "SQL": "explain format = 'brief' select * from t where a <= 500", + "Result": [ + "TableReader 1010.00 root data:Selection", + "└─Selection 1010.00 cop[tikv] le(test.t.a, 500)", + " └─TableFullScan 3000.00 cop[tikv] table:t keep order:false" + ] + }, + { + "SQL": "explain format = 'brief' select * from t where a > 900", + "Result": [ + "TableReader 5.00 root data:Selection", + "└─Selection 5.00 cop[tikv] gt(test.t.a, 900)", + " └─TableFullScan 3000.00 cop[tikv] table:t keep order:false" + ] + }, + { + "SQL": "explain format = 'brief' select * from t where a <= 900", + "Result": [ + "TableReader 3000.00 root data:Selection", + "└─Selection 3000.00 cop[tikv] le(test.t.a, 900)", + " └─TableFullScan 3000.00 cop[tikv] table:t keep order:false" + ] } ] }, diff --git a/planner/core/logical_plan_builder.go b/planner/core/logical_plan_builder.go index fa633e1357b7b..67a8595a49d0d 100644 --- a/planner/core/logical_plan_builder.go +++ b/planner/core/logical_plan_builder.go @@ -4695,6 +4695,7 @@ func (ds *DataSource) AddExtraPhysTblIDColumn() *expression.Column { // 1. tidb-server started and statistics handle has not been initialized. // 2. table row count from statistics is zero. // 3. statistics is outdated. +// Note: please also update getLatestVersionFromStatsTable() when logic in this function changes. func getStatsTable(ctx sessionctx.Context, tblInfo *model.TableInfo, pid int64) *statistics.Table { statsHandle := domain.GetDomain(ctx).StatsHandle() var usePartitionStats, countIs0, pseudoStatsForUninitialized, pseudoStatsForOutdated bool @@ -4717,7 +4718,7 @@ func getStatsTable(ctx sessionctx.Context, tblInfo *model.TableInfo, pid int64) } // 1. tidb-server started and statistics handle has not been initialized. if statsHandle == nil { - return statistics.PseudoTable(tblInfo) + return statistics.PseudoTable(tblInfo, false) } if pid == tblInfo.ID || ctx.GetSessionVars().StmtCtx.UseDynamicPartitionPrune() { @@ -4727,11 +4728,35 @@ func getStatsTable(ctx sessionctx.Context, tblInfo *model.TableInfo, pid int64) statsTbl = statsHandle.GetPartitionStats(tblInfo, pid, cache.WithTableStatsByQuery()) } + allowPseudoTblTriggerLoading := false + // In OptObjectiveDeterminate mode, we need to ignore the real-time stats. + // To achieve this, we copy the statsTbl and reset the real-time stats fields (set ModifyCount to 0 and set + // RealtimeCount to the row count from the ANALYZE, which is fetched from loaded stats in GetAnalyzeRowCount()). + if ctx.GetSessionVars().GetOptObjective() == variable.OptObjectiveDeterminate { + analyzeCount := max(int64(statsTbl.GetAnalyzeRowCount()), 0) + // If the two fields are already the values we want, we don't need to modify it, and also we don't need to copy. + if statsTbl.RealtimeCount != analyzeCount || statsTbl.ModifyCount != 0 { + // Here is a case that we need specially care about: + // The original stats table from the stats cache is not a pseudo table, but the analyze row count is 0 (probably + // because of no col/idx stats are loaded), which will makes it a pseudo table according to the rule 2 below. + // Normally, a pseudo table won't trigger stats loading since we assume it means "no stats available", but + // in such case, we need it able to trigger stats loading. + // That's why we use the special allowPseudoTblTriggerLoading flag here. + if !statsTbl.Pseudo && statsTbl.RealtimeCount > 0 && analyzeCount == 0 { + allowPseudoTblTriggerLoading = true + } + // Copy it so we can modify the ModifyCount and the RealtimeCount safely. + statsTbl = statsTbl.ShallowCopy() + statsTbl.RealtimeCount = analyzeCount + statsTbl.ModifyCount = 0 + } + } + // 2. table row count from statistics is zero. if statsTbl.RealtimeCount == 0 { countIs0 = true core_metrics.PseudoEstimationNotAvailable.Inc() - return statistics.PseudoTable(tblInfo) + return statistics.PseudoTable(tblInfo, allowPseudoTblTriggerLoading) } // 3. statistics is uninitialized or outdated. @@ -4751,6 +4776,44 @@ func getStatsTable(ctx sessionctx.Context, tblInfo *model.TableInfo, pid int64) return statsTbl } +// getLatestVersionFromStatsTable gets statistics information for a table specified by "tableID", and get the max +// LastUpdateVersion among all Columns and Indices in it. +// Its overall logic is quite similar to getStatsTable(). During plan cache matching, only the latest version is needed. +// In such case, compared to getStatsTable(), this function can save some copies, memory allocations and unnecessary +// checks. Also, this function won't trigger metrics changes. +func getLatestVersionFromStatsTable(ctx sessionctx.Context, tblInfo *model.TableInfo, pid int64) (version uint64) { + statsHandle := domain.GetDomain(ctx).StatsHandle() + // 1. tidb-server started and statistics handle has not been initialized. Pseudo stats table. + if statsHandle == nil { + return 0 + } + + var statsTbl *statistics.Table + if pid == tblInfo.ID || ctx.GetSessionVars().StmtCtx.UseDynamicPartitionPrune() { + statsTbl = statsHandle.GetTableStats(tblInfo, cache.WithTableStatsByQuery()) + } else { + statsTbl = statsHandle.GetPartitionStats(tblInfo, pid, cache.WithTableStatsByQuery()) + } + + // 2. Table row count from statistics is zero. Pseudo stats table. + realtimeRowCount := statsTbl.RealtimeCount + if ctx.GetSessionVars().GetOptObjective() == variable.OptObjectiveDeterminate { + realtimeRowCount = max(int64(statsTbl.GetAnalyzeRowCount()), 0) + } + if realtimeRowCount == 0 { + return 0 + } + + // 3. Not pseudo stats table. Return the max LastUpdateVersion among all Columns and Indices + for _, col := range statsTbl.Columns { + version = max(version, col.LastUpdateVersion) + } + for _, idx := range statsTbl.Indices { + version = max(version, idx.LastUpdateVersion) + } + return version +} + func (b *PlanBuilder) tryBuildCTE(ctx context.Context, tn *ast.TableName, asName *model.CIStr) (LogicalPlan, error) { for i := len(b.outerCTEs) - 1; i >= 0; i-- { cte := b.outerCTEs[i] diff --git a/planner/core/plan_cache_utils.go b/planner/core/plan_cache_utils.go index e5e83befbcd77..dcb6fbda5a124 100644 --- a/planner/core/plan_cache_utils.go +++ b/planner/core/plan_cache_utils.go @@ -36,7 +36,6 @@ import ( "github.com/pingcap/tidb/sessionctx" "github.com/pingcap/tidb/sessionctx/stmtctx" "github.com/pingcap/tidb/sessionctx/variable" - "github.com/pingcap/tidb/statistics" "github.com/pingcap/tidb/types" driver "github.com/pingcap/tidb/types/parser_driver" "github.com/pingcap/tidb/util/codec" @@ -478,24 +477,6 @@ func GetPreparedStmt(stmt *ast.ExecuteStmt, vars *variable.SessionVars) (*PlanCa return nil, ErrStmtNotFound } -func tableStatsVersionForPlanCache(tStats *statistics.Table) (tableStatsVer uint64) { - if tStats == nil { - return 0 - } - // use the max version of all columns and indices as the table stats version - for _, col := range tStats.Columns { - if col.LastUpdateVersion > tableStatsVer { - tableStatsVer = col.LastUpdateVersion - } - } - for _, idx := range tStats.Indices { - if idx.LastUpdateVersion > tableStatsVer { - tableStatsVer = idx.LastUpdateVersion - } - } - return tableStatsVer -} - // GetMatchOpts get options to fetch plan or generate new plan // we can add more options here func GetMatchOpts(sctx sessionctx.Context, is infoschema.InfoSchema, stmt *PlanCacheStmt, params []expression.Expression) (*utilpc.PlanCacheMatchOpts, error) { @@ -508,8 +489,7 @@ func GetMatchOpts(sctx sessionctx.Context, is infoschema.InfoSchema, stmt *PlanC if err != nil { // CTE in this case continue } - tStats := getStatsTable(sctx, t.Meta(), t.Meta().ID) - statsVerHash += tableStatsVersionForPlanCache(tStats) // use '+' as the hash function for simplicity + statsVerHash += getLatestVersionFromStatsTable(sctx, t.Meta(), t.Meta().ID) // use '+' as the hash function for simplicity } for _, node := range stmt.QueryFeatures.limits { diff --git a/planner/core/planbuilder.go b/planner/core/planbuilder.go index d1eaa5106671d..7dd6dcd16dc8a 100644 --- a/planner/core/planbuilder.go +++ b/planner/core/planbuilder.go @@ -1807,10 +1807,10 @@ func (b *PlanBuilder) buildPhysicalIndexLookUpReader(_ context.Context, dbName m Ranges: ranger.FullRange(), physicalTableID: physicalID, isPartition: isPartition, - tblColHists: &(statistics.PseudoTable(tblInfo)).HistColl, + tblColHists: &(statistics.PseudoTable(tblInfo, false)).HistColl, }.Init(b.ctx, b.getSelectOffset()) // There is no alternative plan choices, so just use pseudo stats to avoid panic. - is.SetStats(&property.StatsInfo{HistColl: &(statistics.PseudoTable(tblInfo)).HistColl}) + is.SetStats(&property.StatsInfo{HistColl: &(statistics.PseudoTable(tblInfo, false)).HistColl}) if hasCommonCols { for _, c := range commonInfos { is.Columns = append(is.Columns, c.ColumnInfo) @@ -1826,7 +1826,7 @@ func (b *PlanBuilder) buildPhysicalIndexLookUpReader(_ context.Context, dbName m DBName: dbName, physicalTableID: physicalID, isPartition: isPartition, - tblColHists: &(statistics.PseudoTable(tblInfo)).HistColl, + tblColHists: &(statistics.PseudoTable(tblInfo, false)).HistColl, }.Init(b.ctx, b.getSelectOffset()) ts.SetSchema(idxColSchema) ts.Columns = ExpandVirtualColumn(ts.Columns, ts.schema, ts.Table.Columns) diff --git a/planner/core/stats.go b/planner/core/stats.go index 042a7c52cff7f..19da203c027f7 100644 --- a/planner/core/stats.go +++ b/planner/core/stats.go @@ -68,7 +68,7 @@ func (p *LogicalMemTable) DeriveStats(_ []*property.StatsInfo, selfSchema *expre if p.StatsInfo() != nil { return p.StatsInfo(), nil } - statsTable := statistics.PseudoTable(p.TableInfo) + statsTable := statistics.PseudoTable(p.TableInfo, false) stats := &property.StatsInfo{ RowCount: float64(statsTable.RealtimeCount), ColNDVs: make(map[int64]float64, len(p.TableInfo.Columns)), diff --git a/server/testdata/optimizer_suite_out.json b/server/testdata/optimizer_suite_out.json index 932a3a4886fbb..4ec5fac8998a5 100644 --- a/server/testdata/optimizer_suite_out.json +++ b/server/testdata/optimizer_suite_out.json @@ -26,56 +26,6 @@ "KindInt64 127" ] }, - { - "github.com/pingcap/tidb/planner/core.getStatsTable": { - "CountIsZero": false, - "HandleIsNil": false, - "InputPhysicalID": 100, - "Outdated": false, - "StatsTblInfo": { - "Columns": [ - { - "CMSketchInfo": null, - "Correlation": 0, - "HistogramSize": 0, - "ID": 1, - "LastUpdateVersion": 440930000000000000, - "LoadingStatus": "unInitialized", - "NDV": 0, - "Name": "col1", - "NullCount": 0, - "StatsVer": 0, - "TopNSize": -1, - "TotColSize": 0 - } - ], - "Count": 10000, - "Indexes": [ - { - "CMSketchInfo": null, - "Correlation": 0, - "HistogramSize": 0, - "ID": 1, - "LastUpdateVersion": 440930000000000000, - "LoadingStatus": "unInitialized", - "NDV": 0, - "Name": "i", - "NullCount": 0, - "StatsVer": 0, - "TopNSize": -1, - "TotColSize": 0 - } - ], - "ModifyCount": 0, - "PhysicalID": 100, - "Version": 440930000000000000 - }, - "TableName": "t", - "TblInfoID": 100, - "Uninitialized": true, - "UsePartitionStats": false - } - }, { "github.com/pingcap/tidb/planner.Optimize": [ { @@ -502,56 +452,6 @@ "KindInt64 1" ] }, - { - "github.com/pingcap/tidb/planner/core.getStatsTable": { - "CountIsZero": false, - "HandleIsNil": false, - "InputPhysicalID": 100, - "Outdated": false, - "StatsTblInfo": { - "Columns": [ - { - "CMSketchInfo": null, - "Correlation": 0, - "HistogramSize": 0, - "ID": 1, - "LastUpdateVersion": 440930000000000000, - "LoadingStatus": "unInitialized", - "NDV": 0, - "Name": "col1", - "NullCount": 0, - "StatsVer": 0, - "TopNSize": -1, - "TotColSize": 0 - } - ], - "Count": 10000, - "Indexes": [ - { - "CMSketchInfo": null, - "Correlation": 0, - "HistogramSize": 0, - "ID": 1, - "LastUpdateVersion": 440930000000000000, - "LoadingStatus": "unInitialized", - "NDV": 0, - "Name": "i", - "NullCount": 0, - "StatsVer": 0, - "TopNSize": -1, - "TotColSize": 0 - } - ], - "ModifyCount": 0, - "PhysicalID": 100, - "Version": 440930000000000000 - }, - "TableName": "t", - "TblInfoID": 100, - "Uninitialized": true, - "UsePartitionStats": false - } - }, { "github.com/pingcap/tidb/planner.Optimize": [ { diff --git a/sessionctx/variable/session.go b/sessionctx/variable/session.go index f14cb85b4f919..9d4fcabeceb8b 100644 --- a/sessionctx/variable/session.go +++ b/sessionctx/variable/session.go @@ -1537,6 +1537,12 @@ type SessionVars struct { // SessionAlias is the identifier of the session SessionAlias string + + // OptObjective indicates whether the optimizer should be more stable, predictable or more aggressive. + // For now, the possible values and corresponding behaviors are: + // OptObjectiveModerate: The default value. The optimizer considers the real-time stats (real-time row count, modify count). + // OptObjectiveDeterminate: The optimizer doesn't consider the real-time stats. + OptObjective string } // GetOptimizerFixControlMap returns the specified value of the optimizer fix control. @@ -3621,3 +3627,17 @@ func RuntimeFilterModeStringToMode(name string) (RuntimeFilterMode, bool) { return -1, false } } + +const ( + // OptObjectiveModerate is a possible value and the default value for TiDBOptObjective. + // Please see comments of SessionVars.OptObjective for details. + OptObjectiveModerate string = "moderate" + // OptObjectiveDeterminate is a possible value for TiDBOptObjective. + OptObjectiveDeterminate = "determinate" +) + +// GetOptObjective return the session variable "tidb_opt_objective". +// Please see comments of SessionVars.OptObjective for details. +func (s *SessionVars) GetOptObjective() string { + return s.OptObjective +} diff --git a/sessionctx/variable/sysvar.go b/sessionctx/variable/sysvar.go index 912786848a8dc..4d09634ede2c9 100644 --- a/sessionctx/variable/sysvar.go +++ b/sessionctx/variable/sysvar.go @@ -2796,6 +2796,17 @@ var defaultSysVars = []*SysVar{ }, GetSession: func(vars *SessionVars) (string, error) { return vars.SessionAlias, nil }}, + { + Scope: ScopeGlobal | ScopeSession, + Name: TiDBOptObjective, + Value: DefTiDBOptObjective, + Type: TypeEnum, + PossibleValues: []string{OptObjectiveModerate, OptObjectiveDeterminate}, + SetSession: func(vars *SessionVars, s string) error { + vars.OptObjective = s + return nil + }, + }, } func setTiFlashComputeDispatchPolicy(s *SessionVars, val string) error { diff --git a/sessionctx/variable/tidb_vars.go b/sessionctx/variable/tidb_vars.go index 31a8a80d1f607..c227a2050e3c4 100644 --- a/sessionctx/variable/tidb_vars.go +++ b/sessionctx/variable/tidb_vars.go @@ -904,6 +904,10 @@ const ( // TiDBEnableCheckConstraint indicates whether to enable check constraint feature. TiDBEnableCheckConstraint = "tidb_enable_check_constraint" + + // TiDBOptObjective indicates whether the optimizer should be more stable, predictable or more aggressive. + // Please see comments of SessionVars.OptObjective for details. + TiDBOptObjective = "tidb_opt_objective" ) // TiDB vars that have only global scope @@ -1389,6 +1393,7 @@ const ( DefTiDBLockUnchangedKeys = true DefTiDBEnableCheckConstraint = false DefTiDBSkipMissingPartitionStats = true + DefTiDBOptObjective = OptObjectiveModerate ) // Process global variables. diff --git a/statistics/column.go b/statistics/column.go index c1eb50cd49237..cd4025546a6e9 100644 --- a/statistics/column.go +++ b/statistics/column.go @@ -38,6 +38,9 @@ type Column struct { // StatsLoadedStatus indicates the status of column statistics StatsLoadedStatus + // PhysicalID is the physical table id, + // or it could possibly be -1, which means "stats not available". + // The -1 case could happen in a pseudo stats table, and in this case, this stats should not trigger stats loading. PhysicalID int64 Flag int64 StatsVer int64 // StatsVer is the version of the current stats, used to maintain compatibility @@ -160,23 +163,25 @@ func (c *Column) IsInvalid( debugtrace.LeaveContextCommon(sctx) }() } - if collPseudo { - inValidForCollPseudo = true - return true - } if sctx != nil { stmtctx := sctx.GetSessionVars().StmtCtx - if c.IsLoadNeeded() && stmtctx != nil { + if (!c.IsStatsInitialized() || c.IsLoadNeeded()) && stmtctx != nil { if stmtctx.StatsLoad.Timeout > 0 { logutil.BgLogger().Warn("Hist for column should already be loaded as sync but not found.", zap.String(strconv.FormatInt(c.Info.ID, 10), c.Info.Name.O)) } // In some tests, the c.Info is not set, so we add this check here. - if c.Info != nil { + // When we are using stats from PseudoTable(), the table ID will possibly be -1. + // In this case, we don't trigger stats loading. + if c.Info != nil && c.PhysicalID > 0 { HistogramNeededItems.insert(model.TableItemID{TableID: c.PhysicalID, ID: c.Info.ID, IsIndex: false}) } } } + if collPseudo { + inValidForCollPseudo = true + return true + } // In some cases, some statistics in column would be evicted // For example: the cmsketch of the column might be evicted while the histogram and the topn are still exists // In this case, we will think this column as valid due to we can still use the rest of the statistics to do optimize. diff --git a/statistics/handle/handle.go b/statistics/handle/handle.go index df5d9eeb34b29..ff02216ee78d0 100644 --- a/statistics/handle/handle.go +++ b/statistics/handle/handle.go @@ -735,7 +735,7 @@ func (h *Handle) GetTableStats(tblInfo *model.TableInfo, opts ...cache.TableStat func (h *Handle) GetPartitionStats(tblInfo *model.TableInfo, pid int64, opts ...cache.TableStatsOpt) *statistics.Table { var tbl *statistics.Table if h == nil { - tbl = statistics.PseudoTable(tblInfo) + tbl = statistics.PseudoTable(tblInfo, false) tbl.PhysicalID = pid return tbl } @@ -751,7 +751,7 @@ func (h *Handle) GetPartitionStats(tblInfo *model.TableInfo, pid int64, opts ... tbl, ok = statsCache.GetFromInternal(pid) } if !ok { - tbl = statistics.PseudoTable(tblInfo) + tbl = statistics.PseudoTable(tblInfo, false) tbl.PhysicalID = pid if tblInfo.GetPartitionInfo() == nil || h.statsCacheLen() < 64 { h.updateStatsCache(statsCache, []*statistics.Table{tbl}, nil) diff --git a/statistics/handle/update.go b/statistics/handle/update.go index 54cf4e562080c..e505058014d1a 100644 --- a/statistics/handle/update.go +++ b/statistics/handle/update.go @@ -615,7 +615,7 @@ func NeedAnalyzeTable(tbl *statistics.Table, _ time.Duration, autoAnalyzeRatio f } // No need to analyze it. tblCnt := float64(tbl.RealtimeCount) - if histCnt := tbl.GetColRowCount(); histCnt > 0 { + if histCnt := tbl.GetAnalyzeRowCount(); histCnt > 0 { tblCnt = histCnt } if float64(tbl.ModifyCount)/tblCnt <= autoAnalyzeRatio { diff --git a/statistics/histogram.go b/statistics/histogram.go index 41573ed2987df..9f03266213382 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -849,7 +849,7 @@ func (hg *Histogram) OutOfRange(val types.Datum) bool { │ │ lDatum rDatum */ -func (hg *Histogram) OutOfRangeRowCount(sctx sessionctx.Context, lDatum, rDatum *types.Datum, modifyCount int64) (result float64) { +func (hg *Histogram) OutOfRangeRowCount(sctx sessionctx.Context, lDatum, rDatum *types.Datum, modifyCount, histNDV int64) (result float64) { debugTrace := sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace if debugTrace { debugtrace.EnterContextCommon(sctx) @@ -969,14 +969,31 @@ func (hg *Histogram) OutOfRangeRowCount(sctx sessionctx.Context, lDatum, rDatum } rowCount = totalPercent * hg.NotNullCount() + // Upper bound logic + + allowUseModifyCount := sctx.GetSessionVars().GetOptObjective() != variable.OptObjectiveDeterminate // Use the modifyCount as the upper bound. Note that modifyCount contains insert, delete and update. So this is // a rather loose upper bound. // There are some scenarios where we need to handle out-of-range estimation after both insert and delete happen. // But we don't know how many increases are in the modifyCount. So we have to use this loose bound to ensure it // can produce a reasonable results in this scenario. - if rowCount > float64(modifyCount) { + if rowCount > float64(modifyCount) && allowUseModifyCount { return float64(modifyCount) } + + // In OptObjectiveDeterminate mode, we can't rely on the modify count anymore. + // An upper bound is necessary to make the estimation make sense for predicates with bound on only one end, like a > 1. + // But it's impossible to have a reliable upper bound in all cases. + // We use 1/NDV here (only the Histogram part is considered) and it seems reasonable and good enough for now. + if !allowUseModifyCount { + var upperBound float64 + if histNDV > 0 { + upperBound = hg.NotNullCount() / float64(histNDV) + } + if rowCount > upperBound { + return upperBound + } + } return rowCount } diff --git a/statistics/index.go b/statistics/index.go index c50508b4d536f..7c496544f594e 100644 --- a/statistics/index.go +++ b/statistics/index.go @@ -33,8 +33,11 @@ type Index struct { Info *model.IndexInfo Histogram StatsLoadedStatus - StatsVer int64 // StatsVer is the version of the current stats, used to maintain compatibility - Flag int64 + StatsVer int64 // StatsVer is the version of the current stats, used to maintain compatibility + Flag int64 + // PhysicalID is the physical table id, + // or it could possibly be -1, which means "stats not available". + // The -1 case could happen in a pseudo stats table, and in this case, this stats should not trigger stats loading. PhysicalID int64 } @@ -127,9 +130,7 @@ func (idx *Index) TotalRowCount() float64 { // IsInvalid checks if this index is invalid. func (idx *Index) IsInvalid(sctx sessionctx.Context, collPseudo bool) (res bool) { - if !collPseudo { - idx.CheckStats() - } + idx.CheckStats() var totalCount float64 if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { debugtrace.EnterContextCommon(sctx) diff --git a/statistics/table.go b/statistics/table.go index 14e9f181aa62e..97e7ab71e30ee 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -28,6 +28,7 @@ import ( "github.com/pingcap/tidb/types" "github.com/pingcap/tidb/util/ranger" "go.uber.org/atomic" + "golang.org/x/exp/maps" ) const ( @@ -308,6 +309,29 @@ func (t *Table) Copy() *Table { return nt } +// ShallowCopy copies the current table. +// It's different from Copy(). Only the struct Table (and also the embedded HistColl) is copied here. +// The internal containers, like t.Columns and t.Indices, and the stats, like TopN and Histogram are not copied. +func (t *Table) ShallowCopy() *Table { + newHistColl := HistColl{ + PhysicalID: t.PhysicalID, + HavePhysicalID: t.HavePhysicalID, + RealtimeCount: t.RealtimeCount, + Columns: t.Columns, + Indices: t.Indices, + Pseudo: t.Pseudo, + ModifyCount: t.ModifyCount, + } + nt := &Table{ + HistColl: newHistColl, + Version: t.Version, + Name: t.Name, + TblInfoUpdateTS: t.TblInfoUpdateTS, + ExtendedStats: t.ExtendedStats, + } + return nt +} + // String implements Stringer interface. func (t *Table) String() string { strs := make([]string, 0, len(t.Columns)+1) @@ -370,20 +394,25 @@ func (t *Table) GetStatsInfo(id int64, isIndex bool) (*Histogram, *CMSketch, *To return nil, nil, nil, nil, false } -// GetColRowCount tries to get the row count of the a column if possible. +// GetAnalyzeRowCount tries to get the row count of a column or an index if possible. // This method is useful because this row count doesn't consider the modify count. -func (t *Table) GetColRowCount() float64 { - ids := make([]int64, 0, len(t.Columns)) - for id := range t.Columns { - ids = append(ids, id) - } +func (coll *HistColl) GetAnalyzeRowCount() float64 { + ids := maps.Keys(coll.Columns) slices.Sort(ids) for _, id := range ids { - col := t.Columns[id] + col := coll.Columns[id] if col != nil && col.IsFullLoad() { return col.TotalRowCount() } } + ids = maps.Keys(coll.Indices) + slices.Sort(ids) + for _, id := range ids { + idx := coll.Indices[id] + if idx != nil && idx.IsFullLoad() { + return idx.TotalRowCount() + } + } return -1 } @@ -395,7 +424,7 @@ func (t *Table) GetStatsHealthy() (int64, bool) { } var healthy int64 count := float64(t.RealtimeCount) - if histCount := t.GetColRowCount(); histCount > 0 { + if histCount := t.GetAnalyzeRowCount(); histCount > 0 { count = histCount } if float64(t.ModifyCount) < count { @@ -460,7 +489,7 @@ func (t *Table) IsInitialized() bool { // IsOutdated returns true if the table stats is outdated. func (t *Table) IsOutdated() bool { - rowcount := t.GetColRowCount() + rowcount := t.GetAnalyzeRowCount() if rowcount < 0 { rowcount = float64(t.RealtimeCount) } @@ -550,9 +579,11 @@ func (coll *HistColl) GenerateHistCollFromColumnInfo(tblInfo *model.TableInfo, c } // PseudoTable creates a pseudo table statistics. -func PseudoTable(tblInfo *model.TableInfo) *Table { +// Usually, we don't want to trigger stats loading for pseudo table. +// But there are exceptional cases. In such cases, we should pass allowTriggerLoading as true. +// Such case could possibly happen in getStatsTable(). +func PseudoTable(tblInfo *model.TableInfo, allowTriggerLoading bool) *Table { const fakePhysicalID int64 = -1 - pseudoHistColl := HistColl{ RealtimeCount: PseudoRowCount, PhysicalID: tblInfo.ID, @@ -575,6 +606,9 @@ func PseudoTable(tblInfo *model.TableInfo) *Table { IsHandle: tblInfo.PKIsHandle && mysql.HasPriKeyFlag(col.GetFlag()), Histogram: *NewHistogram(col.ID, 0, 0, 0, &col.FieldType, 0, 0), } + if allowTriggerLoading { + t.Columns[col.ID].PhysicalID = tblInfo.ID + } } } for _, idx := range tblInfo.Indices { @@ -582,7 +616,11 @@ func PseudoTable(tblInfo *model.TableInfo) *Table { t.Indices[idx.ID] = &Index{ PhysicalID: fakePhysicalID, Info: idx, - Histogram: *NewHistogram(idx.ID, 0, 0, 0, types.NewFieldType(mysql.TypeBlob), 0, 0)} + Histogram: *NewHistogram(idx.ID, 0, 0, 0, types.NewFieldType(mysql.TypeBlob), 0, 0), + } + if allowTriggerLoading { + t.Indices[idx.ID].PhysicalID = tblInfo.ID + } } } return t diff --git a/table/tables/tables.go b/table/tables/tables.go index 1f687823780d8..91230778b0db5 100644 --- a/table/tables/tables.go +++ b/table/tables/tables.go @@ -2341,7 +2341,7 @@ type TemporaryTable struct { func TempTableFromMeta(tblInfo *model.TableInfo) tableutil.TempTable { return &TemporaryTable{ modified: false, - stats: statistics.PseudoTable(tblInfo), + stats: statistics.PseudoTable(tblInfo, false), autoIDAllocator: autoid.NewAllocatorFromTempTblInfo(tblInfo), meta: tblInfo, }