Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

planner, statistics: use the correct column ID when recording stats loading status #52208

Merged
merged 13 commits into from
Apr 2, 2024
Merged
1 change: 1 addition & 0 deletions build/nogo_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@
"fieldalignment": {
"exclude_files": {
"pkg/parser/parser.go": "parser/parser.go code",
"pkg/statistics/table.go": "disable this limitation that prevents us from splitting struct fields for clarity",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I always want to do this, but I can never convince myself. 😃

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😂 I think performance benefits and memory efficiency from this rule are small. So I think it's totally OK to disable it when it causes problems or inconvenience.

"external/": "no need to vet third party code",
".*_generated\\.go$": "ignore generated code",
".*mock.go$": "ignore generated code",
Expand Down
2 changes: 1 addition & 1 deletion pkg/planner/cardinality/cross_estimation.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ func crossEstimateRowCount(sctx context.PlanContext,
return 0, err == nil, corr
}
idxID := int64(-1)
idxIDs, idxExists := dsStatsInfo.HistColl.ColID2IdxIDs[colID]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be better to use a clearer name such as 'colUniqueID' instead of 'colID' here, to avoid misuse.

idxIDs, idxExists := dsStatsInfo.HistColl.ColUniqueID2IdxIDs[colID]
if idxExists && len(idxIDs) > 0 {
idxID = idxIDs[0]
}
Expand Down
12 changes: 10 additions & 2 deletions pkg/planner/cardinality/row_count_column.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,11 @@ func GetRowCountByColumnRanges(sctx context.PlanContext, coll *statistics.HistCo
}
sc := sctx.GetSessionVars().StmtCtx
c, ok := coll.Columns[colID]
recordUsedItemStatsStatus(sctx, c, coll.PhysicalID, colID)
colInfoID := colID
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above

if len(coll.UniqueID2colInfoID) > 0 {
colInfoID = coll.UniqueID2colInfoID[colID]
}
recordUsedItemStatsStatus(sctx, c, coll.PhysicalID, colInfoID)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should distinguish between the column ID and the column UniqueID in the EXPLAIN output, e.g. "colID: xxx" and "colUniqueID: xxx".

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We want the column name in the EXPLAIN result, which should be fetched using the column ID from the metadata instead of the UniqueID.

if c != nil && c.Info != nil {
name = c.Info.Name.O
}
Expand Down Expand Up @@ -83,7 +87,11 @@ func GetRowCountByIntColumnRanges(sctx context.PlanContext, coll *statistics.His
}
sc := sctx.GetSessionVars().StmtCtx
c, ok := coll.Columns[colID]
recordUsedItemStatsStatus(sctx, c, coll.PhysicalID, colID)
colInfoID := colID
if len(coll.UniqueID2colInfoID) > 0 {
colInfoID = coll.UniqueID2colInfoID[colID]
}
recordUsedItemStatsStatus(sctx, c, coll.PhysicalID, colInfoID)
if c != nil && c.Info != nil {
name = c.Info.Name.O
}
Expand Down
8 changes: 4 additions & 4 deletions pkg/planner/cardinality/row_count_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,15 +170,15 @@ func getIndexRowCountForStatsV1(sctx context.PlanContext, coll *statistics.HistC
}
var count float64
var err error
colIDs := coll.Idx2ColumnIDs[idxID]
colIDs := coll.Idx2ColUniqueIDs[idxID]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Index ID is not relevant here, only Column ID has this difference.

var colID int64
if rangePosition >= len(colIDs) {
colID = -1
} else {
colID = colIDs[rangePosition]
}
// prefer index stats over column stats
if idxIDs, ok := coll.ColID2IdxIDs[colID]; ok && len(idxIDs) > 0 {
if idxIDs, ok := coll.ColUniqueID2IdxIDs[colID]; ok && len(idxIDs) > 0 {
idxID := idxIDs[0]
count, err = GetRowCountByIndexRanges(sctx, coll, idxID, []*ranger.Range{&rang})
} else {
Expand Down Expand Up @@ -422,7 +422,7 @@ func expBackoffEstimation(sctx context.PlanContext, idx *statistics.Index, coll
Collators: make([]collate.Collator, 1),
},
}
colsIDs := coll.Idx2ColumnIDs[idx.Histogram.ID]
colsIDs := coll.Idx2ColUniqueIDs[idx.Histogram.ID]
singleColumnEstResults := make([]float64, 0, len(indexRange.LowVal))
// The following codes uses Exponential Backoff to reduce the impact of independent assumption. It works like:
// 1. Calc the selectivity of each column.
Expand All @@ -449,7 +449,7 @@ func expBackoffEstimation(sctx context.PlanContext, idx *statistics.Index, coll
count, err = GetRowCountByColumnRanges(sctx, coll, colID, tmpRan)
selectivity = count / float64(coll.RealtimeCount)
}
if idxIDs, ok := coll.ColID2IdxIDs[colID]; ok && !foundStats && len(indexRange.LowVal) > 1 {
if idxIDs, ok := coll.ColUniqueID2IdxIDs[colID]; ok && !foundStats && len(indexRange.LowVal) > 1 {
// Note the `len(indexRange.LowVal) > 1` condition here, it means we only recursively call
// `GetRowCountByIndexRanges()` when the input `indexRange` is a multi-column range. This
// check avoids infinite recursion.
Expand Down
8 changes: 4 additions & 4 deletions pkg/planner/cardinality/selectivity.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ func Selectivity(
})
continue
}
idxCols := findPrefixOfIndexByCol(ctx, extractedCols, coll.Idx2ColumnIDs[id], id2Paths[idxStats.ID])
idxCols := findPrefixOfIndexByCol(ctx, extractedCols, coll.Idx2ColUniqueIDs[id], id2Paths[idxStats.ID])
if len(idxCols) > 0 {
lengths := make([]int, 0, len(idxCols))
for i := 0; i < len(idxCols) && i < len(idxStats.Info.Columns); i++ {
Expand Down Expand Up @@ -919,7 +919,7 @@ func findAvailableStatsForCol(sctx context.PlanContext, coll *statistics.HistCol
return false, uniqueID
}
// try to find available stats in single column index stats (except for prefix index)
for idxStatsIdx, cols := range coll.Idx2ColumnIDs {
for idxStatsIdx, cols := range coll.Idx2ColUniqueIDs {
if len(cols) == 1 && cols[0] == uniqueID {
idxStats := coll.Indices[idxStatsIdx]
if !statistics.IndexStatsIsInvalid(sctx, idxStats, coll, idxStatsIdx) &&
Expand Down Expand Up @@ -968,7 +968,7 @@ func getEqualCondSelectivity(sctx context.PlanContext, coll *statistics.HistColl
return outOfRangeEQSelectivity(sctx, idx.NDV, realtimeCnt, int64(idx.TotalRowCount())), nil
}
// The equal condition only uses prefix columns of the index.
colIDs := coll.Idx2ColumnIDs[idx.ID]
colIDs := coll.Idx2ColUniqueIDs[idx.ID]
var ndv int64
for i, colID := range colIDs {
if i >= usedColsLen {
Expand Down Expand Up @@ -1050,7 +1050,7 @@ func crossValidationSelectivity(
}()
}
minRowCount = math.MaxFloat64
cols := coll.Idx2ColumnIDs[idx.ID]
cols := coll.Idx2ColUniqueIDs[idx.ID]
crossValidationSelectivity = 1.0
totalRowCount := idx.TotalRowCount()
for i, colID := range cols {
Expand Down
4 changes: 2 additions & 2 deletions pkg/planner/cardinality/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -893,8 +893,8 @@ func generateMapsForMockStatsTbl(statsTbl *statistics.Table) {
for _, idxIDs := range colID2IdxIDs {
slices.Sort(idxIDs)
}
statsTbl.Idx2ColumnIDs = idx2Columns
statsTbl.ColID2IdxIDs = colID2IdxIDs
statsTbl.Idx2ColUniqueIDs = idx2Columns
statsTbl.ColUniqueID2IdxIDs = colID2IdxIDs
}

func TestIssue39593(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion pkg/planner/core/casetest/planstats/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ go_test(
],
data = glob(["testdata/**"]),
flaky = True,
shard_count = 4,
shard_count = 5,
deps = [
"//pkg/config",
"//pkg/domain",
Expand Down
45 changes: 45 additions & 0 deletions pkg/planner/core/casetest/planstats/plan_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -405,3 +405,48 @@ func TestCollectDependingVirtualCols(t *testing.T) {
require.Equal(t, output[i].OutputColNames, cols)
}
}

// TestPartialStatsInExplain creates a plain table `t`, a single-column table
// `t2` and a range-partitioned table `tp`, analyzes all three, and then runs
// every recorded EXPLAIN query from the plan-stats test data, comparing each
// result with the stored expectation.
func TestPartialStatsInExplain(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	tk := testkit.NewTestKit(t, store)
	tk.MustExec("use test")

	// Schema and data for the two non-partitioned tables.
	tk.MustExec("create table t(a int, b int, c int, primary key(a), key idx(b))")
	tk.MustExec("insert into t values (1,1,1),(2,2,2),(3,3,3)")
	tk.MustExec("create table t2(a int, primary key(a))")
	tk.MustExec("insert into t2 values (1),(2),(3)")

	// Schema and data for the partitioned table; the rows are spread over
	// all three range partitions.
	createPartitioned := "create table tp(a int, b int, c int, index ic(c)) partition by range(a)" +
		"(partition p0 values less than (10)," +
		"partition p1 values less than (20)," +
		"partition p2 values less than maxvalue)"
	tk.MustExec(createPartitioned)
	tk.MustExec("insert into tp values (1,1,1),(2,2,2),(13,13,13),(14,14,14),(25,25,25),(36,36,36)")

	// Run with a stats lease of 1 and put the previous lease back when the
	// test function returns.
	prevLease := dom.StatsHandle().Lease()
	dom.StatsHandle().SetLease(1)
	defer func() {
		dom.StatsHandle().SetLease(prevLease)
	}()

	tk.MustExec("analyze table t")
	tk.MustExec("analyze table t2")
	tk.MustExec("analyze table tp")
	tk.RequireNoError(dom.StatsHandle().Update(dom.InfoSchema()))
	// This query's result is not checked; it is only required to succeed.
	tk.MustQuery("explain select * from tp where a = 1")
	tk.MustExec("set @@tidb_stats_load_sync_wait = 0")

	var input []string
	var output []struct {
		Query  string
		Result []string
	}
	suite := GetPlanStatsData()
	suite.LoadTestCases(t, &input, &output)

	for i := range input {
		sql := input[i]
		// In record mode, refresh the stored expectation before checking it.
		testdata.OnRecord(func() {
			output[i].Query = sql
			rows := tk.MustQuery(sql).Rows()
			output[i].Result = testdata.ConvertRowsToStrings(rows)
		})
		tk.MustQuery(sql).Check(testkit.Rows(output[i].Result...))
	}
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,5 +62,13 @@
]
}
]
},
{
"name": "TestPartialStatsInExplain",
"cases": [
"explain format = brief select * from tp where b = 10",
"explain format = brief select * from t join tp where tp.a = 10 and t.b = tp.c",
"explain format = brief select * from t join tp partition (p0) join t2 where t.a < 10 and t.b = tp.c and t2.a > 10 and t2.a = tp.c"
]
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -101,5 +101,47 @@
]
}
]
},
{
"Name": "TestPartialStatsInExplain",
"Cases": [
{
"Query": "explain format = brief select * from tp where b = 10",
"Result": [
"TableReader 0.01 root partition:all data:Selection",
"└─Selection 0.01 cop[tikv] eq(test.tp.b, 10)",
" └─TableFullScan 6.00 cop[tikv] table:tp keep order:false, stats:partial[b:allEvicted]"
]
},
{
"Query": "explain format = brief select * from t join tp where tp.a = 10 and t.b = tp.c",
"Result": [
"Projection 0.00 root test.t.a, test.t.b, test.t.c, test.tp.a, test.tp.b, test.tp.c",
"└─HashJoin 0.00 root inner join, equal:[eq(test.tp.c, test.t.b)]",
" ├─TableReader(Build) 0.00 root partition:p1 data:Selection",
" │ └─Selection 0.00 cop[tikv] eq(test.tp.a, 10), not(isnull(test.tp.c))",
" │ └─TableFullScan 6.00 cop[tikv] table:tp keep order:false, stats:partial[c:allEvicted]",
" └─TableReader(Probe) 3.00 root data:Selection",
" └─Selection 3.00 cop[tikv] not(isnull(test.t.b))",
" └─TableFullScan 3.00 cop[tikv] table:t keep order:false, stats:partial[idx:allEvicted, a:allEvicted, b:allEvicted]"
]
},
{
"Query": "explain format = brief select * from t join tp partition (p0) join t2 where t.a < 10 and t.b = tp.c and t2.a > 10 and t2.a = tp.c",
"Result": [
"HashJoin 0.33 root inner join, equal:[eq(test.tp.c, test.t2.a)]",
"├─IndexJoin(Build) 0.33 root inner join, inner:IndexLookUp, outer key:test.t.b, inner key:test.tp.c, equal cond:eq(test.t.b, test.tp.c)",
"│ ├─TableReader(Build) 0.33 root data:Selection",
"│ │ └─Selection 0.33 cop[tikv] gt(test.t.b, 10), not(isnull(test.t.b))",
"│ │ └─TableRangeScan 1.00 cop[tikv] table:t range:[-inf,10), keep order:false, stats:partial[idx:allEvicted, a:allEvicted, b:allEvicted]",
"│ └─IndexLookUp(Probe) 0.33 root partition:p0 ",
"│ ├─Selection(Build) 0.33 cop[tikv] gt(test.tp.c, 10), not(isnull(test.tp.c))",
"│ │ └─IndexRangeScan 0.50 cop[tikv] table:tp, index:ic(c) range: decided by [eq(test.tp.c, test.t.b)], keep order:false, stats:partial[c:allEvicted]",
"│ └─TableRowIDScan(Probe) 0.33 cop[tikv] table:tp keep order:false, stats:partial[c:allEvicted]",
"└─TableReader(Probe) 1.00 root data:TableRangeScan",
" └─TableRangeScan 1.00 cop[tikv] table:t2 range:(10,+inf], keep order:false, stats:partial[a:allEvicted]"
]
}
]
}
]
2 changes: 1 addition & 1 deletion pkg/planner/core/exhaust_physical_plans.go
Original file line number Diff line number Diff line change
Expand Up @@ -1166,7 +1166,7 @@ func getColsNDVLowerBoundFromHistColl(colUIDs []int64, histColl *statistics.Hist
// 2. Try to get NDV from index stats.
// Note that we don't need to specially handle prefix index here, because the NDV of a prefix index is
// equal or less than the corresponding normal index, and that's safe here since we want a lower bound.
for idxID, idxCols := range histColl.Idx2ColumnIDs {
for idxID, idxCols := range histColl.Idx2ColUniqueIDs {
if len(idxCols) != len(colUIDs) {
continue
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/planner/core/logical_plans.go
Original file line number Diff line number Diff line change
Expand Up @@ -1798,8 +1798,8 @@ func (ds *DataSource) fillIndexPath(path *util.AccessPath, conds []expression.Ex
path.IdxCols = append(path.IdxCols, handleCol)
path.IdxColLens = append(path.IdxColLens, types.UnspecifiedLength)
// Also updates the map that maps the index id to its prefix column ids.
if len(ds.tableStats.HistColl.Idx2ColumnIDs[path.Index.ID]) == len(path.Index.Columns) {
ds.tableStats.HistColl.Idx2ColumnIDs[path.Index.ID] = append(ds.tableStats.HistColl.Idx2ColumnIDs[path.Index.ID], handleCol.UniqueID)
if len(ds.tableStats.HistColl.Idx2ColUniqueIDs[path.Index.ID]) == len(path.Index.Columns) {
ds.tableStats.HistColl.Idx2ColUniqueIDs[path.Index.ID] = append(ds.tableStats.HistColl.Idx2ColUniqueIDs[path.Index.ID], handleCol.UniqueID)
}
}
}
Expand Down
6 changes: 3 additions & 3 deletions pkg/planner/core/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ func (ds *DataSource) getGroupNDVs(colGroups [][]*expression.Column) []property.
tbl := ds.tableStats.HistColl
ndvs := make([]property.GroupNDV, 0, len(colGroups))
for idxID, idx := range tbl.Indices {
colsLen := len(tbl.Idx2ColumnIDs[idxID])
// tbl.Idx2ColumnIDs may only contain the prefix of index columns.
colsLen := len(tbl.Idx2ColUniqueIDs[idxID])
// tbl.Idx2ColUniqueIDs may only contain the prefix of index columns.
// But it may exceeds the total index since the index would contain the handle column if it's not a unique index.
// We append the handle at fillIndexPath.
if colsLen < len(idx.Info.Columns) {
Expand All @@ -186,7 +186,7 @@ func (ds *DataSource) getGroupNDVs(colGroups [][]*expression.Column) []property.
colsLen--
}
idxCols := make([]int64, colsLen)
copy(idxCols, tbl.Idx2ColumnIDs[idxID])
copy(idxCols, tbl.Idx2ColUniqueIDs[idxID])
slices.Sort(idxCols)
for _, g := range colGroups {
// We only want those exact matches.
Expand Down
62 changes: 38 additions & 24 deletions pkg/statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,17 +215,11 @@ const (

// HistColl is a collection of histogram. It collects enough information for plan to calculate the selectivity.
type HistColl struct {
Columns map[int64]*Column
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Big thanks for refactoring this structure!

Indices map[int64]*Index
// Idx2ColumnIDs maps the index id to its column ids. It's used to calculate the selectivity in planner.
Idx2ColumnIDs map[int64][]int64
// ColID2IdxIDs maps the column id to a list index ids whose first column is it. It's used to calculate the selectivity in planner.
ColID2IdxIDs map[int64][]int64
// MVIdx2Columns maps the index id to its columns by expression.Column.
// For normal index, the column id is enough, as we already have in Idx2ColumnIDs. But currently, mv index needs more
// information to match the filter against the mv index columns, and we need this map to provide this information.
MVIdx2Columns map[int64][]*expression.Column
Comment on lines -221 to -227
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: It seems we are still using some of these names in our testing code. Feel free to update them or ignore this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I found, those names are only used inside a function, so I think that's OK.

PhysicalID int64
// Note that when used in a query, Column use UniqueID as the key while Indices use the index ID in the
// metadata. (See GenerateHistCollFromColumnInfo() for details)
Columns map[int64]*Column
Indices map[int64]*Index
PhysicalID int64
// TODO: add AnalyzeCount here
RealtimeCount int64 // RealtimeCount is the current table row count, maintained by applying stats delta based on AnalyzeCount.
ModifyCount int64 // Total modify count in a table.
Expand All @@ -234,9 +228,26 @@ type HistColl struct {
StatsVer int
// HavePhysicalID is true means this HistColl is from single table and have its ID's information.
// The physical id is used when try to load column stats from storage.
HavePhysicalID bool
Pseudo bool
HavePhysicalID bool
Pseudo bool

/*
Fields below are only used in a query, like for estimation, and they will be useless when stored in
the stats cache. (See GenerateHistCollFromColumnInfo() for details)
*/

CanNotTriggerLoad bool
// Idx2ColUniqueIDs maps the index id to its column UniqueIDs. It's used to calculate the selectivity in planner.
Idx2ColUniqueIDs map[int64][]int64
// ColUniqueID2IdxIDs maps the column UniqueID to a list index ids whose first column is it.
// It's used to calculate the selectivity in planner.
ColUniqueID2IdxIDs map[int64][]int64
// UniqueID2colInfoID maps the column UniqueID to its ID in the metadata.
UniqueID2colInfoID map[int64]int64
// MVIdx2Columns maps the index id to its columns by expression.Column.
// For normal index, the column id is enough, as we already have in Idx2ColUniqueIDs. But currently, mv index needs more
// information to match the filter against the mv index columns, and we need this map to provide this information.
MVIdx2Columns map[int64][]*expression.Column
}

// TableMemoryUsage records tbl memory usage
Expand Down Expand Up @@ -800,13 +811,15 @@ func (coll *HistColl) ID2UniqueID(columns []*expression.Column) *HistColl {
return newColl
}

// GenerateHistCollFromColumnInfo generates a new HistColl whose ColID2IdxIDs and IdxID2ColIDs is built from the given parameter.
// GenerateHistCollFromColumnInfo generates a new HistColl whose ColUniqueID2IdxIDs and Idx2ColUniqueIDs are built from the given parameters.
func (coll *HistColl) GenerateHistCollFromColumnInfo(tblInfo *model.TableInfo, columns []*expression.Column) *HistColl {
newColHistMap := make(map[int64]*Column)
colInfoID2UniqueID := make(map[int64]int64, len(columns))
uniqueID2colInfoID := make(map[int64]int64, len(columns))
idxID2idxInfo := make(map[int64]*model.IndexInfo)
for _, col := range columns {
colInfoID2UniqueID[col.ID] = col.UniqueID
uniqueID2colInfoID[col.UniqueID] = col.ID
}
for id, colHist := range coll.Columns {
uniqueID, ok := colInfoID2UniqueID[id]
Expand Down Expand Up @@ -853,16 +866,17 @@ func (coll *HistColl) GenerateHistCollFromColumnInfo(tblInfo *model.TableInfo, c
slices.Sort(idxIDs)
}
newColl := &HistColl{
PhysicalID: coll.PhysicalID,
HavePhysicalID: coll.HavePhysicalID,
Pseudo: coll.Pseudo,
RealtimeCount: coll.RealtimeCount,
ModifyCount: coll.ModifyCount,
Columns: newColHistMap,
Indices: newIdxHistMap,
ColID2IdxIDs: colID2IdxIDs,
Idx2ColumnIDs: idx2Columns,
MVIdx2Columns: mvIdx2Columns,
PhysicalID: coll.PhysicalID,
HavePhysicalID: coll.HavePhysicalID,
Pseudo: coll.Pseudo,
RealtimeCount: coll.RealtimeCount,
ModifyCount: coll.ModifyCount,
Columns: newColHistMap,
Indices: newIdxHistMap,
ColUniqueID2IdxIDs: colID2IdxIDs,
Idx2ColUniqueIDs: idx2Columns,
UniqueID2colInfoID: uniqueID2colInfoID,
MVIdx2Columns: mvIdx2Columns,
}
return newColl
}
Expand Down