Skip to content

Commit

Permalink
planner, statistics: support estimation for mv index access path (#49852)
Browse files Browse the repository at this point in the history

close #46539
  • Loading branch information
time-and-fate authored Jan 5, 2024
1 parent 17d24f4 commit bf166d9
Show file tree
Hide file tree
Showing 11 changed files with 498 additions and 269 deletions.
160 changes: 80 additions & 80 deletions pkg/executor/test/analyzetest/analyze_test.go

Large diffs are not rendered by default.

34 changes: 18 additions & 16 deletions pkg/planner/cardinality/row_count_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,20 +50,16 @@ func GetRowCountByIndexRanges(sctx sessionctx.Context, coll *statistics.HistColl
sc := sctx.GetSessionVars().StmtCtx
idx, ok := coll.Indices[idxID]
colNames := make([]string, 0, 8)
isMVIndex := false
if ok {
if idx.Info != nil {
name = idx.Info.Name.O
for _, col := range idx.Info.Columns {
colNames = append(colNames, col.Name.O)
}
isMVIndex = idx.Info.MVIndex
}
}
recordUsedItemStatsStatus(sctx, idx, coll.PhysicalID, idxID)
// For the mv index case, now we have supported collecting stats and async loading stats, but sync loading and
// estimation is not well-supported, so we keep mv index using pseudo estimation for this period of time.
if !ok || idx.IsInvalid(sctx, coll.Pseudo) || isMVIndex {
if !ok || idx.IsInvalid(sctx, coll.Pseudo) {
colsLen := -1
if idx != nil && idx.Info.Unique {
colsLen = len(idx.Info.Columns)
Expand All @@ -74,17 +70,18 @@ func GetRowCountByIndexRanges(sctx sessionctx.Context, coll *statistics.HistColl
}
return result, err
}
realtimeCnt, modifyCount := coll.GetScaledRealtimeAndModifyCnt(idx)
if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace {
debugtrace.RecordAnyValuesWithNames(sctx,
"Histogram NotNull Count", idx.Histogram.NotNullCount(),
"TopN total count", idx.TopN.TotalCount(),
"Increase Factor", idx.GetIncreaseFactor(coll.RealtimeCount),
"Increase Factor", idx.GetIncreaseFactor(realtimeCnt),
)
}
if idx.CMSketch != nil && idx.StatsVer == statistics.Version1 {
result, err = getIndexRowCountForStatsV1(sctx, coll, idxID, indexRanges)
} else {
result, err = getIndexRowCountForStatsV2(sctx, idx, coll, indexRanges, coll.RealtimeCount, coll.ModifyCount)
result, err = getIndexRowCountForStatsV2(sctx, idx, coll, indexRanges, realtimeCnt, modifyCount)
}
if sc.EnableOptimizerCETrace {
ceTraceRange(sctx, coll.PhysicalID, colNames, indexRanges, "Index Stats", uint64(result))
Expand Down Expand Up @@ -118,7 +115,8 @@ func getIndexRowCountForStatsV1(sctx sessionctx.Context, coll *statistics.HistCo
// on single-column index, use previous way as well, because CMSketch does not contain null
// values in this case.
if rangePosition == 0 || isSingleColIdxNullRange(idx, ran) {
count, err := getIndexRowCountForStatsV2(sctx, idx, nil, []*ranger.Range{ran}, coll.RealtimeCount, coll.ModifyCount)
realtimeCnt, modifyCount := coll.GetScaledRealtimeAndModifyCnt(idx)
count, err := getIndexRowCountForStatsV2(sctx, idx, nil, []*ranger.Range{ran}, realtimeCnt, modifyCount)
if err != nil {
return 0, errors.Trace(err)
}
Expand Down Expand Up @@ -432,13 +430,15 @@ func expBackoffEstimation(sctx sessionctx.Context, idx *statistics.Index, coll *
}
colID := colsIDs[i]
var (
count float64
err error
foundStats bool
count float64
selectivity float64
err error
foundStats bool
)
if col, ok := coll.Columns[colID]; ok && !col.IsInvalid(sctx, coll.Pseudo) {
foundStats = true
count, err = GetRowCountByColumnRanges(sctx, coll, colID, tmpRan)
selectivity = count / float64(coll.RealtimeCount)
}
if idxIDs, ok := coll.ColID2IdxIDs[colID]; ok && !foundStats && len(indexRange.LowVal) > 1 {
// Note the `len(indexRange.LowVal) > 1` condition here, it means we only recursively call
Expand All @@ -448,11 +448,17 @@ func expBackoffEstimation(sctx sessionctx.Context, idx *statistics.Index, coll *
if idxID == idx.Histogram.ID {
continue
}
idxStats, ok := coll.Indices[idxID]
if !ok || idxStats.IsInvalid(sctx, coll.Pseudo) {
continue
}
foundStats = true
count, err = GetRowCountByIndexRanges(sctx, coll, idxID, tmpRan)
if err == nil {
break
}
realtimeCnt, _ := coll.GetScaledRealtimeAndModifyCnt(idxStats)
selectivity = count / float64(realtimeCnt)
}
}
if !foundStats {
Expand All @@ -461,15 +467,11 @@ func expBackoffEstimation(sctx sessionctx.Context, idx *statistics.Index, coll *
if err != nil {
return 0, false, err
}
singleColumnEstResults = append(singleColumnEstResults, count)
singleColumnEstResults = append(singleColumnEstResults, selectivity)
}
// Sort them.
slices.Sort(singleColumnEstResults)
l := len(singleColumnEstResults)
// Convert the first 4 to selectivity results.
for i := 0; i < l && i < 4; i++ {
singleColumnEstResults[i] = singleColumnEstResults[i] / float64(coll.RealtimeCount)
}
failpoint.Inject("cleanEstResults", func() {
singleColumnEstResults = singleColumnEstResults[:0]
l = 0
Expand Down
155 changes: 153 additions & 2 deletions pkg/planner/cardinality/selectivity.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/pingcap/errors"
"github.com/pingcap/tidb/pkg/expression"
"github.com/pingcap/tidb/pkg/parser/ast"
"github.com/pingcap/tidb/pkg/parser/model"
planutil "github.com/pingcap/tidb/pkg/planner/util"
"github.com/pingcap/tidb/pkg/planner/util/debugtrace"
"github.com/pingcap/tidb/pkg/sessionctx"
Expand All @@ -32,6 +33,7 @@ import (
"github.com/pingcap/tidb/pkg/util/codec"
"github.com/pingcap/tidb/pkg/util/collate"
"github.com/pingcap/tidb/pkg/util/logutil"
"github.com/pingcap/tidb/pkg/util/mathutil"
"github.com/pingcap/tidb/pkg/util/ranger"
"go.uber.org/zap"
"golang.org/x/exp/maps"
Expand Down Expand Up @@ -157,6 +159,21 @@ func Selectivity(
slices.Sort(idxIDs)
for _, id := range idxIDs {
idxStats := coll.Indices[id]
idxInfo := idxStats.Info
if idxInfo.MVIndex {
totalSelectivity, mask, ok := getMaskAndSelectivityForMVIndex(ctx, coll, id, remainedExprs)
if !ok {
continue
}
nodes = append(nodes, &StatsNode{
Tp: IndexType,
ID: id,
mask: mask,
numCols: len(idxInfo.Columns),
Selectivity: totalSelectivity,
})
continue
}
idxCols := findPrefixOfIndexByCol(ctx, extractedCols, coll.Idx2ColumnIDs[id], id2Paths[idxStats.ID])
if len(idxCols) > 0 {
lengths := make([]int, 0, len(idxCols))
Expand Down Expand Up @@ -417,6 +434,83 @@ OUTER:
return ret, nodes, nil
}

// CalcTotalSelectivityForMVIdxPath calculates the total selectivity for the given partial paths of an MV index merge path.
// It corresponds with the meaning of AccessPath.CountAfterAccess, as used in buildPartialPathUp4MVIndex.
// It uses the independence assumption to estimate the selectivity.
//
// coll supplies the row counts used as denominators, partialPaths are the partial paths whose CountAfterAccess values
// are combined, and isIntersection selects how they are combined: the per-path selectivities are multiplied when it is
// true, and merged with the inclusion-exclusion formula for independent events (a + b - a*b) when it is false.
// Every per-path selectivity is limited to [0, 1]. With no partial paths the result is 1 for an intersection and 0 for
// a union.
func CalcTotalSelectivityForMVIdxPath(
	coll *statistics.HistColl,
	partialPaths []*planutil.AccessPath,
	isIntersection bool,
) float64 {
	// Start from the neutral element of the chosen combination: 1 for a product, 0 for a union.
	var totalSelectivity float64
	if isIntersection {
		totalSelectivity = 1
	}
	for _, path := range partialPaths {
		// For a partial path, we distinguish between two cases.
		// 1. We will access a single value on the virtual column of the mv index.
		//   In this case, handles from a single partial path must be unique.
		//   The CountAfterAccess of a partial path will never be larger than the table total row count.
		//   For an index merge path with only one partial path, the CountAfterAccess will be exactly the same as the
		//   CountAfterAccess of the partial path (currently there's no index filter for partial path of mv index merge
		//   path).
		// 2. We use the mv index as if it's a non-MV index, which means the virtual column is not involved in the access
		//   conditions.
		//   In this case, we may read repeated handles from a single partial path.
		//   The CountAfterAccess of a partial path might be larger than the table total row count.
		//   For an index merge path with only one partial path, the CountAfterAccess might be less than the
		//   CountAfterAccess of the partial path.
		// For example:
		//   create table t(a int, d json, index iad(a, (cast(d->'$.b' as signed array))));
		//   insert into t value(1,'{"b":[1,2,3,4]}'), (2,'{"b":[3,4,5,6]}');
		//   The index has 8 entries.
		//   Case 1:
		//     select * from t use index (iad) where a = 1 and 1 member of (d->'$.b');
		//     IndexMerge
		//     ├─IndexRangeScan RowCount:1 Range:[1 1,1 1]
		//     └─TableRowIDScan RowCount:1
		//   Case 2:
		//     select * from t use index (iad) where a = 1;
		//     IndexMerge
		//     ├─IndexRangeScan RowCount:4 Range:[1,1]
		//     └─TableRowIDScan RowCount:1
		// From the example, it should be obvious that we need different total row count to calculate the selectivity of
		// the access conditions:
		//   Case 1: Here we should use the table total row count
		//     Selectivity( a = 1 and 1 member of (d->'$.b') ) = 1 / 2
		//   Case 2: Here we should use the index total row count
		//     Selectivity( a = 1 ) = 4 / 8
		var virtualCol *expression.Column
		for _, col := range coll.MVIdx2Columns[path.Index.ID] {
			if col.VirtualExpr != nil {
				virtualCol = col
				break
			}
		}
		cols := expression.ExtractColumnsFromExpressions(nil, path.AccessConds, func(column *expression.Column) bool {
			return virtualCol != nil && column.UniqueID == virtualCol.UniqueID
		})
		realtimeCount := coll.RealtimeCount
		// If we can't find the virtual column from the access conditions, it's the case 2.
		if len(cols) == 0 {
			realtimeCount, _ = coll.GetScaledRealtimeAndModifyCnt(coll.Indices[path.Index.ID])
		}

		var sel float64
		switch {
		case realtimeCount != 0:
			sel = mathutil.Clamp(path.CountAfterAccess/float64(realtimeCount), 0, 1)
		case path.CountAfterAccess > 0:
			// A positive count over a zero total is +Inf, which the clamp would turn into 1 anyway.
			sel = 1
		default:
			// A zero count over a zero total (e.g. an empty table) is 0/0 = NaN in float64 arithmetic.
			// NOTE(review): the clamp presumably passes NaN through unchanged, which would poison the whole
			// result below, so treat "no rows out of no rows" as selectivity 0 explicitly.
			sel = 0
		}

		if isIntersection {
			totalSelectivity *= sel
		} else {
			totalSelectivity = (sel + totalSelectivity) - totalSelectivity*sel
		}
	}
	return totalSelectivity
}

// StatsNode is used for calculating selectivity.
type StatsNode struct {
// Ranges contains all the Ranges we got.
Expand Down Expand Up @@ -621,6 +715,36 @@ func getMaskAndRanges(ctx sessionctx.Context, exprs []expression.Expression, ran
return mask, ranges, false, nil
}

// getMaskAndSelectivityForMVIndex estimates how selective the given expressions are when evaluated through the mv
// index identified by id.
//
// It returns, in order:
//   - the total selectivity computed by CalcTotalSelectivityForMVIdxPath over the partial paths built for the index;
//   - a bit mask in which bit i is set when exprs[i] equals one of the access conditions collected for the index;
//   - whether the estimation succeeded.
//
// When coll has no column list for the index, or the partial paths cannot be built (either an error or a negative
// answer from BuildPartialPaths4MVIndex), it returns (1.0, 0, false). The error itself is not propagated.
func getMaskAndSelectivityForMVIndex(
	ctx sessionctx.Context,
	coll *statistics.HistColl,
	id int64,
	exprs []expression.Expression,
) (float64, int64, bool) {
	idxCols := coll.MVIdx2Columns[id]
	if len(idxCols) == 0 {
		return 1.0, 0, false
	}

	// The matching of expressions against the mv index is delegated to the CollectFilters4MVIndex and
	// BuildPartialPaths4MVIndex hooks; only the access conditions are needed here.
	accessConds, _ := CollectFilters4MVIndex(ctx, exprs, idxCols)
	idxInfo := coll.Indices[id].Info
	partialPaths, isIntersection, built, err := BuildPartialPaths4MVIndex(ctx, accessConds, idxCols, idxInfo, coll)
	if !built || err != nil {
		return 1.0, 0, false
	}
	sel := CalcTotalSelectivityForMVIdxPath(coll, partialPaths, isIntersection)

	// Mark every input expression that was picked up as an access condition.
	mask := int64(0)
	for pos, expr := range exprs {
		covered := false
		for j := 0; j < len(accessConds) && !covered; j++ {
			covered = expr.Equal(ctx, accessConds[j])
		}
		if covered {
			mask |= 1 << uint64(pos)
		}
	}
	return sel, mask, true
}

// GetSelectivityByFilter try to estimate selectivity of expressions by evaluate the expressions using TopN, Histogram buckets boundaries and NULL.
// Currently, this method can only handle expressions involving a single column.
func GetSelectivityByFilter(sctx sessionctx.Context, coll *statistics.HistColl, filters []expression.Expression) (ok bool, selectivity float64, err error) {
Expand Down Expand Up @@ -820,10 +944,11 @@ func getEqualCondSelectivity(sctx sessionctx.Context, coll *statistics.HistColl,
}
val := types.NewBytesDatum(bytes)
if outOfRangeOnIndex(idx, val) {
realtimeCnt, _ := coll.GetScaledRealtimeAndModifyCnt(idx)
// When the value is out of range, we could not find this value in the CM Sketch,
// so we use heuristic methods to estimate the selectivity.
if idx.NDV > 0 && coverAll {
return outOfRangeEQSelectivity(sctx, idx.NDV, coll.RealtimeCount, int64(idx.TotalRowCount())), nil
return outOfRangeEQSelectivity(sctx, idx.NDV, realtimeCnt, int64(idx.TotalRowCount())), nil
}
// The equal condition only uses prefix columns of the index.
colIDs := coll.Idx2ColumnIDs[idx.ID]
Expand All @@ -836,7 +961,7 @@ func getEqualCondSelectivity(sctx sessionctx.Context, coll *statistics.HistColl,
ndv = max(ndv, col.Histogram.NDV)
}
}
return outOfRangeEQSelectivity(sctx, ndv, coll.RealtimeCount, int64(idx.TotalRowCount())), nil
return outOfRangeEQSelectivity(sctx, ndv, realtimeCnt, int64(idx.TotalRowCount())), nil
}

minRowCount, crossValidSelectivity, err := crossValidationSelectivity(sctx, coll, idx, usedColsLen, idxPointRange)
Expand Down Expand Up @@ -942,3 +1067,29 @@ func crossValidationSelectivity(
}
return minRowCount, crossValidationSelectivity, nil
}

// CollectFilters4MVIndex and BuildPartialPaths4MVIndex match JSON expressions against an mv index.
// The logic is shared between the estimation logic and the access path generation logic, but the two functions are
// defined in the planner/core package and are hard to move here. They are therefore exposed as function-typed
// variables to avoid the import cycle. Both are nil until assigned, so they must be set before being called.

// CollectFilters4MVIndex takes a list of filters and the columns of an mv index, and returns the filters it selects
// as access filters together with the remaining ones.
var CollectFilters4MVIndex func(
	sctx sessionctx.Context,
	filters []expression.Expression,
	idxCols []*expression.Column,
) (
	accessFilters,
	remainingFilters []expression.Expression,
)

// BuildPartialPaths4MVIndex builds the partial access paths of an mv index from the given access filters.
// Besides the paths, it reports whether they are to be combined as an intersection, whether the paths could be
// built at all, and any error encountered.
var BuildPartialPaths4MVIndex func(
	sctx sessionctx.Context,
	accessFilters []expression.Expression,
	idxCols []*expression.Column,
	mvIndex *model.IndexInfo,
	histColl *statistics.HistColl,
) (
	partialPaths []*planutil.AccessPath,
	isIntersection bool,
	ok bool,
	err error,
)
9 changes: 8 additions & 1 deletion pkg/planner/core/find_best_task.go
Original file line number Diff line number Diff line change
Expand Up @@ -2565,7 +2565,14 @@ func (ds *DataSource) getOriginalPhysicalIndexScan(prop *property.PhysicalProper
ds.StatsInfo(), ds.tableStats, ds.statisticTable,
path, prop.ExpectedCnt, isMatchProp && prop.SortItems[0].Desc)
}
is.SetStats(ds.tableStats.ScaleByExpectCnt(rowCount))
// ScaleByExpectCnt can only scale the row count to a value smaller than the table total row count.
// But for an MV index, the IndexRangeScan row count may be larger than the table total row count.
// Please see Case 2 in CalcTotalSelectivityForMVIdxPath for an example.
if idx.MVIndex && rowCount > ds.tableStats.RowCount {
is.SetStats(ds.tableStats.Scale(rowCount / ds.tableStats.RowCount))
} else {
is.SetStats(ds.tableStats.ScaleByExpectCnt(rowCount))
}
usedStats := ds.SCtx().GetSessionVars().StmtCtx.GetUsedStatsInfo(false)
if usedStats != nil && usedStats[is.physicalTableID] != nil {
is.usedStatsInfo = usedStats[is.physicalTableID]
Expand Down
Loading

0 comments on commit bf166d9

Please sign in to comment.