From 4be52661ca1b152b210c011c5f5226420adf24e5 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Thu, 13 Oct 2022 20:10:34 +0800 Subject: [PATCH 1/7] add --- planner/core/find_best_task.go | 7 ++++--- statistics/histogram.go | 2 +- statistics/index.go | 12 ++++++++++-- statistics/table.go | 28 ++++++++++++++++++---------- 4 files changed, 33 insertions(+), 16 deletions(-) diff --git a/planner/core/find_best_task.go b/planner/core/find_best_task.go index c73ce9f3c086d..2389f56da5337 100644 --- a/planner/core/find_best_task.go +++ b/planner/core/find_best_task.go @@ -1711,9 +1711,10 @@ func (ds *DataSource) crossEstimateRowCount(path *util.AccessPath, conds []expre if len(ranges) == 0 || len(accessConds) == 0 || err != nil { return 0, err == nil, corr } - idxID, idxExists := ds.stats.HistColl.ColID2IdxID[colID] - if !idxExists { - idxID = -1 + idxID := int64(-1) + idxIDs, idxExists := ds.stats.HistColl.ColID2IdxIDs[colID] + if idxExists && len(idxIDs) > 0 { + idxID = idxIDs[0] } rangeCounts, ok := getColumnRangeCounts(ds.ctx, colID, ranges, ds.tableStats.HistColl, idxID) if !ok { diff --git a/statistics/histogram.go b/statistics/histogram.go index 2133ccad3b53b..8c662b6f04061 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -997,7 +997,7 @@ func (coll *HistColl) NewHistCollBySelectivity(sctx sessionctx.Context, statsNod Columns: make(map[int64]*Column), Indices: make(map[int64]*Index), Idx2ColumnIDs: coll.Idx2ColumnIDs, - ColID2IdxID: coll.ColID2IdxID, + ColID2IdxIDs: coll.ColID2IdxIDs, Count: coll.Count, } for _, node := range statsNodes { diff --git a/statistics/index.go b/statistics/index.go index 6b8a88501c30e..e49d10fcc7d24 100644 --- a/statistics/index.go +++ b/statistics/index.go @@ -349,8 +349,16 @@ func (idx *Index) expBackoffEstimation(sctx sessionctx.Context, coll *HistColl, count float64 err error ) - if anotherIdxID, ok := coll.ColID2IdxID[colID]; ok && anotherIdxID != 
idx.Histogram.ID { - count, err = coll.GetRowCountByIndexRanges(sctx, anotherIdxID, tmpRan) + if idxIDs, ok := coll.ColID2IdxIDs[colID]; ok { + for _, idxID := range idxIDs { + if idxID == idx.Histogram.ID { + continue + } + count, err = coll.GetRowCountByIndexRanges(sctx, idxID, tmpRan) + if err == nil { + break + } + } } else if col, ok := coll.Columns[colID]; ok && !col.IsInvalid(sctx, coll.Pseudo) { count, err = coll.GetRowCountByColumnRanges(sctx, colID, tmpRan) } else { diff --git a/statistics/table.go b/statistics/table.go index 81cb4e9bf284f..80f7e922a83c1 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -101,10 +101,10 @@ type HistColl struct { Indices map[int64]*Index // Idx2ColumnIDs maps the index id to its column ids. It's used to calculate the selectivity in planner. Idx2ColumnIDs map[int64][]int64 - // ColID2IdxID maps the column id to index id whose first column is it. It's used to calculate the selectivity in planner. - ColID2IdxID map[int64]int64 - Count int64 - ModifyCount int64 // Total modify count in a table. + // ColID2IdxIDs maps the column id to a list of index ids whose first column is it. It's used to calculate the selectivity in planner. + ColID2IdxIDs map[int64][]int64 + Count int64 + ModifyCount int64 // Total modify count in a table. // HavePhysicalID is true means this HistColl is from single table and have its ID's information. // The physical id is used when try to load column stats from storage. @@ -846,7 +846,7 @@ func (coll *HistColl) ID2UniqueID(columns []*expression.Column) *HistColl { return newColl } -// GenerateHistCollFromColumnInfo generates a new HistColl whose ColID2IdxID and IdxID2ColIDs is built from the given parameter. +// GenerateHistCollFromColumnInfo generates a new HistColl whose ColID2IdxIDs and Idx2ColumnIDs are built from the given parameters. 
func (coll *HistColl) GenerateHistCollFromColumnInfo(infos []*model.ColumnInfo, columns []*expression.Column) *HistColl { newColHistMap := make(map[int64]*Column) colInfoID2UniqueID := make(map[int64]int64, len(columns)) @@ -869,7 +869,7 @@ func (coll *HistColl) GenerateHistCollFromColumnInfo(infos []*model.ColumnInfo, } newIdxHistMap := make(map[int64]*Index) idx2Columns := make(map[int64][]int64) - colID2IdxID := make(map[int64]int64) + colID2IdxIDs := make(map[int64][]int64) for _, idxHist := range coll.Indices { ids := make([]int64, 0, len(idxHist.Info.Columns)) for _, idxCol := range idxHist.Info.Columns { @@ -883,10 +883,13 @@ func (coll *HistColl) GenerateHistCollFromColumnInfo(infos []*model.ColumnInfo, if len(ids) == 0 { continue } - colID2IdxID[ids[0]] = idxHist.ID + colID2IdxIDs[ids[0]] = append(colID2IdxIDs[ids[0]], idxHist.ID) newIdxHistMap[idxHist.ID] = idxHist idx2Columns[idxHist.ID] = ids } + for _, idxIDs := range colID2IdxIDs { + slices.Sort(idxIDs) + } newColl := &HistColl{ PhysicalID: coll.PhysicalID, HavePhysicalID: coll.HavePhysicalID, @@ -895,7 +898,7 @@ func (coll *HistColl) GenerateHistCollFromColumnInfo(infos []*model.ColumnInfo, ModifyCount: coll.ModifyCount, Columns: newColHistMap, Indices: newIdxHistMap, - ColID2IdxID: colID2IdxID, + ColID2IdxIDs: colID2IdxIDs, Idx2ColumnIDs: idx2Columns, } return newColl @@ -1084,8 +1087,13 @@ func (coll *HistColl) getIndexRowCount(sctx sessionctx.Context, idxID int64, ind colID = colIDs[rangePosition] } // prefer index stats over column stats - if idx, ok := coll.ColID2IdxID[colID]; ok { - count, err = coll.GetRowCountByIndexRanges(sctx, idx, []*ranger.Range{&rang}) + if idxIDs, ok := coll.ColID2IdxIDs[colID]; ok { + for _, idxID := range idxIDs { + count, err = coll.GetRowCountByIndexRanges(sctx, idxID, []*ranger.Range{&rang}) + if err == nil { + break + } + } } else { count, err = coll.GetRowCountByColumnRanges(sctx, colID, []*ranger.Range{&rang}) } From 6dda7d6b612c7f2878059e9686a4ca4f7c62d692 Mon 
Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Thu, 13 Oct 2022 20:37:50 +0800 Subject: [PATCH 2/7] avoid single col range use exp backoff --- statistics/index.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/statistics/index.go b/statistics/index.go index e49d10fcc7d24..ad8d95ff8c0b6 100644 --- a/statistics/index.go +++ b/statistics/index.go @@ -267,7 +267,10 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang expBackoffSuccess := false // Due to the limitation of calcFraction and convertDatumToScalar, the histogram actually won't estimate anything. // If the first column's range is point. - if rangePosition := GetOrdinalOfRangeCond(sc, indexRange); rangePosition > 0 && idx.StatsVer >= Version2 && coll != nil { + if rangePosition := GetOrdinalOfRangeCond(sc, indexRange); rangePosition > 0 && + len(indexRange.LowVal) > 1 && + idx.StatsVer >= Version2 && + coll != nil { var expBackoffSel float64 expBackoffSel, expBackoffSuccess, err = idx.expBackoffEstimation(sctx, coll, indexRange) if err != nil { From d2ec298cd07fc3340179b003454ad4c84986176e Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Thu, 13 Oct 2022 20:55:48 +0800 Subject: [PATCH 3/7] prefer col over index, and update test result --- cmd/explaintest/r/imdbload.result | 4 ++-- statistics/index.go | 6 +++--- statistics/table.go | 10 +++------- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/cmd/explaintest/r/imdbload.result b/cmd/explaintest/r/imdbload.result index c3ee5badab7e6..00543e6a1640f 100644 --- a/cmd/explaintest/r/imdbload.result +++ b/cmd/explaintest/r/imdbload.result @@ -286,7 +286,7 @@ IndexLookUp_7 1005030.94 root └─TableRowIDScan_6(Probe) 1005030.94 cop[tikv] table:char_name keep order:false trace plan target = 'estimation' select * from char_name where ((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') 
and (surname_pcode < 'E436')); CE_trace -[{"table_name":"char_name","type":"Column Stats-Point","expr":"((imdb_index = 'I'))","row_count":0},{"table_name":"char_name","type":"Column Stats-Point","expr":"((imdb_index = 'L'))","row_count":0},{"table_name":"char_name","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":4314864},{"table_name":"char_name","type":"Index Stats-Range","expr":"((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436'))","row_count":0},{"table_name":"char_name","type":"Index Stats-Range","expr":"((surname_pcode < 'E436'))","row_count":1005030},{"table_name":"char_name","type":"Table Stats-Expression-CNF","expr":"`or`(`and`(`eq`(imdbload.char_name.imdb_index, 'I'), `lt`(imdbload.char_name.surname_pcode, 'E436')), `and`(`eq`(imdbload.char_name.imdb_index, 'L'), `lt`(imdbload.char_name.surname_pcode, 'E436')))","row_count":804024}] +[{"table_name":"char_name","type":"Column Stats-Point","expr":"((imdb_index = 'I'))","row_count":0},{"table_name":"char_name","type":"Column Stats-Point","expr":"((imdb_index = 'L'))","row_count":0},{"table_name":"char_name","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":4314864},{"table_name":"char_name","type":"Column Stats-Range","expr":"((surname_pcode < 'E436'))","row_count":1005030},{"table_name":"char_name","type":"Index Stats-Range","expr":"((imdb_index = 'I') and (surname_pcode < 'E436')) or ((imdb_index = 'L') and (surname_pcode < 'E436'))","row_count":0},{"table_name":"char_name","type":"Index Stats-Range","expr":"((surname_pcode < 'E436'))","row_count":1005030},{"table_name":"char_name","type":"Table Stats-Expression-CNF","expr":"`or`(`and`(`eq`(imdbload.char_name.imdb_index, 'I'), `lt`(imdbload.char_name.surname_pcode, 'E436')), `and`(`eq`(imdbload.char_name.imdb_index, 'L'), `lt`(imdbload.char_name.surname_pcode, 
'E436')))","row_count":804024}] explain select * from char_name where ((imdb_index = 'V') and (surname_pcode < 'L3416')); id estRows task access object operator info @@ -356,7 +356,7 @@ IndexLookUp_11 901.00 root └─TableRowIDScan_9 901.00 cop[tikv] table:keyword keep order:false trace plan target = 'estimation' select * from keyword where ((phonetic_code = 'R1652') and (keyword > 'ecg-monitor' and keyword < 'killers')); CE_trace -[{"table_name":"keyword","type":"Column Stats-Point","expr":"((phonetic_code = 'R1652'))","row_count":23480},{"table_name":"keyword","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":236627},{"table_name":"keyword","type":"Column Stats-Range","expr":"((keyword > 'ecg-monitor' and keyword < 'killers'))","row_count":44075},{"table_name":"keyword","type":"Index Stats-Point","expr":"((phonetic_code = 'R1652'))","row_count":23480},{"table_name":"keyword","type":"Index Stats-Range","expr":"((keyword > 'ecg-monitor' and keyword < 'killers'))","row_count":44036},{"table_name":"keyword","type":"Index Stats-Range","expr":"((keyword >= 'ecg-m' and keyword <= 'kille'))","row_count":44036},{"table_name":"keyword","type":"Index Stats-Range","expr":"((phonetic_code = 'R1652') and (keyword > 'ecg-monitor' and keyword < 'killers'))","row_count":901},{"table_name":"keyword","type":"Table Stats-Expression-CNF","expr":"`and`(`eq`(imdbload.keyword.phonetic_code, 'R1652'), `and`(`gt`(imdbload.keyword.keyword, 'ecg-monitor'), `lt`(imdbload.keyword.keyword, 'killers')))","row_count":901}] +[{"table_name":"keyword","type":"Column Stats-Point","expr":"((phonetic_code = 'R1652'))","row_count":23480},{"table_name":"keyword","type":"Column Stats-Range","expr":"((id >= -9223372036854775808 and id <= 9223372036854775807))","row_count":236627},{"table_name":"keyword","type":"Column Stats-Range","expr":"((keyword > 'ecg-monitor' and keyword < 'killers'))","row_count":44075},{"table_name":"keyword","type":"Index 
Stats-Point","expr":"((phonetic_code = 'R1652'))","row_count":23480},{"table_name":"keyword","type":"Index Stats-Range","expr":"((keyword >= 'ecg-m' and keyword <= 'kille'))","row_count":44036},{"table_name":"keyword","type":"Index Stats-Range","expr":"((phonetic_code = 'R1652') and (keyword > 'ecg-monitor' and keyword < 'killers'))","row_count":901},{"table_name":"keyword","type":"Table Stats-Expression-CNF","expr":"`and`(`eq`(imdbload.keyword.phonetic_code, 'R1652'), `and`(`gt`(imdbload.keyword.keyword, 'ecg-monitor'), `lt`(imdbload.keyword.keyword, 'killers')))","row_count":901}] explain select * from cast_info where (nr_order is null) and (person_role_id = 2) and (note >= '(key set pa: Florida'); id estRows task access object operator info diff --git a/statistics/index.go b/statistics/index.go index ad8d95ff8c0b6..8f240b3d79af7 100644 --- a/statistics/index.go +++ b/statistics/index.go @@ -352,7 +352,9 @@ func (idx *Index) expBackoffEstimation(sctx sessionctx.Context, coll *HistColl, count float64 err error ) - if idxIDs, ok := coll.ColID2IdxIDs[colID]; ok { + if col, ok := coll.Columns[colID]; ok && !col.IsInvalid(sctx, coll.Pseudo) { + count, err = coll.GetRowCountByColumnRanges(sctx, colID, tmpRan) + } else if idxIDs, ok := coll.ColID2IdxIDs[colID]; ok { for _, idxID := range idxIDs { if idxID == idx.Histogram.ID { continue @@ -362,8 +364,6 @@ func (idx *Index) expBackoffEstimation(sctx sessionctx.Context, coll *HistColl, break } } - } else if col, ok := coll.Columns[colID]; ok && !col.IsInvalid(sctx, coll.Pseudo) { - count, err = coll.GetRowCountByColumnRanges(sctx, colID, tmpRan) } else { continue } diff --git a/statistics/table.go b/statistics/table.go index 80f7e922a83c1..2e66e39ab8152 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -1087,13 +1087,9 @@ func (coll *HistColl) getIndexRowCount(sctx sessionctx.Context, idxID int64, ind colID = colIDs[rangePosition] } // prefer index stats over column stats - if idxIDs, ok := 
coll.ColID2IdxIDs[colID]; ok { - for _, idxID := range idxIDs { - count, err = coll.GetRowCountByIndexRanges(sctx, idxID, []*ranger.Range{&rang}) - if err == nil { - break - } - } + if idxIDs, ok := coll.ColID2IdxIDs[colID]; ok && len(idxIDs) > 0 { + idxID := idxIDs[0] + count, err = coll.GetRowCountByIndexRanges(sctx, idxID, []*ranger.Range{&rang}) } else { count, err = coll.GetRowCountByColumnRanges(sctx, colID, []*ranger.Range{&rang}) } From 9de2b6db076ee848a5f596b0ee915904fc7daf45 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Fri, 14 Oct 2022 03:48:25 +0800 Subject: [PATCH 4/7] fix --- statistics/index.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/statistics/index.go b/statistics/index.go index 8f240b3d79af7..dc1b82b4fadcc 100644 --- a/statistics/index.go +++ b/statistics/index.go @@ -268,7 +268,6 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang // Due to the limitation of calcFraction and convertDatumToScalar, the histogram actually won't estimate anything. // If the first column's range is point. 
if rangePosition := GetOrdinalOfRangeCond(sc, indexRange); rangePosition > 0 && - len(indexRange.LowVal) > 1 && idx.StatsVer >= Version2 && coll != nil { var expBackoffSel float64 @@ -354,7 +353,7 @@ func (idx *Index) expBackoffEstimation(sctx sessionctx.Context, coll *HistColl, ) if col, ok := coll.Columns[colID]; ok && !col.IsInvalid(sctx, coll.Pseudo) { count, err = coll.GetRowCountByColumnRanges(sctx, colID, tmpRan) - } else if idxIDs, ok := coll.ColID2IdxIDs[colID]; ok { + } else if idxIDs, ok := coll.ColID2IdxIDs[colID]; ok && len(indexRange.LowVal) > 1 { for _, idxID := range idxIDs { if idxID == idx.Histogram.ID { continue From a527e958202f9342e5f16ace98d38a9127beb9ad Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Fri, 14 Oct 2022 11:59:40 +0800 Subject: [PATCH 5/7] remove redundant changes --- statistics/index.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/statistics/index.go b/statistics/index.go index dc1b82b4fadcc..80d2a4f1a64b3 100644 --- a/statistics/index.go +++ b/statistics/index.go @@ -267,9 +267,7 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang expBackoffSuccess := false // Due to the limitation of calcFraction and convertDatumToScalar, the histogram actually won't estimate anything. // If the first column's range is point. 
- if rangePosition := GetOrdinalOfRangeCond(sc, indexRange); rangePosition > 0 && - idx.StatsVer >= Version2 && - coll != nil { + if rangePosition := GetOrdinalOfRangeCond(sc, indexRange); rangePosition > 0 && idx.StatsVer >= Version2 && coll != nil { var expBackoffSel float64 expBackoffSel, expBackoffSuccess, err = idx.expBackoffEstimation(sctx, coll, indexRange) if err != nil { From fcce2912c560037cb2733e146d1f81e38359b837 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Fri, 14 Oct 2022 12:46:44 +0800 Subject: [PATCH 6/7] add comments --- statistics/index.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/statistics/index.go b/statistics/index.go index 80d2a4f1a64b3..3816999565212 100644 --- a/statistics/index.go +++ b/statistics/index.go @@ -352,6 +352,9 @@ func (idx *Index) expBackoffEstimation(sctx sessionctx.Context, coll *HistColl, if col, ok := coll.Columns[colID]; ok && !col.IsInvalid(sctx, coll.Pseudo) { count, err = coll.GetRowCountByColumnRanges(sctx, colID, tmpRan) } else if idxIDs, ok := coll.ColID2IdxIDs[colID]; ok && len(indexRange.LowVal) > 1 { + // Note the `len(indexRange.LowVal) > 1` condition here, it means we only recursively call + // `GetRowCountByIndexRanges()` when the input `indexRange` is a multi-column range. This + // check avoids infinite recursion. 
for _, idxID := range idxIDs { if idxID == idx.Histogram.ID { continue From 41541d2f45e97562cd47d493fb41b4135d4acf91 Mon Sep 17 00:00:00 2001 From: time-and-fate <25057648+time-and-fate@users.noreply.github.com> Date: Tue, 18 Oct 2022 15:09:52 +0800 Subject: [PATCH 7/7] fix --- statistics/index.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/statistics/index.go b/statistics/index.go index 3816999565212..71d2aa839bd61 100644 --- a/statistics/index.go +++ b/statistics/index.go @@ -346,12 +346,15 @@ func (idx *Index) expBackoffEstimation(sctx sessionctx.Context, coll *HistColl, } colID := colsIDs[i] var ( - count float64 - err error + count float64 + err error + foundStats bool ) if col, ok := coll.Columns[colID]; ok && !col.IsInvalid(sctx, coll.Pseudo) { + foundStats = true count, err = coll.GetRowCountByColumnRanges(sctx, colID, tmpRan) - } else if idxIDs, ok := coll.ColID2IdxIDs[colID]; ok && len(indexRange.LowVal) > 1 { + } + if idxIDs, ok := coll.ColID2IdxIDs[colID]; ok && !foundStats && len(indexRange.LowVal) > 1 { // Note the `len(indexRange.LowVal) > 1` condition here, it means we only recursively call // `GetRowCountByIndexRanges()` when the input `indexRange` is a multi-column range. This // check avoids infinite recursion. @@ -359,12 +362,14 @@ func (idx *Index) expBackoffEstimation(sctx sessionctx.Context, coll *HistColl, if idxID == idx.Histogram.ID { continue } + foundStats = true count, err = coll.GetRowCountByIndexRanges(sctx, idxID, tmpRan) if err == nil { break } } - } else { + } + if !foundStats { continue } if err != nil {