From 4d329e5e2f9350c9d927f35f168337786123a5a5 Mon Sep 17 00:00:00 2001
From: Yifan Xu <30385241+xuyifangreeneyes@users.noreply.github.com>
Date: Fri, 10 Mar 2023 17:03:13 +0800
Subject: [PATCH] This is an automated cherry-pick of #42074

Signed-off-by: ti-chi-bot
---
 statistics/integration_test.go      | 184 +++++++++++
 statistics/interact_with_storage.go | 484 ++++++++++++++++++++++++++++
 2 files changed, 668 insertions(+)
 create mode 100644 statistics/interact_with_storage.go

diff --git a/statistics/integration_test.go b/statistics/integration_test.go
index e2a52d6401ad3..11f1585f4e639 100644
--- a/statistics/integration_test.go
+++ b/statistics/integration_test.go
@@ -559,3 +559,187 @@ func hasPseudoStats(rows [][]interface{}) bool {
 	}
 	return false
 }
+// NOTE(review): the "<<<<<<< HEAD" / "=======" / ">>>>>>> cdab35847f8" lines in this hunk are unresolved
+// cherry-pick conflict markers. They are not valid Go, so this file cannot compile until the conflict is
+// resolved by hand and the markers are removed.
+<<<<<<< HEAD
+=======
+
+// TestNotLoadedStatsOnAllNULLCol makes sure that stats on a column that only contains NULLs can be used even when it's
+// not loaded. This is reasonable because it makes no difference whether it's loaded or not.
+func TestNotLoadedStatsOnAllNULLCol(t *testing.T) {
+	store, dom := testkit.CreateMockStoreAndDomain(t)
+	h := dom.StatsHandle()
+	// Use a non-zero lease for the duration of the test and restore the original one afterwards.
+	oriLease := h.Lease()
+	h.SetLease(1000)
+	defer func() {
+		h.SetLease(oriLease)
+	}()
+	tk := testkit.NewTestKit(t, store)
+	tk.MustExec("use test")
+	tk.MustExec("drop table if exists t1")
+	tk.MustExec("drop table if exists t2")
+	tk.MustExec("create table t1(a int)")
+	tk.MustExec("create table t2(a int)")
+	// t1 holds four NULLs and t2 holds two NULLs, so the join key of both tables is all-NULL.
+	tk.MustExec("insert into t1 values(null), (null), (null), (null)")
+	tk.MustExec("insert into t2 values(null), (null)")
+	tk.MustExec("analyze table t1;")
+	tk.MustExec("analyze table t2;")
+
+	res := tk.MustQuery("explain format = 'brief' select * from t1 left join t2 on t1.a=t2.a order by t1.a, t2.a")
+	res.Check(testkit.Rows(
+		"Sort 4.00 root test.t1.a, test.t2.a",
+		"└─HashJoin 4.00 root left outer join, equal:[eq(test.t1.a, test.t2.a)]",
+		" ├─TableReader(Build) 0.00 root data:Selection",
+		// If we are not using stats on this column (which means we use pseudo estimation), the row count for the Selection will become 2.
+		" │ └─Selection 0.00 cop[tikv] not(isnull(test.t2.a))",
+		" │ └─TableFullScan 2.00 cop[tikv] table:t2 keep order:false",
+		" └─TableReader(Probe) 4.00 root data:TableFullScan",
+		" └─TableFullScan 4.00 cop[tikv] table:t1 keep order:false"))
+
+	res = tk.MustQuery("explain format = 'brief' select * from t2 left join t1 on t1.a=t2.a order by t1.a, t2.a")
+	res.Check(testkit.Rows(
+		"Sort 2.00 root test.t1.a, test.t2.a",
+		"└─HashJoin 2.00 root left outer join, equal:[eq(test.t2.a, test.t1.a)]",
+		// If we are not using stats on this column, the build side will become t2 because of smaller row count.
+		" ├─TableReader(Build) 0.00 root data:Selection",
+		// If we are not using stats on this column, the row count for the Selection will become 4.
+		" │ └─Selection 0.00 cop[tikv] not(isnull(test.t1.a))",
+		" │ └─TableFullScan 4.00 cop[tikv] table:t1 keep order:false",
+		" └─TableReader(Probe) 2.00 root data:TableFullScan",
+		" └─TableFullScan 2.00 cop[tikv] table:t2 keep order:false"))
+
+	res = tk.MustQuery("explain format = 'brief' select * from t1 right join t2 on t1.a=t2.a order by t1.a, t2.a")
+	res.Check(testkit.Rows(
+		"Sort 2.00 root test.t1.a, test.t2.a",
+		"└─HashJoin 2.00 root right outer join, equal:[eq(test.t1.a, test.t2.a)]",
+		" ├─TableReader(Build) 0.00 root data:Selection",
+		" │ └─Selection 0.00 cop[tikv] not(isnull(test.t1.a))",
+		" │ └─TableFullScan 4.00 cop[tikv] table:t1 keep order:false",
+		" └─TableReader(Probe) 2.00 root data:TableFullScan",
+		" └─TableFullScan 2.00 cop[tikv] table:t2 keep order:false"))
+
+	res = tk.MustQuery("explain format = 'brief' select * from t2 right join t1 on t1.a=t2.a order by t1.a, t2.a")
+	res.Check(testkit.Rows(
+		"Sort 4.00 root test.t1.a, test.t2.a",
+		"└─HashJoin 4.00 root right outer join, equal:[eq(test.t2.a, test.t1.a)]",
+		" ├─TableReader(Build) 0.00 root data:Selection",
+		" │ └─Selection 0.00 cop[tikv] not(isnull(test.t2.a))",
+		" │ └─TableFullScan 2.00 cop[tikv] table:t2 keep order:false",
+		" └─TableReader(Probe) 4.00 root data:TableFullScan",
+		" └─TableFullScan 4.00 cop[tikv] table:t1 keep order:false"))
+}
+
+// TestCrossValidationSelectivity analyzes (with tidb_analyze_version = 1) a two-row table whose clustered primary
+// key is (a, b) and checks the estimated row counts in the plan of a query that combines a range on the primary
+// key with a filter on the non-key column c.
+func TestCrossValidationSelectivity(t *testing.T) {
+	store, dom := testkit.CreateMockStoreAndDomain(t)
+	tk := testkit.NewTestKit(t, store)
+	h := dom.StatsHandle()
+	tk.MustExec("use test")
+	tk.MustExec("drop table if exists t")
+	tk.MustExec("set @@tidb_analyze_version = 1")
+	tk.MustExec("create table t (a int, b int, c int, primary key (a, b) clustered)")
+	// Consume the DDL event produced by the create table statement above.
+	require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
+	tk.MustExec("insert into t values (1,2,3), (1,4,5)")
+	require.NoError(t, h.DumpStatsDeltaToKV(handle.DumpAll))
+	tk.MustExec("analyze table t")
+	tk.MustQuery("explain format = 'brief' select * from t where a = 1 and b > 0 and b < 1000 and c > 1000").Check(testkit.Rows(
+		"TableReader 0.00 root data:Selection",
+		"└─Selection 0.00 cop[tikv] gt(test.t.c, 1000)",
+		" └─TableRangeScan 2.00 cop[tikv] table:t range:(1 0,1 1000), keep order:false"))
+}
+
+// TestShowHistogramsLoadStatus checks the value at position 10 of every `show stats_histograms` row (presumably the
+// load status, going by the test name) after analyzing with a one-second lease: rows named "a" (the primary key
+// column) and "idx" must report "allLoaded", all other rows must report "allEvicted".
+func TestShowHistogramsLoadStatus(t *testing.T) {
+	store, dom := testkit.CreateMockStoreAndDomain(t)
+	tk := testkit.NewTestKit(t, store)
+	h := dom.StatsHandle()
+	origLease := h.Lease()
+	h.SetLease(time.Second)
+	defer func() { h.SetLease(origLease) }()
+	tk.MustExec("use test")
+	tk.MustExec("create table t(a int primary key, b int, c int, index idx(b, c))")
+	require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
+	tk.MustExec("insert into t values (1,2,3), (4,5,6)")
+	require.NoError(t, h.DumpStatsDeltaToKV(handle.DumpAll))
+	tk.MustExec("analyze table t")
+	require.NoError(t, h.Update(dom.InfoSchema()))
+	rows := tk.MustQuery("show stats_histograms where db_name = 'test' and table_name = 't'").Rows()
+	for _, row := range rows {
+		if row[3] == "a" || row[3] == "idx" {
+			require.Equal(t, "allLoaded", row[10].(string))
+		} else {
+			require.Equal(t, "allEvicted", row[10].(string))
+		}
+	}
+}
+
+// TestSingleColumnIndexNDV checks the distinct count and NULL count that `show stats_histograms` reports for every
+// column and every single-column index of a table after analyze.
+func TestSingleColumnIndexNDV(t *testing.T) {
+	store, dom := testkit.CreateMockStoreAndDomain(t)
+	tk := testkit.NewTestKit(t, store)
+	h := dom.StatsHandle()
+	tk.MustExec("use test")
+	tk.MustExec("create table t(a int, b int, c varchar(20), d varchar(20), index idx_a(a), index idx_b(b), index idx_c(c), index idx_d(d))")
+	require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
+	tk.MustExec("insert into t values (1, 1, 'xxx', 'zzz'), (2, 2, 'yyy', 'zzz'), (1, 3, null, 'zzz')")
+	// Double the table five times: 3 rows become 96, so the single NULL in column c becomes 32 NULLs.
+	for i := 0; i < 5; i++ {
+		tk.MustExec("insert into t select * from t")
+	}
+	tk.MustExec("analyze table t")
+	rows := tk.MustQuery("show stats_histograms where db_name = 'test' and table_name = 't'").Sort().Rows()
+	// Each entry is {column_name, distinct_count, null_count}, listed in the order of the sorted result rows.
+	expectedResults := [][]string{
+		{"a", "2", "0"}, {"b", "3", "0"}, {"c", "2", "32"}, {"d", "1", "0"},
+		{"idx_a", "2", "0"}, {"idx_b", "3", "0"}, {"idx_c", "2", "32"}, {"idx_d", "1", "0"},
+	}
+	for i, row := range rows {
+		require.Equal(t, expectedResults[i][0], row[3]) // column_name
+		require.Equal(t, expectedResults[i][1], row[6]) // distinct_count
+		require.Equal(t, expectedResults[i][2], row[7]) // null_count
+	}
+}
+
+// TestColumnStatsLazyLoad checks that, with a non-zero lease, the stats of both columns of the table are reported
+// as all-evicted after analyze, and that they are still all-evicted after the table is analyzed a second time.
+func TestColumnStatsLazyLoad(t *testing.T) {
+	store, dom := testkit.CreateMockStoreAndDomain(t)
+	tk := testkit.NewTestKit(t, store)
+	h := dom.StatsHandle()
+	originLease := h.Lease()
+	defer h.SetLease(originLease)
+	// Set `Lease` to `Millisecond` to enable column stats lazy load.
+	h.SetLease(time.Millisecond)
+	tk.MustExec("use test")
+	tk.MustExec("create table t(a int, b int)")
+	tk.MustExec("insert into t values (1,2), (3,4), (5,6), (7,8)")
+	require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
+	tk.MustExec("analyze table t")
+	is := dom.InfoSchema()
+	tbl, err := is.TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
+	require.NoError(t, err)
+	tblInfo := tbl.Meta()
+	c1 := tblInfo.Columns[0]
+	c2 := tblInfo.Columns[1]
+	require.True(t, h.GetTableStats(tblInfo).Columns[c1.ID].IsAllEvicted())
+	require.True(t, h.GetTableStats(tblInfo).Columns[c2.ID].IsAllEvicted())
+	// Analyzing again must not cause the column stats to be loaded.
+	tk.MustExec("analyze table t")
+	require.True(t, h.GetTableStats(tblInfo).Columns[c1.ID].IsAllEvicted())
+	require.True(t, h.GetTableStats(tblInfo).Columns[c2.ID].IsAllEvicted())
+}
+
+// TestUpdateNotLoadIndexFMSketch checks that the FMSketch of an index in the partition-level stats of a partitioned
+// table is nil after analyze, and is still nil after the stats handle is cleared and updated again.
+func TestUpdateNotLoadIndexFMSketch(t *testing.T) {
+	store, dom := testkit.CreateMockStoreAndDomain(t)
+	tk := testkit.NewTestKit(t, store)
+	h := dom.StatsHandle()
+	tk.MustExec("use test")
+	tk.MustExec("create table t(a int, b int, index idx(a)) partition by range (a) (partition p0 values less than (10),partition p1 values less than maxvalue)")
+	tk.MustExec("insert into t values (1,2), (3,4), (5,6), (7,8)")
+	require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
+	tk.MustExec("analyze table t")
+	is := dom.InfoSchema()
+	tbl, err := is.TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
+	require.NoError(t, err)
+	tblInfo := tbl.Meta()
+	idxInfo := tblInfo.Indices[0]
+	p0 := tblInfo.Partition.Definitions[0]
+	p1 := tblInfo.Partition.Definitions[1]
+	require.Nil(t, h.GetPartitionStats(tblInfo, p0.ID).Indices[idxInfo.ID].FMSketch)
+	require.Nil(t, h.GetPartitionStats(tblInfo, p1.ID).Indices[idxInfo.ID].FMSketch)
+	// Drop what the handle holds and reload it through Update; the index FMSketch must still not be loaded.
+	h.Clear()
+	require.NoError(t, h.Update(is))
+	require.Nil(t, h.GetPartitionStats(tblInfo, p0.ID).Indices[idxInfo.ID].FMSketch)
+	require.Nil(t, h.GetPartitionStats(tblInfo, p1.ID).Indices[idxInfo.ID].FMSketch)
+}
+>>>>>>> cdab35847f8 (statistics: fix unnecessary
index fmsketch loading (#42074))
diff --git a/statistics/interact_with_storage.go b/statistics/interact_with_storage.go
new file mode 100644
index 0000000000000..8231b90dec5d2
--- /dev/null
+++ b/statistics/interact_with_storage.go
@@ -0,0 +1,484 @@
+// Copyright 2023 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package statistics
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strconv"
+	"time"
+
+	"github.com/pingcap/errors"
+	"github.com/pingcap/failpoint"
+	"github.com/pingcap/tidb/kv"
+	"github.com/pingcap/tidb/parser/ast"
+	"github.com/pingcap/tidb/parser/model"
+	"github.com/pingcap/tidb/parser/mysql"
+	"github.com/pingcap/tidb/sessionctx/stmtctx"
+	"github.com/pingcap/tidb/types"
+	"github.com/pingcap/tidb/util/chunk"
+	"github.com/pingcap/tidb/util/logutil"
+	"github.com/pingcap/tidb/util/mathutil"
+	"github.com/pingcap/tidb/util/sqlexec"
+	"go.uber.org/zap"
+)
+
+// StatsReader is used for simplifying code that needs to read statistics from system tables(mysql.stats_xxx) in different sqls
+// but requires the same transactions.
+//
+// Note that:
+// 1. Remember to call (*StatsReader).Close after reading all statistics.
+// 2. StatsReader is not thread-safe. Different goroutines cannot call (*StatsReader).Read concurrently.
+type StatsReader struct {
+	// ctx is the executor every statistics query is sent through.
+	ctx sqlexec.RestrictedSQLExecutor
+	// snapshot, when greater than zero, marks the reader as a history reader: queries are executed with
+	// ExecOptionUseSessionPool and ExecOptionWithSnapshot(snapshot). When it is zero, queries are executed
+	// with ExecOptionUseCurSession.
+	snapshot uint64
+}
+
+// GetStatsReader returns a StatsReader.
+//
+// A non-zero snapshot yields a history reader; no transaction is started for it. With a zero snapshot, "begin"
+// is executed on exec, which therefore must also implement sqlexec.SQLExecutor. A panic raised while starting
+// the transaction is recovered and reported through the returned error.
+func GetStatsReader(snapshot uint64, exec sqlexec.RestrictedSQLExecutor) (reader *StatsReader, err error) {
+	failpoint.Inject("mockGetStatsReaderFail", func(val failpoint.Value) {
+		if val.(bool) {
+			failpoint.Return(nil, errors.New("gofail genStatsReader error"))
+		}
+	})
+	if snapshot != 0 {
+		reader = &StatsReader{ctx: exec, snapshot: snapshot}
+		return reader, nil
+	}
+	// Everything below may panic (for example on the type assertion); turn that into an error for the caller.
+	defer func() {
+		recovered := recover()
+		if recovered == nil {
+			return
+		}
+		err = fmt.Errorf("getStatsReader panic %v", recovered)
+	}()
+	internalCtx := kv.WithInternalSourceType(context.Background(), kv.InternalTxnStats)
+	failpoint.Inject("mockGetStatsReaderPanic", nil)
+	sqlExec := exec.(sqlexec.SQLExecutor)
+	if _, err = sqlExec.ExecuteInternal(internalCtx, "begin"); err != nil {
+		return nil, err
+	}
+	return &StatsReader{ctx: exec}, nil
+}
+
+// Read runs one statistics query and returns its rows and result fields.
+// A history reader executes the query through the session pool at its snapshot; any other reader executes it
+// in the current session.
+func (sr *StatsReader) Read(sql string, args ...interface{}) (rows []chunk.Row, fields []*ast.ResultField, err error) {
+	opts := []sqlexec.OptionFuncAlias{sqlexec.ExecOptionUseCurSession}
+	if sr.snapshot != 0 {
+		opts = []sqlexec.OptionFuncAlias{sqlexec.ExecOptionUseSessionPool, sqlexec.ExecOptionWithSnapshot(sr.snapshot)}
+	}
+	internalCtx := kv.WithInternalSourceType(context.Background(), kv.InternalTxnStats)
+	return sr.ctx.ExecRestrictedSQL(internalCtx, opts, sql, args...)
+}
+
+// IsHistory reports whether the reader reads history statistics, i.e. whether it was created with a non-zero snapshot.
+func (sr *StatsReader) IsHistory() bool {
+	return sr.snapshot != 0
+}
+
+// Close finishes the reader by executing "commit".
+// It is a no-op for a history reader and for a reader without an executor.
+func (sr *StatsReader) Close() error {
+	if sr.ctx == nil || sr.IsHistory() {
+		return nil
+	}
+	internalCtx := kv.WithInternalSourceType(context.Background(), kv.InternalTxnStats)
+	sqlExec := sr.ctx.(sqlexec.SQLExecutor)
+	_, commitErr := sqlExec.ExecuteInternal(internalCtx, "commit")
+	return commitErr
+}
+
+// HistogramFromStorage reads histogram from storage.
+//
+// The buckets of histogram colID of table tableID are read from mysql.stats_buckets in bucket_id order. The
+// per-bucket counts found in storage are accumulated, so every bucket appended to the result carries the running
+// total. When isIndex is 1 the bounds are used as they are stored; otherwise they are converted to tp (or to a blob
+// type for string columns, see below). distinct, ver, nullCount, totColSize and corr are stored on the returned
+// histogram unchanged.
+func HistogramFromStorage(reader *StatsReader, tableID int64, colID int64, tp *types.FieldType, distinct int64, isIndex int, ver uint64, nullCount int64, totColSize int64, corr float64) (_ *Histogram, err error) {
+	rows, fields, err := reader.Read("select count, repeats, lower_bound, upper_bound, ndv from mysql.stats_buckets where table_id = %? and is_index = %? and hist_id = %? order by bucket_id", tableID, isIndex, colID)
+	if err != nil {
+		return nil, errors.Trace(err)
+	}
+	bucketSize := len(rows)
+	hg := NewHistogram(colID, distinct, nullCount, ver, tp, bucketSize, totColSize)
+	hg.Correlation = corr
+
+	// boundTp is the type the bounds of a column histogram are converted to. It is the same for every bucket, so
+	// it is decided once here instead of once per bucket, and the tp parameter itself is left untouched.
+	//
+	// For new collation data, when storing the bounds of the histogram, we store the collate key instead of the
+	// original value.
+	// But there's additional conversion logic for new collation data, and the collate key might be longer than
+	// the FieldType.flen.
+	// If we use the original FieldType here, there might be errors like "Invalid utf8mb4 character string"
+	// or "Data too long".
+	// So we change it to TypeBlob to bypass those logics here.
+	boundTp := tp
+	if isIndex != 1 && bucketSize > 0 &&
+		tp.EvalType() == types.ETString && tp.GetType() != mysql.TypeEnum && tp.GetType() != mysql.TypeSet {
+		boundTp = types.NewFieldType(mysql.TypeBlob)
+	}
+
+	totalCount := int64(0)
+	for i := 0; i < bucketSize; i++ {
+		count := rows[i].GetInt64(0)
+		repeats := rows[i].GetInt64(1)
+		var upperBound, lowerBound types.Datum
+		if isIndex == 1 {
+			lowerBound = rows[i].GetDatum(2, &fields[2].Column.FieldType)
+			upperBound = rows[i].GetDatum(3, &fields[3].Column.FieldType)
+		} else {
+			// Invalid date values may be inserted into table under some relaxed sql mode. Those values may exist in statistics.
+			// Hence, when reading statistics, we should skip invalid date check. See #39336.
+			sc := &stmtctx.StatementContext{TimeZone: time.UTC, AllowInvalidDate: true, IgnoreZeroInDate: true}
+			d := rows[i].GetDatum(2, &fields[2].Column.FieldType)
+			lowerBound, err = d.ConvertTo(sc, boundTp)
+			if err != nil {
+				return nil, errors.Trace(err)
+			}
+			d = rows[i].GetDatum(3, &fields[3].Column.FieldType)
+			upperBound, err = d.ConvertTo(sc, boundTp)
+			if err != nil {
+				return nil, errors.Trace(err)
+			}
+		}
+		// Storage keeps the count of each bucket; the in-memory histogram keeps the running total.
+		totalCount += count
+		hg.AppendBucketWithNDV(&lowerBound, &upperBound, totalCount, repeats, rows[i].GetInt64(4))
+	}
+	hg.PreCalculateScalar()
+	return hg, nil
+}
+
+// CMSketchAndTopNFromStorage reads CMSketch and TopN from storage.
+// The TopN rows come from mysql.stats_top_n and the encoded CMSketch from mysql.stats_histograms. When no
+// histogram row exists, only the TopN rows are decoded.
+func CMSketchAndTopNFromStorage(reader *StatsReader, tblID int64, isIndex, histID int64) (_ *CMSketch, _ *TopN, err error) {
+	topNRows, _, err := reader.Read("select HIGH_PRIORITY value, count from mysql.stats_top_n where table_id = %? and is_index = %? and hist_id = %?", tblID, isIndex, histID)
+	if err != nil {
+		return nil, nil, err
+	}
+	rows, _, err := reader.Read("select cm_sketch from mysql.stats_histograms where table_id = %? and is_index = %? and hist_id = %?", tblID, isIndex, histID)
+	if err != nil {
+		return nil, nil, err
+	}
+	if len(rows) == 0 {
+		return DecodeCMSketchAndTopN(nil, topNRows)
+	}
+	return DecodeCMSketchAndTopN(rows[0].GetBytes(0), topNRows)
+}
+
+// FMSketchFromStorage reads FMSketch from storage.
+// It returns (nil, nil) when mysql.stats_fm_sketch has no row for the given histogram.
+func FMSketchFromStorage(reader *StatsReader, tblID int64, isIndex, histID int64) (_ *FMSketch, err error) {
+	rows, _, err := reader.Read("select value from mysql.stats_fm_sketch where table_id = %? and is_index = %? and hist_id = %?", tblID, isIndex, histID)
+	if err != nil || len(rows) == 0 {
+		return nil, err
+	}
+	return DecodeFMSketch(rows[0].GetBytes(0))
+}
+
+// ColumnCountFromStorage reads column count from storage.
+// The result is the sum of the bucket counts of column colID, plus the sum of its TopN counts when statsVer is
+// at least Version2. NULL values are not included.
+func ColumnCountFromStorage(reader *StatsReader, tableID, colID, statsVer int64) (int64, error) {
+	count := int64(0)
+	rows, _, err := reader.Read("select sum(count) from mysql.stats_buckets where table_id = %? and is_index = 0 and hist_id = %?", tableID, colID)
+	if err != nil {
+		return 0, errors.Trace(err)
+	}
+	// If there doesn't exist any buckets, the SQL will return NULL. So we only use the result if it's not NULL.
+	// The length check only protects against an executor that returns no row at all for the aggregate.
+	if len(rows) > 0 && !rows[0].IsNull(0) {
+		count, err = rows[0].GetMyDecimal(0).ToInt()
+		if err != nil {
+			return 0, errors.Trace(err)
+		}
+	}
+
+	if statsVer >= Version2 {
+		// Before stats ver 2, histogram represents all data in this column.
+		// In stats ver 2, histogram + TopN represent all data in this column.
+		// So we need to add TopN total count here.
+		rows, _, err = reader.Read("select sum(count) from mysql.stats_top_n where table_id = %? and is_index = 0 and hist_id = %?", tableID, colID)
+		if err != nil {
+			return 0, errors.Trace(err)
+		}
+		if len(rows) > 0 && !rows[0].IsNull(0) {
+			topNCount, err := rows[0].GetMyDecimal(0).ToInt()
+			if err != nil {
+				return 0, errors.Trace(err)
+			}
+			count += topNCount
+		}
+	}
+	return count, nil
+}
+
+// ExtendedStatsFromStorage reads extended stats from storage.
+//
+// Only rows of mysql.stats_extended that are newer than the version already held by table are read, unless loadAll
+// is set or table has no extended stats yet, in which case the collection is rebuilt from scratch. Items whose
+// status is deleted or inited are removed from the collection, analyzed items are added or replaced.
+//
+// A failure to read the rows is logged and otherwise ignored: table is returned as it is. A row that cannot be
+// decoded makes the function return an error.
+func ExtendedStatsFromStorage(reader *StatsReader, table *Table, physicalID int64, loadAll bool) (*Table, error) {
+	failpoint.Inject("injectExtStatsLoadErr", func() {
+		failpoint.Return(nil, errors.New("gofail extendedStatsFromStorage error"))
+	})
+	lastVersion := uint64(0)
+	if table.ExtendedStats != nil && !loadAll {
+		lastVersion = table.ExtendedStats.LastUpdateVersion
+	} else {
+		table.ExtendedStats = NewExtendedStatsColl()
+	}
+	rows, _, err := reader.Read("select name, status, type, column_ids, stats, version from mysql.stats_extended where table_id = %? and status in (%?, %?, %?) and version > %?", physicalID, ExtendedStatsInited, ExtendedStatsAnalyzed, ExtendedStatsDeleted, lastVersion)
+	if err != nil {
+		// The error is still not returned to the caller, but it is no longer dropped without a trace.
+		logutil.BgLogger().Error("[stats] read extended stats failed", zap.Int64("tableID", physicalID), zap.Error(err))
+		return table, nil
+	}
+	if len(rows) == 0 {
+		return table, nil
+	}
+	for _, row := range rows {
+		lastVersion = mathutil.Max(lastVersion, row.GetUint64(5))
+		name := row.GetString(0)
+		status := uint8(row.GetInt64(1))
+		if status == ExtendedStatsDeleted || status == ExtendedStatsInited {
+			delete(table.ExtendedStats.Stats, name)
+		} else {
+			item := &ExtendedStatsItem{
+				Tp: uint8(row.GetInt64(2)),
+			}
+			colIDs := row.GetString(3)
+			err := json.Unmarshal([]byte(colIDs), &item.ColIDs)
+			if err != nil {
+				logutil.BgLogger().Error("[stats] decode column IDs failed", zap.String("column_ids", colIDs), zap.Error(err))
+				return nil, err
+			}
+			statsStr := row.GetString(4)
+			// Cardinality and correlation are stored as a single float; every other type keeps the raw string.
+			if item.Tp == ast.StatsTypeCardinality || item.Tp == ast.StatsTypeCorrelation {
+				if statsStr != "" {
+					item.ScalarVals, err = strconv.ParseFloat(statsStr, 64)
+					if err != nil {
+						logutil.BgLogger().Error("[stats] parse scalar stats failed", zap.String("stats", statsStr), zap.Error(err))
+						return nil, err
+					}
+				}
+			} else {
+				item.StringVals = statsStr
+			}
+			table.ExtendedStats.Stats[name] = item
+		}
+	}
+	table.ExtendedStats.LastUpdateVersion = lastVersion
+	return table, nil
+}
+
+// indexStatsFromStorage loads the statistics of one index into table.
+//
+// row is one row of the mysql.stats_histograms query issued by TableStatsFromStorage; the positions read here are
+// 2 (hist_id), 3 (distinct_count), 4 (version), 5 (null_count), 7 (stats_ver), 8 (flag) and 10 (last_analyze_pos).
+// The index is (re)loaded when table holds no stats for it, when the stored version is newer than the one held,
+// or when loadAll is set. The FMSketch is only loaded when loadAll is set.
+func indexStatsFromStorage(reader *StatsReader, row chunk.Row, table *Table, tableInfo *model.TableInfo, loadAll bool) error {
+	histID := row.GetInt64(2)
+	distinct := row.GetInt64(3)
+	histVer := row.GetUint64(4)
+	nullCount := row.GetInt64(5)
+	statsVer := row.GetInt64(7)
+	idx := table.Indices[histID]
+	errorRate := ErrorRate{}
+	flag := row.GetInt64(8)
+	lastAnalyzePos := row.GetDatum(10, types.NewFieldType(mysql.TypeBlob))
+	// Keep the error rate already held for this index unless the stored stats come from an analyze and this is
+	// not a history read.
+	if (!IsAnalyzed(flag) || reader.IsHistory()) && idx != nil {
+		errorRate = idx.ErrorRate
+	}
+	for _, idxInfo := range tableInfo.Indices {
+		if histID != idxInfo.ID {
+			continue
+		}
+		// loadAll forces a reload even when the held stats are current. Without it an index that was loaded
+		// earlier with loadAll == false would keep a nil FMSketch, although loadAll asks for all statistics.
+		// This matches the reload condition used for columns.
+		if idx == nil || idx.LastUpdateVersion < histVer || loadAll {
+			hg, err := HistogramFromStorage(reader, table.PhysicalID, histID, types.NewFieldType(mysql.TypeBlob), distinct, 1, histVer, nullCount, 0, 0)
+			if err != nil {
+				return errors.Trace(err)
+			}
+			cms, topN, err := CMSketchAndTopNFromStorage(reader, table.PhysicalID, 1, idxInfo.ID)
+			if err != nil {
+				return errors.Trace(err)
+			}
+			var fmSketch *FMSketch
+			if loadAll {
+				// FMSketch is only used when merging partition stats into global stats. When merging partition stats into global stats,
+				// we load all the statistics, i.e., loadAll is true.
+				fmSketch, err = FMSketchFromStorage(reader, table.PhysicalID, 1, histID)
+				if err != nil {
+					return errors.Trace(err)
+				}
+			}
+			idx = &Index{
+				Histogram:  *hg,
+				CMSketch:   cms,
+				TopN:       topN,
+				FMSketch:   fmSketch,
+				Info:       idxInfo,
+				ErrorRate:  errorRate,
+				StatsVer:   statsVer,
+				Flag:       flag,
+				PhysicalID: table.PhysicalID,
+			}
+			if statsVer != Version0 {
+				idx.StatsLoadedStatus = NewStatsFullLoadStatus()
+			}
+			lastAnalyzePos.Copy(&idx.LastAnalyzePos)
+		}
+		break
+	}
+	if idx != nil {
+		table.Indices[histID] = idx
+	} else {
+		logutil.BgLogger().Debug("we cannot find index id in table info. It may be deleted.", zap.Int64("indexID", histID), zap.String("table", tableInfo.Name.O))
+	}
+	return nil
+}
+
+// columnStatsFromStorage loads the statistics of one column into table.
+//
+// row is one row of the mysql.stats_histograms query issued by TableStatsFromStorage; the positions read here are
+// 2 (hist_id), 3 (distinct_count), 4 (version), 5 (null_count), 6 (tot_col_size), 7 (stats_ver), 8 (flag),
+// 9 (correlation) and 10 (last_analyze_pos).
+// Depending on lease, loadAll and what table already holds, the column is either registered with its row count
+// only (no buckets, TopN or CMSketch), fully loaded, or kept with just its total column size refreshed.
+func columnStatsFromStorage(reader *StatsReader, row chunk.Row, table *Table, tableInfo *model.TableInfo, loadAll bool, lease time.Duration) error {
+	histID := row.GetInt64(2)
+	distinct := row.GetInt64(3)
+	histVer := row.GetUint64(4)
+	nullCount := row.GetInt64(5)
+	totColSize := row.GetInt64(6)
+	statsVer := row.GetInt64(7)
+	correlation := row.GetFloat64(9)
+	lastAnalyzePos := row.GetDatum(10, types.NewFieldType(mysql.TypeBlob))
+	col := table.Columns[histID]
+	errorRate := ErrorRate{}
+	flag := row.GetInt64(8)
+	// Keep the error rate already held for this column unless the stored stats come from an analyze and this is
+	// not a history read.
+	if (!IsAnalyzed(flag) || reader.IsHistory()) && col != nil {
+		errorRate = col.ErrorRate
+	}
+	for _, colInfo := range tableInfo.Columns {
+		if histID != colInfo.ID {
+			continue
+		}
+		isHandle := tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag())
+		// We will not load buckets if:
+		// 1. lease > 0, and:
+		// 2. this column is not handle, and:
+		// 3. the column does not have any loaded statistics yet, and:
+		// 4. loadAll is false.
+		//
+		// Here is the explanation of the condition `!col.IsStatsInitialized() || col.IsAllEvicted()`.
+		// For one column:
+		// 1. If there is no stats for it in the storage(i.e., analyze has never been executed before), then its stats status
+		//    would be `!col.IsStatsInitialized()`. In this case we should go the `notNeedLoad` path.
+		// 2. If there exists stats for it in the storage but its stats status is `col.IsAllEvicted()`, there are two
+		//    sub cases for this case. One is that the column stats have never been used/needed by the optimizer so they have
+		//    never been loaded. The other is that the column stats were loaded and then evicted. For the both sub cases,
+		//    we should go the `notNeedLoad` path.
+		// 3. If some parts(Histogram/TopN/CMSketch) of stats for it exist in TiDB memory currently, we choose to load all of
+		//    its new stats once we find stats version is updated.
+		notNeedLoad := lease > 0 &&
+			!isHandle &&
+			(col == nil || ((!col.IsStatsInitialized() || col.IsAllEvicted()) && col.LastUpdateVersion < histVer)) &&
+			!loadAll
+		if notNeedLoad {
+			count, err := ColumnCountFromStorage(reader, table.PhysicalID, histID, statsVer)
+			if err != nil {
+				return errors.Trace(err)
+			}
+			col = &Column{
+				PhysicalID: table.PhysicalID,
+				Histogram:  *NewHistogram(histID, distinct, nullCount, histVer, &colInfo.FieldType, 0, totColSize),
+				Info:       colInfo,
+				Count:      count + nullCount,
+				ErrorRate:  errorRate,
+				IsHandle:   isHandle,
+				Flag:       flag,
+				StatsVer:   statsVer,
+			}
+			// When adding/modifying a column, we create its stats(all values are default values) without setting stats_ver.
+			// So we need add col.Count > 0 here.
+			if statsVer != Version0 || col.Count > 0 {
+				col.StatsLoadedStatus = NewStatsAllEvictedStatus()
+			}
+			lastAnalyzePos.Copy(&col.LastAnalyzePos)
+			col.Histogram.Correlation = correlation
+			break
+		}
+		if col == nil || col.LastUpdateVersion < histVer || loadAll {
+			hg, err := HistogramFromStorage(reader, table.PhysicalID, histID, &colInfo.FieldType, distinct, 0, histVer, nullCount, totColSize, correlation)
+			if err != nil {
+				return errors.Trace(err)
+			}
+			cms, topN, err := CMSketchAndTopNFromStorage(reader, table.PhysicalID, 0, colInfo.ID)
+			if err != nil {
+				return errors.Trace(err)
+			}
+			var fmSketch *FMSketch
+			if loadAll {
+				// FMSketch is only used when merging partition stats into global stats. When merging partition stats into global stats,
+				// we load all the statistics, i.e., loadAll is true.
+				fmSketch, err = FMSketchFromStorage(reader, table.PhysicalID, 0, histID)
+				if err != nil {
+					return errors.Trace(err)
+				}
+			}
+			col = &Column{
+				PhysicalID: table.PhysicalID,
+				Histogram:  *hg,
+				Info:       colInfo,
+				CMSketch:   cms,
+				TopN:       topN,
+				FMSketch:   fmSketch,
+				ErrorRate:  errorRate,
+				IsHandle:   isHandle,
+				Flag:       flag,
+				StatsVer:   statsVer,
+			}
+			// Column.Count is calculated by Column.TotalRowCount(). Hence we don't set Column.Count when initializing col.
+			col.Count = int64(col.TotalRowCount())
+			// When adding/modifying a column, we create its stats(all values are default values) without setting stats_ver.
+			// So we need add colHist.Count > 0 here.
+			if statsVer != Version0 || col.Count > 0 {
+				col.StatsLoadedStatus = NewStatsFullLoadStatus()
+			}
+			lastAnalyzePos.Copy(&col.LastAnalyzePos)
+			break
+		}
+		// The held stats are current. Only refresh the total column size, on a copy so that the Column other
+		// readers may hold is not modified.
+		if col.TotColSize != totColSize {
+			newCol := *col
+			newCol.TotColSize = totColSize
+			col = &newCol
+		}
+		break
+	}
+	if col != nil {
+		table.Columns[col.ID] = col
+	} else {
+		// If we didn't find a Column or Index in tableInfo, we won't load the histogram for it.
+		// But don't worry, next lease the ddl will be updated, and we will load a same table for two times to
+		// avoid error.
+		logutil.BgLogger().Debug("we cannot find column in table info now. It may be deleted", zap.Int64("colID", histID), zap.String("table", tableInfo.Name.O))
+	}
+	return nil
+}
+
+// TableStatsFromStorage loads table stats info from storage.
+//
+// table is the stats object currently held for the table, or nil. It is never modified: a copy is filled in, or
+// a fresh object when table is nil or reader is a history reader.
+//
+// The result is (nil, nil) when mysql.stats_meta or mysql.stats_histograms has no row for physicalID, which is
+// how a deleted table shows up. A failed read of either table is returned as an error, so that callers can tell
+// a storage failure from a deleted table.
+func TableStatsFromStorage(reader *StatsReader, tableInfo *model.TableInfo, physicalID int64, loadAll bool, lease time.Duration, table *Table) (_ *Table, err error) {
+	// If table stats is pseudo, we also need to copy it, since we will use the column stats when
+	// the average error rate of it is small.
+	if table == nil || reader.IsHistory() {
+		histColl := HistColl{
+			PhysicalID:     physicalID,
+			HavePhysicalID: true,
+			Columns:        make(map[int64]*Column, len(tableInfo.Columns)),
+			Indices:        make(map[int64]*Index, len(tableInfo.Indices)),
+		}
+		table = &Table{
+			HistColl: histColl,
+		}
+	} else {
+		// We copy it before writing to avoid race.
+		table = table.Copy()
+	}
+	table.Pseudo = false
+
+	rows, _, err := reader.Read("select modify_count, count from mysql.stats_meta where table_id = %?", physicalID)
+	if err != nil || len(rows) == 0 {
+		return nil, err
+	}
+	table.ModifyCount = rows[0].GetInt64(0)
+	table.Count = rows[0].GetInt64(1)
+
+	rows, _, err = reader.Read("select table_id, is_index, hist_id, distinct_count, version, null_count, tot_col_size, stats_ver, flag, correlation, last_analyze_pos from mysql.stats_histograms where table_id = %?", physicalID)
+	if err != nil {
+		// A read failure must not be reported as "table deleted".
+		return nil, err
+	}
+	// Check deleted table.
+	if len(rows) == 0 {
+		return nil, nil
+	}
+	for _, row := range rows {
+		// Position 1 is is_index: a positive value marks an index histogram, anything else a column histogram.
+		if row.GetInt64(1) > 0 {
+			err = indexStatsFromStorage(reader, row, table, tableInfo, loadAll)
+		} else {
+			err = columnStatsFromStorage(reader, row, table, tableInfo, loadAll, lease)
+		}
+		if err != nil {
+			return nil, err
+		}
+	}
+	return ExtendedStatsFromStorage(reader, table, physicalID, loadAll)
+}