Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

statistics: fix some problem related to stats async load (#57723) #57775

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/statistics/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ go_test(
data = glob(["testdata/**"]),
embed = [":statistics"],
flaky = True,
shard_count = 37,
shard_count = 38,
deps = [
"//pkg/config",
"//pkg/meta/model",
Expand Down
10 changes: 10 additions & 0 deletions pkg/statistics/column.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,3 +260,13 @@ func (c *Column) StatsAvailable() bool {
// StatsVer, so we check NDV > 0 || NullCount > 0 for the case.
return c.IsAnalyzed() || c.NDV > 0 || c.NullCount > 0
}

// EmptyColumn returns a Column for colInfo that carries no collected statistics: its
// histogram is built by NewHistogram with every numeric argument set to zero.
// Callers may use it for pseudo estimation, or as a placeholder that stops further
// attempts to load statistics that do not exist.
//
// tid is stored as the column's PhysicalID. The column is marked as the handle only
// when pkIsHandle is true and colInfo carries the primary-key flag.
// colInfo must be non-nil; it is dereferenced unconditionally.
func EmptyColumn(tid int64, pkIsHandle bool, colInfo *model.ColumnInfo) *Column {
	emptyHist := NewHistogram(colInfo.ID, 0, 0, 0, &colInfo.FieldType, 0, 0)
	isHandle := pkIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag())

	col := new(Column)
	col.PhysicalID = tid
	col.Info = colInfo
	col.Histogram = *emptyHist
	col.IsHandle = isHandle
	return col
}
46 changes: 27 additions & 19 deletions pkg/statistics/handle/storage/read.go
Original file line number Diff line number Diff line change
Expand Up @@ -632,30 +632,38 @@ func CleanFakeItemsForShowHistInFlights(statsCache statstypes.StatsCache) int {
}

func loadNeededColumnHistograms(sctx sessionctx.Context, statsHandle statstypes.StatsHandle, col model.TableItemID, loadFMSketch bool, fullLoad bool) (err error) {
tbl, ok := statsHandle.Get(col.TableID)
statsTbl, ok := statsHandle.Get(col.TableID)
if !ok {
return nil
}

var colInfo *model.ColumnInfo
_, loadNeeded, analyzed := tbl.ColumnIsLoadNeeded(col.ID, true)
if !loadNeeded || !analyzed {
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
return nil
}

// Now, we cannot init the column info in the ColAndIdxExistenceMap when to disable lite-init-stats.
// so we have to get the column info from the domain.
is := sctx.GetDomainInfoSchema().(infoschema.InfoSchema)
tblInfo, ok := statsHandle.TableInfoByID(is, col.TableID)
tbl, ok := statsHandle.TableInfoByID(is, col.TableID)
if !ok {
return nil
}
colInfo = tblInfo.Meta().GetColumnByID(col.ID)
tblInfo := tbl.Meta()
colInfo := tblInfo.GetColumnByID(col.ID)
if colInfo == nil {
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
return nil
}

_, loadNeeded, analyzed := statsTbl.ColumnIsLoadNeeded(col.ID, true)
if !loadNeeded || !analyzed {
// If this column is not analyzed yet and we don't have it in memory.
// We create a fake one for the pseudo estimation.
// Otherwise, it will trigger the sync/async load again, even if the column has not been analyzed.
if loadNeeded && !analyzed {
fakeCol := statistics.EmptyColumn(tblInfo.ID, tblInfo.PKIsHandle, colInfo)
statsTbl.SetCol(col.ID, fakeCol)
statsHandle.UpdateStatsCache([]*statistics.Table{statsTbl}, nil)
}
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
return nil
}

hg, _, statsVer, _, err := HistMetaFromStorageWithHighPriority(sctx, &col, colInfo)
if hg == nil || err != nil {
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
Expand Down Expand Up @@ -690,29 +698,29 @@ func loadNeededColumnHistograms(sctx sessionctx.Context, statsHandle statstypes.
CMSketch: cms,
TopN: topN,
FMSketch: fms,
IsHandle: tblInfo.Meta().PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
IsHandle: tblInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
StatsVer: statsVer,
}
// Reload the latest stats cache, otherwise the `updateStatsCache` may fail with high probability, because functions
// like `GetPartitionStats` called in `fmSketchFromStorage` would have modified the stats cache already.
tbl, ok = statsHandle.Get(col.TableID)
statsTbl, ok = statsHandle.Get(col.TableID)
if !ok {
return nil
}
tbl = tbl.Copy()
statsTbl = statsTbl.Copy()
if colHist.StatsAvailable() {
if fullLoad {
colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
} else {
colHist.StatsLoadedStatus = statistics.NewStatsAllEvictedStatus()
}
tbl.LastAnalyzeVersion = max(tbl.LastAnalyzeVersion, colHist.LastUpdateVersion)
if statsVer != statistics.Version0 {
tbl.StatsVer = int(statsVer)
statsTbl.LastAnalyzeVersion = max(statsTbl.LastAnalyzeVersion, colHist.LastUpdateVersion)
statsTbl.StatsVer = int(statsVer)
}
}
tbl.SetCol(col.ID, colHist)
statsHandle.UpdateStatsCache([]*statistics.Table{tbl}, nil)
statsTbl.SetCol(col.ID, colHist)
statsHandle.UpdateStatsCache([]*statistics.Table{statsTbl}, nil)
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
if col.IsSyncLoadFailed {
logutil.BgLogger().Warn("Hist for column should already be loaded as sync but not found.",
Expand Down Expand Up @@ -771,9 +779,9 @@ func loadNeededIndexHistograms(sctx sessionctx.Context, is infoschema.InfoSchema
tbl = tbl.Copy()
if idxHist.StatsVer != statistics.Version0 {
tbl.StatsVer = int(idxHist.StatsVer)
tbl.LastAnalyzeVersion = max(tbl.LastAnalyzeVersion, idxHist.LastUpdateVersion)
}
tbl.SetIdx(idx.ID, idxHist)
tbl.LastAnalyzeVersion = max(tbl.LastAnalyzeVersion, idxHist.LastUpdateVersion)
statsHandle.UpdateStatsCache([]*statistics.Table{tbl}, nil)
if idx.IsSyncLoadFailed {
logutil.BgLogger().Warn("Hist for index should already be loaded as sync but not found.",
Expand Down
8 changes: 2 additions & 6 deletions pkg/statistics/handle/syncload/stats_syncload.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,13 +341,9 @@ func (s *statsSyncLoad) handleOneItemTask(task *statstypes.NeededItemTask) (err
}
// If this column is not analyzed yet and we don't have it in memory.
// We create a fake one for the pseudo estimation.
// Otherwise, it will trigger the sync/async load again, even if the column has not been analyzed.
if loadNeeded && !analyzed {
wrapper.col = &statistics.Column{
PhysicalID: item.TableID,
Info: wrapper.colInfo,
Histogram: *statistics.NewHistogram(item.ID, 0, 0, 0, &wrapper.colInfo.FieldType, 0, 0),
IsHandle: isPkIsHandle && mysql.HasPriKeyFlag(wrapper.colInfo.GetFlag()),
}
wrapper.col = statistics.EmptyColumn(item.TableID, isPkIsHandle, wrapper.colInfo)
s.updateCachedItem(tblInfo, item, wrapper.col, wrapper.idx, task.Item.FullLoad)
return nil
}
Expand Down
22 changes: 22 additions & 0 deletions pkg/statistics/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -590,3 +590,25 @@ func TestGlobalIndexWithAnalyzeVersion1AndHistoricalStats(t *testing.T) {
// Each analyze will only generate one record
tk.MustQuery(fmt.Sprintf("select count(*) from mysql.stats_history where table_id=%d", tblID)).Equal(testkit.Rows("10"))
}

// TestLastAnalyzeVersionNotChangedWithAsyncStatsLoad checks that running
// LoadNeededHistograms for a table that was never analyzed does not cause
// `show stats_meta` to report a last analyze time for that table.
func TestLastAnalyzeVersionNotChangedWithAsyncStatsLoad(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	tk := testkit.NewTestKit(t, store)
	statsHandle := dom.StatsHandle()

	// A sync-load wait of 0 is set so that the query below does not wait on a sync load
	// (NOTE(review): presumably this routes the request to the async path — confirm).
	tk.MustExec("set @@tidb_stats_load_sync_wait = 0;")
	tk.MustExec("use test")
	tk.MustExec("create table t(a int, b int);")
	require.NoError(t, statsHandle.HandleDDLEvent(<-statsHandle.DDLEventCh()))
	require.NoError(t, statsHandle.Update(context.Background(), dom.InfoSchema()))

	tk.MustExec("insert into t values (1, 1);")
	require.NoError(t, statsHandle.DumpStatsDeltaToKV(true))

	// Add a column after the stats were created; none of the columns is ever analyzed.
	tk.MustExec("alter table t add column c int default 1;")
	// Check the error here as well, consistent with the create-table event above,
	// so a failure while handling the DDL event is not silently ignored.
	require.NoError(t, statsHandle.HandleDDLEvent(<-statsHandle.DDLEventCh()))

	// Reference every column in a predicate, then run the histogram load.
	tk.MustExec("select * from t where a = 1 or b = 1 or c = 1;")
	require.NoError(t, statsHandle.LoadNeededHistograms(dom.InfoSchema()))

	result := tk.MustQuery("show stats_meta where table_name = 't'")
	require.Len(t, result.Rows(), 1)
	// The last analyze time.
	require.Equal(t, "<nil>", result.Rows()[0][6])
}
13 changes: 7 additions & 6 deletions pkg/statistics/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,7 @@ func (t *Table) GetStatsHealthy() (int64, bool) {
}

// ColumnIsLoadNeeded checks whether the column needs trigger the async/sync load.
// The Column should be visible in the table and really has analyzed statistics in the stroage.
// The Column should be visible in the table and really has analyzed statistics in the storage.
// Also, if the stats has been loaded into the memory, we also don't need to load it.
// We return the Column together with the checking result, to avoid accessing the map multiple times.
// The first bool is whether we need to load it into memory. The second bool is whether this column has stats in the system table or not.
Expand All @@ -820,23 +820,24 @@ func (t *Table) ColumnIsLoadNeeded(id int64, fullLoad bool) (*Column, bool, bool
return nil, false, false
}
// when we use non-lite init stats, it cannot init the stats for common columns.
// so we need to foce to load the stats.
// so we need to force to load the stats.
col, ok := t.columns[id]
if !ok {
return nil, true, true
}
hasAnalyzed := t.ColAndIdxExistenceMap.HasAnalyzed(id, false)

// If it's not analyzed yet.
// The real check condition: !ok && !hasAnalyzed.
// After this check, we will always have ok && hasAnalyzed.
if !hasAnalyzed {
return nil, false, false
}

// Restore the condition from the simplified form:
// 1. !ok && hasAnalyzed => need load
// 2. ok && hasAnalyzed && fullLoad && !col.IsFullLoad => need load
// 3. ok && hasAnalyzed && !fullLoad && !col.statsInitialized => need load
if !ok || (fullLoad && !col.IsFullLoad()) || (!fullLoad && !col.statsInitialized) {
// 1. ok && hasAnalyzed && fullLoad && !col.IsFullLoad => need load
// 2. ok && hasAnalyzed && !fullLoad && !col.statsInitialized => need load
if (fullLoad && !col.IsFullLoad()) || (!fullLoad && !col.statsInitialized) {
return col, true, true
}

Expand Down