Skip to content

Commit

Permalink
This is an automated cherry-pick of pingcap#57723
Browse files Browse the repository at this point in the history
Signed-off-by: ti-chi-bot <[email protected]>
  • Loading branch information
winoros authored and ti-chi-bot committed Nov 27, 2024
1 parent 5f4110d commit 2485a65
Show file tree
Hide file tree
Showing 6 changed files with 3,505 additions and 0 deletions.
113 changes: 113 additions & 0 deletions pkg/statistics/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")

go_library(
name = "statistics",
srcs = [
"analyze.go",
"analyze_jobs.go",
"builder.go",
"builder_ext_stats.go",
"cmsketch.go",
"cmsketch_util.go",
"column.go",
"debugtrace.go",
"estimate.go",
"fmsketch.go",
"histogram.go",
"index.go",
"row_sampler.go",
"sample.go",
"scalar.go",
"table.go",
],
importpath = "github.com/pingcap/tidb/pkg/statistics",
visibility = ["//visibility:public"],
deps = [
"//pkg/expression",
"//pkg/kv",
"//pkg/meta/model",
"//pkg/parser/ast",
"//pkg/parser/charset",
"//pkg/parser/mysql",
"//pkg/parser/terror",
"//pkg/planner/core/resolve",
"//pkg/planner/planctx",
"//pkg/planner/util/debugtrace",
"//pkg/sessionctx",
"//pkg/sessionctx/stmtctx",
"//pkg/sessionctx/variable",
"//pkg/statistics/asyncload",
"//pkg/statistics/handle/logutil",
"//pkg/tablecodec",
"//pkg/types",
"//pkg/util/chunk",
"//pkg/util/codec",
"//pkg/util/collate",
"//pkg/util/context",
"//pkg/util/dbterror",
"//pkg/util/fastrand",
"//pkg/util/hack",
"//pkg/util/intest",
"//pkg/util/logutil",
"//pkg/util/memory",
"//pkg/util/ranger",
"//pkg/util/sqlexec",
"@com_github_dolthub_swiss//:swiss",
"@com_github_pingcap_errors//:errors",
"@com_github_pingcap_failpoint//:failpoint",
"@com_github_pingcap_tipb//go-tipb",
"@com_github_twmb_murmur3//:murmur3",
"@org_golang_x_exp//maps",
"@org_uber_go_atomic//:atomic",
"@org_uber_go_zap//:zap",
],
)

go_test(
name = "statistics_test",
timeout = "short",
srcs = [
"bench_daily_test.go",
"builder_test.go",
"cmsketch_test.go",
"fmsketch_test.go",
"histogram_bench_test.go",
"histogram_test.go",
"integration_test.go",
"main_test.go",
"sample_test.go",
"scalar_test.go",
"statistics_test.go",
],
data = glob(["testdata/**"]),
embed = [":statistics"],
flaky = True,
shard_count = 38,
deps = [
"//pkg/config",
"//pkg/meta/model",
"//pkg/parser/model",
"//pkg/parser/mysql",
"//pkg/planner/core/resolve",
"//pkg/sessionctx",
"//pkg/sessionctx/stmtctx",
"//pkg/testkit",
"//pkg/testkit/analyzehelper",
"//pkg/testkit/testdata",
"//pkg/testkit/testmain",
"//pkg/testkit/testsetup",
"//pkg/types",
"//pkg/util/benchdaily",
"//pkg/util/chunk",
"//pkg/util/codec",
"//pkg/util/collate",
"//pkg/util/memory",
"//pkg/util/mock",
"//pkg/util/ranger",
"//pkg/util/sqlexec",
"@com_github_pingcap_errors//:errors",
"@com_github_pingcap_failpoint//:failpoint",
"@com_github_stretchr_testify//require",
"@org_uber_go_goleak//:goleak",
],
)
272 changes: 272 additions & 0 deletions pkg/statistics/column.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package statistics

import (
"github.com/pingcap/tidb/pkg/meta/model"
"github.com/pingcap/tidb/pkg/parser/mysql"
"github.com/pingcap/tidb/pkg/planner/planctx"
"github.com/pingcap/tidb/pkg/planner/util/debugtrace"
"github.com/pingcap/tidb/pkg/statistics/asyncload"
"github.com/pingcap/tidb/pkg/types"
"github.com/pingcap/tidb/pkg/util/chunk"
)

// Column represents a column histogram.
type Column struct {
LastAnalyzePos types.Datum
CMSketch *CMSketch
TopN *TopN
FMSketch *FMSketch
Info *model.ColumnInfo
Histogram

// StatsLoadedStatus indicates the status of column statistics
StatsLoadedStatus
// PhysicalID is the physical table id,
// or it could possibly be -1, which means "stats not available".
// The -1 case could happen in a pseudo stats table, and in this case, this stats should not trigger stats loading.
PhysicalID int64
Flag int64
StatsVer int64 // StatsVer is the version of the current stats, used to maintain compatibility

IsHandle bool
}

// Copy copies the column.
func (c *Column) Copy() *Column {
if c == nil {
return nil
}
nc := &Column{
PhysicalID: c.PhysicalID,
Flag: c.Flag,
StatsVer: c.StatsVer,
IsHandle: c.IsHandle,
}
c.LastAnalyzePos.Copy(&nc.LastAnalyzePos)
if c.CMSketch != nil {
nc.CMSketch = c.CMSketch.Copy()
}
if c.TopN != nil {
nc.TopN = c.TopN.Copy()
}
if c.FMSketch != nil {
nc.FMSketch = c.FMSketch.Copy()
}
if c.Info != nil {
nc.Info = c.Info.Clone()
}
nc.Histogram = *c.Histogram.Copy()
nc.StatsLoadedStatus = c.StatsLoadedStatus.Copy()
return nc
}

func (c *Column) String() string {
return c.Histogram.ToString(0)
}

// TotalRowCount returns the total count of this column.
func (c *Column) TotalRowCount() float64 {
if c.StatsVer >= Version2 {
return c.Histogram.TotalRowCount() + float64(c.TopN.TotalCount())
}
return c.Histogram.TotalRowCount()
}

// NotNullCount returns the count of this column which is not null.
func (c *Column) NotNullCount() float64 {
if c.StatsVer >= Version2 {
return c.Histogram.NotNullCount() + float64(c.TopN.TotalCount())
}
return c.Histogram.NotNullCount()
}

// GetIncreaseFactor get the increase factor to adjust the final estimated count when the table is modified.
func (c *Column) GetIncreaseFactor(realtimeRowCount int64) float64 {
columnCount := c.TotalRowCount()
if columnCount == 0 {
// avoid dividing by 0
return 1.0
}
return float64(realtimeRowCount) / columnCount
}

// MemoryUsage returns the total memory usage of Histogram, CMSketch, FMSketch in Column.
// We ignore the size of other metadata in Column
func (c *Column) MemoryUsage() CacheItemMemoryUsage {
var sum int64
columnMemUsage := &ColumnMemUsage{
ColumnID: c.Info.ID,
}
histogramMemUsage := c.Histogram.MemoryUsage()
columnMemUsage.HistogramMemUsage = histogramMemUsage
sum = histogramMemUsage
if c.CMSketch != nil {
cmSketchMemUsage := c.CMSketch.MemoryUsage()
columnMemUsage.CMSketchMemUsage = cmSketchMemUsage
sum += cmSketchMemUsage
}
if c.TopN != nil {
topnMemUsage := c.TopN.MemoryUsage()
columnMemUsage.TopNMemUsage = topnMemUsage
sum += topnMemUsage
}
if c.FMSketch != nil {
fmSketchMemUsage := c.FMSketch.MemoryUsage()
columnMemUsage.FMSketchMemUsage = fmSketchMemUsage
sum += fmSketchMemUsage
}
columnMemUsage.TotalMemUsage = sum
return columnMemUsage
}

// ColumnStatsIsInvalid checks if this column is invalid.
// If this column has histogram but not loaded yet,
// then we mark it as need histogram.
func ColumnStatsIsInvalid(colStats *Column, sctx planctx.PlanContext, histColl *HistColl, cid int64) (res bool) {
var totalCount float64
var ndv int64
var inValidForCollPseudo, essentialLoaded bool
if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace {
debugtrace.EnterContextCommon(sctx)
defer func() {
debugtrace.RecordAnyValuesWithNames(sctx,
"IsInvalid", res,
"InValidForCollPseudo", inValidForCollPseudo,
"TotalCount", totalCount,
"NDV", ndv,
"EssentialLoaded", essentialLoaded,
)
debugtrace.LeaveContextCommon(sctx)
}()
}
if sctx != nil {
stmtctx := sctx.GetSessionVars().StmtCtx
if (colStats == nil || !colStats.IsStatsInitialized() || colStats.IsLoadNeeded()) &&
stmtctx != nil &&
!histColl.CanNotTriggerLoad {
asyncload.AsyncLoadHistogramNeededItems.Insert(model.TableItemID{
TableID: histColl.PhysicalID,
ID: cid,
IsIndex: false,
IsSyncLoadFailed: sctx.GetSessionVars().StmtCtx.StatsLoad.Timeout > 0,
}, true)
}
}
if histColl.Pseudo {
inValidForCollPseudo = true
return true
}
if colStats == nil {
totalCount = -1
ndv = -1
essentialLoaded = false
return true
}
// In some cases, some statistics in column would be evicted
// For example: the cmsketch of the column might be evicted while the histogram and the topn are still exists
// In this case, we will think this column as valid due to we can still use the rest of the statistics to do optimize.
totalCount = colStats.TotalRowCount()
essentialLoaded = colStats.IsEssentialStatsLoaded()
ndv = colStats.Histogram.NDV
return totalCount == 0 || (!essentialLoaded && ndv > 0)
}

// ItemID implements TableCacheItem
func (c *Column) ItemID() int64 {
return c.Info.ID
}

// DropUnnecessaryData drops the unnecessary data for the column.
func (c *Column) DropUnnecessaryData() {
if c.StatsVer < Version2 {
c.CMSketch = nil
}
c.TopN = nil
c.Histogram.Bounds = chunk.NewChunkWithCapacity([]*types.FieldType{types.NewFieldType(mysql.TypeBlob)}, 0)
c.Histogram.Buckets = make([]Bucket, 0)
c.Histogram.Scalars = make([]scalar, 0)
c.evictedStatus = AllEvicted
}

// IsAllEvicted indicates whether all stats evicted
func (c *Column) IsAllEvicted() bool {
return c == nil || (c.statsInitialized && c.evictedStatus >= AllEvicted)
}

// GetEvictedStatus indicates the evicted status
func (c *Column) GetEvictedStatus() int {
return c.evictedStatus
}

// IsStatsInitialized indicates whether stats is initialized
func (c *Column) IsStatsInitialized() bool {
return c.statsInitialized
}

// GetStatsVer indicates the stats version
func (c *Column) GetStatsVer() int64 {
return c.StatsVer
}

// IsCMSExist indicates whether CMSketch exists
func (c *Column) IsCMSExist() bool {
return c.CMSketch != nil
}

// StatusToString gets the string info of StatsLoadedStatus
func (s StatsLoadedStatus) StatusToString() string {
if !s.statsInitialized {
return "unInitialized"
}
switch s.evictedStatus {
case AllLoaded:
return "allLoaded"
case AllEvicted:
return "allEvicted"
}
return "unknown"
}

// IsAnalyzed indicates whether the column is analyzed.
// The set of IsAnalyzed columns is a subset of the set of StatsAvailable columns.
func (c *Column) IsAnalyzed() bool {
return c.GetStatsVer() != Version0
}

// StatsAvailable indicates whether the column stats are collected.
// Note:
// 1. The function merely talks about whether the stats are collected, regardless of the stats loaded status.
// 2. The function is used to decide StatsLoadedStatus.statsInitialized when reading the column stats from storage.
// 3. There are two cases that StatsAvailable is true:
// a. IsAnalyzed is true.
// b. The column is newly-added/modified and its stats are generated according to the default value.
func (c *Column) StatsAvailable() bool {
// Typically, when the column is analyzed, StatsVer is set to Version1/Version2, so we check IsAnalyzed().
// However, when we add/modify a column, its stats are generated according to the default value without setting
// StatsVer, so we check NDV > 0 || NullCount > 0 for the case.
return c.IsAnalyzed() || c.NDV > 0 || c.NullCount > 0
}

// EmptyColumn creates an empty column object. It may be used for pseudo estimation or to stop loading unexisting stats.
func EmptyColumn(tid int64, pkIsHandle bool, colInfo *model.ColumnInfo) *Column {
return &Column{
PhysicalID: tid,
Info: colInfo,
Histogram: *NewHistogram(colInfo.ID, 0, 0, 0, &colInfo.FieldType, 0, 0),
IsHandle: pkIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
}
}
Loading

0 comments on commit 2485a65

Please sign in to comment.