From 3bc0992225673c2e8c80288a5c619d78f8b928cd Mon Sep 17 00:00:00 2001 From: Michael Erickson Date: Wed, 11 Jan 2023 22:41:51 -0800 Subject: [PATCH] sql/stats: include partial stats in results of statsCache.GetTableStats We were not including partial stats in the list of table statistics returned by `statsCache.GetTableStats`. This was fine for the optimizer, which currently cannot use partial stats directly, but it was a problem for backup. We'd like to use partial stats directly in the optimizer eventually, so this commit goes ahead and adds them to the results of `GetTableStats`. The optimizer then must filter them out. To streamline this we add some helper functions. Finally, in an act of overzealous refactoring, this commit also changes `MergedStatistics` and `ForecastTableStatistics` to accept partial statistics and full statistics mixed together in the same input list. This simplifies the code that calls these functions. Fixes: #95056 Part of: #93983 Epic: CRDB-19449 Release note: None --- .../testdata/backup-restore/metadata | 5 +- pkg/sql/distsql_plan_stats.go | 25 ++++---- pkg/sql/opt/cat/table.go | 15 ++++- pkg/sql/opt/exec/execbuilder/relational.go | 24 ++++--- pkg/sql/opt/memo/statistics_builder.go | 13 ++-- pkg/sql/opt/testutils/testcat/BUILD.bazel | 1 - pkg/sql/opt/testutils/testcat/test_catalog.go | 18 +++++- pkg/sql/opt_catalog.go | 18 +++++- pkg/sql/show_stats.go | 63 ++++++++----------- pkg/sql/stats/automatic_stats.go | 15 +++-- pkg/sql/stats/automatic_stats_test.go | 8 +-- pkg/sql/stats/forecast.go | 44 +++++++------ pkg/sql/stats/json.go | 22 +++++++ pkg/sql/stats/merge.go | 24 +++---- pkg/sql/stats/merge_test.go | 50 +++++++++------ pkg/sql/stats/stats_cache.go | 43 ++++++++----- 16 files changed, 239 insertions(+), 149 deletions(-) diff --git a/pkg/ccl/backupccl/testdata/backup-restore/metadata b/pkg/ccl/backupccl/testdata/backup-restore/metadata index 2751f22beea3..ea883b877f57 100644 --- a/pkg/ccl/backupccl/testdata/backup-restore/metadata +++ b/pkg/ccl/backupccl/testdata/backup-restore/metadata @@ -53,9 +53,6 @@ exec-sql BACKUP DATABASE db1 INTO 'nodelocal://0/test/' ---- -# TODO(ssd): The expectation here is 6 stats rather than 7 because the -# 'partial' stat is not backed up even though it is persisted. I think -# we may want a different API for fetching statistics. query-sql SELECT json_array_length( @@ -72,4 +69,4 @@ SELECT -> 'statistics' ) ---- -6 +7 diff --git a/pkg/sql/distsql_plan_stats.go b/pkg/sql/distsql_plan_stats.go index 8d1b3d98bb3c..28154e70b0a1 100644 --- a/pkg/sql/distsql_plan_stats.go +++ b/pkg/sql/distsql_plan_stats.go @@ -223,19 +223,22 @@ func (dsp *DistSQLPlanner) createPartialStatsPlan( // column that is not partial and not forecasted. The first one we find will // be the latest due to the newest to oldest ordering property of the cache. for _, t := range tableStats { - // TODO (faizaanmadhani): Ideally, we don't want to verify that - // a statistic is forecasted or merged based on the name because - // someone could create a statistic named __forecast__ or __merged__. - // Update system.table_statistics to add an enum to indicate which - // type of statistic it is. - if len(t.ColumnIDs) == 1 && column.GetID() == t.ColumnIDs[0] && t.PartialPredicate == "" && - t.Name != jobspb.ForecastStatsName && - t.Name != jobspb.MergedStatsName { + if len(t.ColumnIDs) == 1 && column.GetID() == t.ColumnIDs[0] && + !t.IsPartial() && !t.IsMerged() && !t.IsForecast() { if t.HistogramData == nil || t.HistogramData.ColumnType == nil || len(t.Histogram) == 0 { - return nil, pgerror.Newf(pgcode.ObjectNotInPrerequisiteState, "the latest full statistic for column %s has no histogram", column.GetName()) + return nil, pgerror.Newf( + pgcode.ObjectNotInPrerequisiteState, + "the latest full statistic for column %s has no histogram", + column.GetName(), + ) } - if colinfo.ColumnTypeIsInvertedIndexable(column.GetType()) && t.HistogramData.ColumnType.Family() == types.BytesFamily { - return nil, pgerror.Newf(pgcode.ObjectNotInPrerequisiteState, "the latest full statistic histogram for column %s is an inverted index histogram", column.GetName()) + if colinfo.ColumnTypeIsInvertedIndexable(column.GetType()) && + t.HistogramData.ColumnType.Family() == types.BytesFamily { + return nil, pgerror.Newf( + pgcode.ObjectNotInPrerequisiteState, + "the latest full statistic histogram for column %s is an inverted index histogram", + column.GetName(), + ) } stat = t histogram = t.Histogram diff --git a/pkg/sql/opt/cat/table.go b/pkg/sql/opt/cat/table.go index 346963dab37f..6d62525c16d3 100644 --- a/pkg/sql/opt/cat/table.go +++ b/pkg/sql/opt/cat/table.go @@ -228,8 +228,21 @@ type TableStatistic interface { // inverted index histograms, this will always return types.Bytes. HistogramType() *types.T - // IsForecast returns true if this statistic is a forecast. + // IsPartial returns true if this statistic was collected with a where + // clause. (If the where clause was something like "WHERE 1 = 1" or "WHERE + // true" this could technically be a full statistic rather than a partial + // statistic, but this function does not check for this.) + IsPartial() bool + + // IsMerged returns true if this statistic was created by merging a partial + // and a full statistic. + IsMerged() bool + + // IsForecast returns true if this statistic was created by forecasting. IsForecast() bool + + // IsAuto returns true if this statistic was collected automatically. + IsAuto() bool } // HistogramBucket contains the data for a single histogram bucket. Note diff --git a/pkg/sql/opt/exec/execbuilder/relational.go b/pkg/sql/opt/exec/execbuilder/relational.go index cbb969fdb49a..d348b7595f64 100644 --- a/pkg/sql/opt/exec/execbuilder/relational.go +++ b/pkg/sql/opt/exec/execbuilder/relational.go @@ -403,12 +403,12 @@ func (b *Builder) maybeAnnotateWithEstimates(node exec.Node, e memo.RelExpr) { if scan, ok := e.(*memo.ScanExpr); ok { tab := b.mem.Metadata().Table(scan.Table) if tab.StatisticCount() > 0 { - // The first stat is the most recent one. + // The first stat is the most recent full one. var first int - if !b.evalCtx.SessionData().OptimizerUseForecasts { - for first < tab.StatisticCount() && tab.Statistic(first).IsForecast() { - first++ - } + for first < tab.StatisticCount() && + tab.Statistic(first).IsPartial() || + (tab.Statistic(first).IsForecast() && !b.evalCtx.SessionData().OptimizerUseForecasts) { + first++ } if first < tab.StatisticCount() { @@ -422,10 +422,10 @@ func (b *Builder) maybeAnnotateWithEstimates(node exec.Node, e memo.RelExpr) { val.Forecast = stat.IsForecast() if val.Forecast { val.ForecastAt = stat.CreatedAt() - // Find the first non-forecast stat. + // Find the first non-forecast full stat. for i := first + 1; i < tab.StatisticCount(); i++ { nextStat := tab.Statistic(i) - if !nextStat.IsForecast() { + if !nextStat.IsPartial() && !nextStat.IsForecast() { val.TableStatsCreatedAt = nextStat.CreatedAt() break } @@ -760,8 +760,11 @@ func (b *Builder) buildScan(scan *memo.ScanExpr) (execPlan, error) { b.TotalScanRows += stats.RowCount b.ScanCounts[exec.ScanWithStatsCount]++ - // The first stat is the most recent one. Check if it was a forecast. + // The first stat is the most recent full one. Check if it was a forecast. var first int + for first < tab.StatisticCount() && tab.Statistic(first).IsPartial() { + first++ + } if first < tab.StatisticCount() && tab.Statistic(first).IsForecast() { if b.evalCtx.SessionData().OptimizerUseForecasts { b.ScanCounts[exec.ScanWithStatsForecastCount]++ @@ -772,8 +775,9 @@ func (b *Builder) buildScan(scan *memo.ScanExpr) (execPlan, error) { b.NanosSinceStatsForecasted = nanosSinceStatsForecasted } } - // Find the first non-forecast stat. - for first < tab.StatisticCount() && tab.Statistic(first).IsForecast() { + // Find the first non-forecast full stat. + for first < tab.StatisticCount() && + (tab.Statistic(first).IsPartial() || tab.Statistic(first).IsForecast()) { first++ } } diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go index e301d3b8f33c..cdf94c2daf88 100644 --- a/pkg/sql/opt/memo/statistics_builder.go +++ b/pkg/sql/opt/memo/statistics_builder.go @@ -614,12 +614,12 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati // Make now and annotate the metadata table with it for next time. stats = &props.Statistics{} - // Find the most recent statistic. (Stats are ordered with most recent first.) + // Find the most recent full statistic. (Stats are ordered with most recent first.) var first int - if !sb.evalCtx.SessionData().OptimizerUseForecasts { - for first < tab.StatisticCount() && tab.Statistic(first).IsForecast() { - first++ - } + for first < tab.StatisticCount() && + (tab.Statistic(first).IsPartial() || + tab.Statistic(first).IsForecast() && !sb.evalCtx.SessionData().OptimizerUseForecasts) { + first++ } if first >= tab.StatisticCount() { @@ -639,6 +639,9 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati // column set. Stats are ordered with most recent first. for i := first; i < tab.StatisticCount(); i++ { stat := tab.Statistic(i) + if stat.IsPartial() { + continue + } if stat.IsForecast() && !sb.evalCtx.SessionData().OptimizerUseForecasts { continue } diff --git a/pkg/sql/opt/testutils/testcat/BUILD.bazel b/pkg/sql/opt/testutils/testcat/BUILD.bazel index d1424d4c190d..cd3380cd4ed0 100644 --- a/pkg/sql/opt/testutils/testcat/BUILD.bazel +++ b/pkg/sql/opt/testutils/testcat/BUILD.bazel @@ -23,7 +23,6 @@ go_library( deps = [ "//pkg/config/zonepb", "//pkg/geo/geoindex", - "//pkg/jobs/jobspb", "//pkg/roachpb", "//pkg/security/username", "//pkg/settings/cluster", diff --git a/pkg/sql/opt/testutils/testcat/test_catalog.go b/pkg/sql/opt/testutils/testcat/test_catalog.go index e533eca40e10..e3a090c30296 100644 --- a/pkg/sql/opt/testutils/testcat/test_catalog.go +++ b/pkg/sql/opt/testutils/testcat/test_catalog.go @@ -18,7 +18,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/config/zonepb" "github.com/cockroachdb/cockroach/pkg/geo/geoindex" - "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/security/username" "github.com/cockroachdb/cockroach/pkg/settings/cluster" @@ -1294,9 +1293,24 @@ func (ts *TableStat) HistogramType() *types.T { return ts.histogramType } +// IsPartial is part of the cat.TableStatistic interface. +func (ts *TableStat) IsPartial() bool { + return ts.js.IsPartial() +} + +// IsMerged is part of the cat.TableStatistic interface. +func (ts *TableStat) IsMerged() bool { + return ts.js.IsMerged() +} + // IsForecast is part of the cat.TableStatistic interface. func (ts *TableStat) IsForecast() bool { - return ts.js.Name == jobspb.ForecastStatsName + return ts.js.IsForecast() +} + +// IsAuto is part of the cat.TableStatistic interface. +func (ts *TableStat) IsAuto() bool { + return ts.js.IsAuto() } // TableStats is a slice of TableStat pointers. diff --git a/pkg/sql/opt_catalog.go b/pkg/sql/opt_catalog.go index d5c98613e7fc..271b3a5b7627 100644 --- a/pkg/sql/opt_catalog.go +++ b/pkg/sql/opt_catalog.go @@ -17,7 +17,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/config" "github.com/cockroachdb/cockroach/pkg/geo/geoindex" - "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv" "github.com/cockroachdb/cockroach/pkg/roachpb" @@ -1773,9 +1772,24 @@ func (os *optTableStat) HistogramType() *types.T { return os.stat.HistogramData.ColumnType } +// IsPartial is part of the cat.TableStatistic interface. +func (os *optTableStat) IsPartial() bool { + return os.stat.IsPartial() +} + +// IsMerged is part of the cat.TableStatistic interface. +func (os *optTableStat) IsMerged() bool { + return os.stat.IsMerged() +} + // IsForecast is part of the cat.TableStatistic interface. func (os *optTableStat) IsForecast() bool { - return os.stat.Name == jobspb.ForecastStatsName + return os.stat.IsForecast() +} + +// IsAuto is part of the cat.TableStatistic interface. +func (os *optTableStat) IsAuto() bool { + return os.stat.IsAuto() } // optFamily is a wrapper around descpb.ColumnFamilyDescriptor that keeps a diff --git a/pkg/sql/show_stats.go b/pkg/sql/show_stats.go index d071a35aac2b..b45c949704fd 100644 --- a/pkg/sql/show_stats.go +++ b/pkg/sql/show_stats.go @@ -14,7 +14,6 @@ import ( "context" encjson "encoding/json" "fmt" - "sort" "github.com/cockroachdb/cockroach/pkg/clusterversion" "github.com/cockroachdb/cockroach/pkg/sql/catalog" @@ -194,10 +193,8 @@ func (p *planner) ShowTableStats(ctx context.Context, n *tree.ShowTableStats) (p _, withMerge := opts[showTableStatsOptMerge] _, withForecast := opts[showTableStatsOptForecast] - - obsFullStats := make([]*stats.TableStatistic, 0, len(rows)) - obsPartialStats := make([]*stats.TableStatistic, 0, len(rows)) if withMerge || withForecast { + statsList := make([]*stats.TableStatistic, 0, len(rows)) for _, row := range rows { // Skip stats on dropped columns. colIDs := row[columnIDsIdx].(*tree.DArray).Array @@ -215,48 +212,38 @@ func (p *planner) ShowTableStats(ctx context.Context, n *tree.ShowTableStats) (p return nil, err } } - if obs.PartialPredicate != "" { - obsPartialStats = append(obsPartialStats, obs) - } else { - obsFullStats = append(obsFullStats, obs) - } + statsList = append(statsList, obs) } - // Reverse the lists to sort by CreatedAt descending. - for i := 0; i < len(obsFullStats)/2; i++ { - j := len(obsFullStats) - i - 1 - obsFullStats[i], obsFullStats[j] = obsFullStats[j], obsFullStats[i] - } - for i := 0; i < len(obsPartialStats)/2; i++ { - j := len(obsPartialStats) - i - 1 - obsPartialStats[i], obsPartialStats[j] = obsPartialStats[j], obsPartialStats[i] + // Reverse the list to sort by CreatedAt descending. + for i := 0; i < len(statsList)/2; i++ { + j := len(statsList) - i - 1 + statsList[i], statsList[j] = statsList[j], statsList[i] } - } - if withMerge { - merged := stats.MergedStatistics(ctx, obsPartialStats, obsFullStats) - obsFullStats = append(obsFullStats, merged...) - sort.Slice(obsFullStats, func(i, j int) bool { - return obsFullStats[i].CreatedAt.After(obsFullStats[j].CreatedAt) - }) - for i := len(merged) - 1; i >= 0; i-- { - mergedRow, err := tableStatisticProtoToRow(&merged[i].TableStatisticProto, partialStatsVerActive) - if err != nil { - return nil, err + if withMerge { + merged := stats.MergedStatistics(ctx, statsList) + statsList = append(merged, statsList...) + // Iterate in reverse order to match the ORDER BY "columnIDs". + for i := len(merged) - 1; i >= 0; i-- { + mergedRow, err := tableStatisticProtoToRow(&merged[i].TableStatisticProto, partialStatsVerActive) + if err != nil { + return nil, err + } + rows = append(rows, mergedRow) } - rows = append(rows, mergedRow) } - } - if withForecast { - forecasts := stats.ForecastTableStatistics(ctx, obsFullStats) - // Iterate in reverse order to match the ORDER BY "columnIDs". - for i := len(forecasts) - 1; i >= 0; i-- { - forecastRow, err := tableStatisticProtoToRow(&forecasts[i].TableStatisticProto, partialStatsVerActive) - if err != nil { - return nil, err + if withForecast { + forecasts := stats.ForecastTableStatistics(ctx, statsList) + // Iterate in reverse order to match the ORDER BY "columnIDs". + for i := len(forecasts) - 1; i >= 0; i-- { + forecastRow, err := tableStatisticProtoToRow(&forecasts[i].TableStatisticProto, partialStatsVerActive) + if err != nil { + return nil, err + } + rows = append(rows, forecastRow) } - rows = append(rows, forecastRow) } } diff --git a/pkg/sql/stats/automatic_stats.go b/pkg/sql/stats/automatic_stats.go index 43056b5c32fd..2de9a1709c1a 100644 --- a/pkg/sql/stats/automatic_stats.go +++ b/pkg/sql/stats/automatic_stats.go @@ -706,7 +706,7 @@ func (r *Refresher) maybeRefreshStats( // refresh happens at exactly 2x the current average, and the average // refresh time is calculated from the most recent 4 refreshes. See the // comment in stats/delete_stats.go.) - maxTimeBetweenRefreshes := stat.CreatedAt.Add(2*avgRefreshTime(tableStats) + r.extraTime) + maxTimeBetweenRefreshes := stat.CreatedAt.Add(2*avgFullRefreshTime(tableStats) + r.extraTime) if timeutil.Now().After(maxTimeBetweenRefreshes) { mustRefresh = true } @@ -789,21 +789,20 @@ func mostRecentAutomaticStat(tableStats []*TableStatistic) *TableStatistic { return nil } -// avgRefreshTime returns the average time between automatic statistics +// avgFullRefreshTime returns the average time between automatic full statistics // refreshes given a list of tableStats from one table. It does so by finding -// the most recent automatically generated statistic (identified by the name -// AutoStatsName), and then finds all previously generated automatic stats on -// those same columns. The average is calculated as the average time between -// each consecutive stat. +// the most recent automatically generated statistic and then finds all +// previously generated automatic stats on those same columns. The average is +// calculated as the average time between each consecutive stat. // // If there are not at least two automatically generated statistics on the same // columns, the default value defaultAverageTimeBetweenRefreshes is returned. -func avgRefreshTime(tableStats []*TableStatistic) time.Duration { +func avgFullRefreshTime(tableStats []*TableStatistic) time.Duration { var reference *TableStatistic var sum time.Duration var count int for _, stat := range tableStats { - if stat.Name != jobspb.AutoStatsName { + if !stat.IsAuto() || stat.IsPartial() { continue } if reference == nil { diff --git a/pkg/sql/stats/automatic_stats_test.go b/pkg/sql/stats/automatic_stats_test.go index 1c06aced3a9b..46b758595f89 100644 --- a/pkg/sql/stats/automatic_stats_test.go +++ b/pkg/sql/stats/automatic_stats_test.go @@ -316,8 +316,8 @@ func TestAverageRefreshTime(t *testing.T) { if err != nil { return err } - if actual := avgRefreshTime(stats).Round(time.Minute); actual != expected { - return fmt.Errorf("expected avgRefreshTime %s but found %s", + if actual := avgFullRefreshTime(stats).Round(time.Minute); actual != expected { + return fmt.Errorf("expected avgFullRefreshTime %s but found %s", expected.String(), actual.String()) } return nil @@ -350,7 +350,7 @@ func TestAverageRefreshTime(t *testing.T) { }) } - // Since there are no stats yet, avgRefreshTime should return the default + // Since there are no stats yet, avgFullRefreshTime should return the default // value. if err := checkAverageRefreshTime(defaultAverageTimeBetweenRefreshes); err != nil { t.Fatal(err) @@ -410,7 +410,7 @@ func TestAverageRefreshTime(t *testing.T) { t.Fatal(err) } - // None of the stats have the name AutoStatsName, so avgRefreshTime + // None of the stats have the name AutoStatsName, so avgFullRefreshTime // should still return the default value. if err := checkAverageRefreshTime(defaultAverageTimeBetweenRefreshes); err != nil { t.Fatal(err) diff --git a/pkg/sql/stats/forecast.go b/pkg/sql/stats/forecast.go index 0089171995af..868da3e2e706 100644 --- a/pkg/sql/stats/forecast.go +++ b/pkg/sql/stats/forecast.go @@ -48,7 +48,7 @@ const minGoodnessOfFit = 0.95 // the given observed statistics. The observed statistics must be ordered by // collection time descending, with the latest observed statistics first. The // observed statistics may be a mixture of statistics for different sets of -// columns. +// columns. Partial statistics will be skipped. // // Whether a forecast is produced for a set of columns depends on how well the // observed statistics for that set of columns fit a linear regression model. @@ -64,23 +64,15 @@ const minGoodnessOfFit = 0.95 // ForecastTableStatistics is deterministic: given the same observations it will // return the same forecasts. func ForecastTableStatistics(ctx context.Context, observed []*TableStatistic) []*TableStatistic { - // Early sanity check. We'll check this again in forecastColumnStatistics. - if len(observed) < minObservationsForForecast { - return nil - } - - // To make forecasts deterministic, we must choose a time to forecast at based - // on only the observed statistics. We choose the time of the latest - // statistics + the average time between automatic stats collections, which - // should be roughly when the next automatic stats collection will occur. - latest := observed[0].CreatedAt - at := latest.Add(avgRefreshTime(observed)) - - // Group observed statistics by column set, and remove statistics with - // inverted histograms. + // Group observed statistics by column set, skipping over partial statistics + // and statistics with inverted histograms. + var latest time.Time var forecastCols []string observedByCols := make(map[string][]*TableStatistic) for _, stat := range observed { + if stat.IsPartial() { + continue + } // We don't have a good way to detect inverted statistics right now, so skip // all statistics with histograms of type BYTES. This means we cannot // forecast statistics for normal BYTES columns. @@ -89,6 +81,9 @@ func ForecastTableStatistics(ctx context.Context, observed []*TableStatistic) [] stat.HistogramData.ColumnType.Family() == types.BytesFamily { continue } + if latest.IsZero() { + latest = stat.CreatedAt + } colKey := MakeSortedColStatKey(stat.ColumnIDs) obs, ok := observedByCols[colKey] if !ok { @@ -97,6 +92,16 @@ func ForecastTableStatistics(ctx context.Context, observed []*TableStatistic) [] observedByCols[colKey] = append(obs, stat) } + // To make forecasts deterministic, we must choose a time to forecast at based + // on only the observed statistics. We choose the time of the latest full + // statistics + the average time between automatic stats collections, which + // should be roughly when the next automatic stats collection will occur. + if latest.IsZero() { + // No suitable stats. + return nil + } + at := latest.Add(avgFullRefreshTime(observed)) + forecasts := make([]*TableStatistic, 0, len(forecastCols)) for _, colKey := range forecastCols { forecast, err := forecastColumnStatistics(ctx, observedByCols[colKey], at, minGoodnessOfFit) @@ -114,10 +119,11 @@ func ForecastTableStatistics(ctx context.Context, observed []*TableStatistic) [] // forecastColumnStatistics produces a statistics forecast at the given time, // based on the given observed statistics. The observed statistics must all be -// for the same set of columns, must not contain any inverted histograms, must -// have a single observation per collection time, and must be ordered by -// collection time descending with the latest observed statistics first. The -// given time to forecast at can be in the past, present, or future. +// for the same set of columns, must all be full statistics, must not contain +// any inverted histograms, must have a single observation per collection time, +// and must be ordered by collection time descending with the latest observed +// statistics first. The given time to forecast at can be in the past, present, +// or future. // // To create a forecast, we construct a linear regression model over time for // each statistic (row count, null count, distinct count, average row size, and diff --git a/pkg/sql/stats/json.go b/pkg/sql/stats/json.go index 09f8d0c9ab49..0959234add36 100644 --- a/pkg/sql/stats/json.go +++ b/pkg/sql/stats/json.go @@ -14,6 +14,7 @@ import ( "context" "fmt" + "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" "github.com/cockroachdb/cockroach/pkg/sql/parser" "github.com/cockroachdb/cockroach/pkg/sql/rowenc" "github.com/cockroachdb/cockroach/pkg/sql/rowenc/keyside" @@ -163,3 +164,24 @@ func (js *JSONStatistic) GetHistogram( } return h, nil } + +// IsPartial returns true if this statistic was collected with a where clause. +func (js *JSONStatistic) IsPartial() bool { + return js.PartialPredicate != "" +} + +// IsMerged returns true if this statistic was created by merging a partial and +// a full statistic. +func (js *JSONStatistic) IsMerged() bool { + return js.Name == jobspb.MergedStatsName +} + +// IsForecast returns true if this statistic was created by forecasting. +func (js *JSONStatistic) IsForecast() bool { + return js.Name == jobspb.ForecastStatsName +} + +// IsAuto returns true if this statistic was collected automatically. +func (js *JSONStatistic) IsAuto() bool { + return js.Name == jobspb.AutoStatsName +} diff --git a/pkg/sql/stats/merge.go b/pkg/sql/stats/merge.go index 6dced9c85868..678a8630c513 100644 --- a/pkg/sql/stats/merge.go +++ b/pkg/sql/stats/merge.go @@ -28,27 +28,29 @@ import ( // are the merged combinations of the latest full statistic and the latest partial // statistic for a specific column set. If merging a partial stat with the full // stat is not possible, we don't include that statistic as part of the resulting array. -func MergedStatistics( - ctx context.Context, partialStatsList []*TableStatistic, fullStatsList []*TableStatistic, -) []*TableStatistic { +func MergedStatistics(ctx context.Context, stats []*TableStatistic) []*TableStatistic { // Map the ColumnIDs to the latest full table statistic, // and map the keys to the number of columns in the set. // It relies on the fact that the first full statistic // is the latest. fullStatsMap := make(map[descpb.ColumnID]*TableStatistic) - for _, stat := range fullStatsList { - if len(stat.ColumnIDs) == 1 { - col := stat.ColumnIDs[0] - _, ok := fullStatsMap[col] - if !ok { - fullStatsMap[col] = stat - } + for _, stat := range stats { + if stat.IsPartial() || len(stat.ColumnIDs) != 1 { + continue + } + col := stat.ColumnIDs[0] + _, ok := fullStatsMap[col] + if !ok { + fullStatsMap[col] = stat } } mergedStats := make([]*TableStatistic, 0, len(fullStatsMap)) var seenCols intsets.Fast - for _, partialStat := range partialStatsList { + for _, partialStat := range stats { + if !partialStat.IsPartial() || len(partialStat.ColumnIDs) != 1 { + continue + } col := partialStat.ColumnIDs[0] if !seenCols.Contains(int(col)) { seenCols.Add(int(col)) diff --git a/pkg/sql/stats/merge_test.go b/pkg/sql/stats/merge_test.go index 4961b9b4efde..12bcadaa106c 100644 --- a/pkg/sql/stats/merge_test.go +++ b/pkg/sql/stats/merge_test.go @@ -12,7 +12,9 @@ package stats import ( "context" + "fmt" "reflect" + "sort" "strconv" "testing" @@ -342,31 +344,41 @@ func TestMergedStatistics(t *testing.T) { } for i, tc := range testCases { t.Run(strconv.Itoa(i), func(t *testing.T) { - fullStatList := make([]*TableStatistic, 0, len(tc.full)) - partialStatList := make([]*TableStatistic, 0, len(tc.full)) - expected := make([]*TableStatistic, 0, len(tc.expected)) + statsList := make([]*TableStatistic, 0, len(tc.full)+len(tc.partial)) + predicates := make(map[uint64]string, len(tc.full)) for _, s := range tc.full { - fullStatList = append(fullStatList, s.toTableStatistic("full", i, descpb.ColumnIDs{descpb.ColumnID(s.colID)}, uint64(s.colID) /* statID */, 0 /* fullStatID */)) + stats := s.toTableStatistic( + "full", i, descpb.ColumnIDs{descpb.ColumnID(s.colID)}, uint64(s.colID), /* statID */ + 0, /* fullStatID */ + ) + statsList = append(statsList, stats) + predicates[uint64(s.colID)] = fmt.Sprintf( + "(c IS NULL) OR ((c < %v:::FLOAT8) OR (c > %v:::FLOAT8))", + s.hist[0].UpperBound, s.hist[len(s.hist)-1].UpperBound, + ) } for _, s := range tc.partial { - partialStatList = append(partialStatList, s.toTableStatistic("partial", i, descpb.ColumnIDs{descpb.ColumnID(s.colID)}, 0 /* statID */, uint64(s.colID) /* fullStatID */)) + stats := s.toTableStatistic( + "partial", i, descpb.ColumnIDs{descpb.ColumnID(s.colID)}, 0, /* statID */ + uint64(s.colID), /* fullStatID */ + ) + stats.PartialPredicate = predicates[uint64(s.colID)] + statsList = append(statsList, stats) } + sort.Slice(statsList, func(i, j int) bool { + return statsList[i].CreatedAt.After(statsList[j].CreatedAt) + }) + expected := make([]*TableStatistic, 0, len(tc.expected)) for _, s := range tc.expected { - expected = append(expected, s.toTableStatistic("__merged__", i, descpb.ColumnIDs{descpb.ColumnID(s.colID)}, 0 /* statID */, 0 /* fullStatID */)) - } - observed := MergedStatistics(ctx, partialStatList, fullStatList) - observedStats := make(map[descpb.ColumnID]*TableStatistic) - for _, obs := range observed { - observedStats[obs.ColumnIDs[0]] = obs + stats := s.toTableStatistic( + "__merged__", i, descpb.ColumnIDs{descpb.ColumnID(s.colID)}, 0, /* statID */ + 0, /* fullStatID */ + ) + expected = append(expected, stats) } - for _, exp := range expected { - if obs, ok := observedStats[exp.ColumnIDs[0]]; ok { - if !reflect.DeepEqual(obs, exp) { - t.Errorf("test case %d incorrect merge\n%s\nexpected\n%s", i, obs, exp) - } - } else { - t.Errorf("test case %d, expected stats for %d not produced", i, exp.ColumnIDs[0]) - } + merged := MergedStatistics(ctx, statsList) + if !reflect.DeepEqual(merged, expected) { + t.Errorf("test case %d incorrect, merged:\n%s\nexpected:\n%s", i, merged, expected) } }) } diff --git a/pkg/sql/stats/stats_cache.go b/pkg/sql/stats/stats_cache.go index 4c7cd8ea6224..2d6f73d4d681 100644 --- a/pkg/sql/stats/stats_cache.go +++ b/pkg/sql/stats/stats_cache.go @@ -17,6 +17,7 @@ import ( "sync" "github.com/cockroachdb/cockroach/pkg/clusterversion" + "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangefeed" "github.com/cockroachdb/cockroach/pkg/roachpb" @@ -732,6 +733,27 @@ func (tabStat *TableStatistic) String() string { ) } +// IsPartial returns true if this statistic was collected with a where clause. +func (tsp *TableStatisticProto) IsPartial() bool { + return tsp.PartialPredicate != "" +} + +// IsMerged returns true if this statistic was created by merging a partial and +// a full statistic. +func (tsp *TableStatisticProto) IsMerged() bool { + return tsp.Name == jobspb.MergedStatsName +} + +// IsForecast returns true if this statistic was created by forecasting. +func (tsp *TableStatisticProto) IsForecast() bool { + return tsp.Name == jobspb.ForecastStatsName +} + +// IsAuto returns true if this statistic was collected automatically. +func (tsp *TableStatisticProto) IsAuto() bool { + return tsp.Name == jobspb.AutoStatsName +} + // getTableStatsFromDB retrieves the statistics in system.table_statistics // for the given table ID. // @@ -778,8 +800,7 @@ ORDER BY "createdAt" DESC, "columnIDs" DESC, "statisticID" DESC return nil, err } - var fullStatsList []*TableStatistic - var partialStatsList []*TableStatistic + var statsList []*TableStatistic var ok bool for ok, err = it.Next(ctx); ok; ok, err = it.Next(ctx) { stats, err := sc.parseStats(ctx, it.Cur(), partialStatisticsColumnsVerActive) @@ -787,11 +808,7 @@ ORDER BY "createdAt" DESC, "columnIDs" DESC, "statisticID" DESC log.Warningf(ctx, "could not decode statistic for table %d: %v", tableID, err) continue } - if stats.PartialPredicate != "" { - partialStatsList = append(partialStatsList, stats) - } else { - fullStatsList = append(fullStatsList, stats) - } + statsList = append(statsList, stats) } if err != nil { return nil, err @@ -799,15 +816,13 @@ ORDER BY "createdAt" DESC, "columnIDs" DESC, "statisticID" DESC // TODO(faizaanmadhani): Wrap merging behind a boolean so // that it can be turned off. - if len(partialStatsList) > 0 { - mergedStats := MergedStatistics(ctx, partialStatsList, fullStatsList) - fullStatsList = append(mergedStats, fullStatsList...) - } + merged := MergedStatistics(ctx, statsList) + statsList = append(merged, statsList...) if forecast { - forecasts := ForecastTableStatistics(ctx, fullStatsList) - fullStatsList = append(forecasts, fullStatsList...) + forecasts := ForecastTableStatistics(ctx, statsList) + statsList = append(forecasts, statsList...) } - return fullStatsList, nil + return statsList, nil }