Skip to content

Commit

Permalink
sql/stats: include partial stats in results of statsCache.GetTableStats
Browse files Browse the repository at this point in the history
We were not including partial stats in the list of table statistics
returned by `statsCache.GetTableStats`. This was fine for the optimizer,
which currently cannot use partial stats directly, but it was a problem
for backup.

We'd like to use partial stats directly in the optimizer eventually, so
this commit goes ahead and adds them to the results of `GetTableStats`.
The optimizer then must filter them out. To streamline this we add some
helper functions.

Finally, in an act of overzealous refactoring, this commit also changes
`MergedStatistics` and `ForecastTableStatistics` to accept partial
statistics and full statistics mixed together in the same input list.
This simplifies the code that calls these functions.

Fixes: #95056
Part of: #93983

Epic: CRDB-19449

Release note: None
  • Loading branch information
michae2 committed Jan 19, 2023
1 parent f5cac1e commit 3bc0992
Show file tree
Hide file tree
Showing 16 changed files with 239 additions and 149 deletions.
5 changes: 1 addition & 4 deletions pkg/ccl/backupccl/testdata/backup-restore/metadata
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,6 @@ exec-sql
BACKUP DATABASE db1 INTO 'nodelocal://0/test/'
----

# TODO(ssd): The expectation here is 6 stats rather than 7 because the
# 'partial' stat is not backed up even though it is persisted. I think
# we may want a different API for fetching statistics.
query-sql
SELECT
json_array_length(
Expand All @@ -72,4 +69,4 @@ SELECT
-> 'statistics'
)
----
6
7
25 changes: 14 additions & 11 deletions pkg/sql/distsql_plan_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -223,19 +223,22 @@ func (dsp *DistSQLPlanner) createPartialStatsPlan(
// column that is not partial and not forecasted. The first one we find will
// be the latest due to the newest to oldest ordering property of the cache.
for _, t := range tableStats {
// TODO (faizaanmadhani): Ideally, we don't want to verify that
// a statistic is forecasted or merged based on the name because
// someone could create a statistic named __forecast__ or __merged__.
// Update system.table_statistics to add an enum to indicate which
// type of statistic it is.
if len(t.ColumnIDs) == 1 && column.GetID() == t.ColumnIDs[0] && t.PartialPredicate == "" &&
t.Name != jobspb.ForecastStatsName &&
t.Name != jobspb.MergedStatsName {
if len(t.ColumnIDs) == 1 && column.GetID() == t.ColumnIDs[0] &&
!t.IsPartial() && !t.IsMerged() && !t.IsForecast() {
if t.HistogramData == nil || t.HistogramData.ColumnType == nil || len(t.Histogram) == 0 {
return nil, pgerror.Newf(pgcode.ObjectNotInPrerequisiteState, "the latest full statistic for column %s has no histogram", column.GetName())
return nil, pgerror.Newf(
pgcode.ObjectNotInPrerequisiteState,
"the latest full statistic for column %s has no histogram",
column.GetName(),
)
}
if colinfo.ColumnTypeIsInvertedIndexable(column.GetType()) && t.HistogramData.ColumnType.Family() == types.BytesFamily {
return nil, pgerror.Newf(pgcode.ObjectNotInPrerequisiteState, "the latest full statistic histogram for column %s is an inverted index histogram", column.GetName())
if colinfo.ColumnTypeIsInvertedIndexable(column.GetType()) &&
t.HistogramData.ColumnType.Family() == types.BytesFamily {
return nil, pgerror.Newf(
pgcode.ObjectNotInPrerequisiteState,
"the latest full statistic histogram for column %s is an inverted index histogram",
column.GetName(),
)
}
stat = t
histogram = t.Histogram
Expand Down
15 changes: 14 additions & 1 deletion pkg/sql/opt/cat/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,8 +228,21 @@ type TableStatistic interface {
// inverted index histograms, this will always return types.Bytes.
HistogramType() *types.T

// IsForecast returns true if this statistic is a forecast.
// IsPartial returns true if this statistic was collected with a where
// clause. (If the where clause was something like "WHERE 1 = 1" or "WHERE
// true" this could technically be a full statistic rather than a partial
// statistic, but this function does not check for this.)
IsPartial() bool

// IsMerged returns true if this statistic was created by merging a partial
// and a full statistic.
IsMerged() bool

// IsForecast returns true if this statistic was created by forecasting.
IsForecast() bool

// IsAuto returns true if this statistic was collected automatically.
IsAuto() bool
}

// HistogramBucket contains the data for a single histogram bucket. Note
Expand Down
24 changes: 14 additions & 10 deletions pkg/sql/opt/exec/execbuilder/relational.go
Original file line number Diff line number Diff line change
Expand Up @@ -403,12 +403,12 @@ func (b *Builder) maybeAnnotateWithEstimates(node exec.Node, e memo.RelExpr) {
if scan, ok := e.(*memo.ScanExpr); ok {
tab := b.mem.Metadata().Table(scan.Table)
if tab.StatisticCount() > 0 {
// The first stat is the most recent one.
// The first stat is the most recent full one.
var first int
if !b.evalCtx.SessionData().OptimizerUseForecasts {
for first < tab.StatisticCount() && tab.Statistic(first).IsForecast() {
first++
}
for first < tab.StatisticCount() &&
tab.Statistic(first).IsPartial() ||
(tab.Statistic(first).IsForecast() && !b.evalCtx.SessionData().OptimizerUseForecasts) {
first++
}

if first < tab.StatisticCount() {
Expand All @@ -422,10 +422,10 @@ func (b *Builder) maybeAnnotateWithEstimates(node exec.Node, e memo.RelExpr) {
val.Forecast = stat.IsForecast()
if val.Forecast {
val.ForecastAt = stat.CreatedAt()
// Find the first non-forecast stat.
// Find the first non-forecast full stat.
for i := first + 1; i < tab.StatisticCount(); i++ {
nextStat := tab.Statistic(i)
if !nextStat.IsForecast() {
if !nextStat.IsPartial() && !nextStat.IsForecast() {
val.TableStatsCreatedAt = nextStat.CreatedAt()
break
}
Expand Down Expand Up @@ -760,8 +760,11 @@ func (b *Builder) buildScan(scan *memo.ScanExpr) (execPlan, error) {
b.TotalScanRows += stats.RowCount
b.ScanCounts[exec.ScanWithStatsCount]++

// The first stat is the most recent one. Check if it was a forecast.
// The first stat is the most recent full one. Check if it was a forecast.
var first int
for first < tab.StatisticCount() && tab.Statistic(first).IsPartial() {
first++
}
if first < tab.StatisticCount() && tab.Statistic(first).IsForecast() {
if b.evalCtx.SessionData().OptimizerUseForecasts {
b.ScanCounts[exec.ScanWithStatsForecastCount]++
Expand All @@ -772,8 +775,9 @@ func (b *Builder) buildScan(scan *memo.ScanExpr) (execPlan, error) {
b.NanosSinceStatsForecasted = nanosSinceStatsForecasted
}
}
// Find the first non-forecast stat.
for first < tab.StatisticCount() && tab.Statistic(first).IsForecast() {
// Find the first non-forecast full stat.
for first < tab.StatisticCount() &&
(tab.Statistic(first).IsPartial() || tab.Statistic(first).IsForecast()) {
first++
}
}
Expand Down
13 changes: 8 additions & 5 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -614,12 +614,12 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
// Make now and annotate the metadata table with it for next time.
stats = &props.Statistics{}

// Find the most recent statistic. (Stats are ordered with most recent first.)
// Find the most recent full statistic. (Stats are ordered with most recent first.)
var first int
if !sb.evalCtx.SessionData().OptimizerUseForecasts {
for first < tab.StatisticCount() && tab.Statistic(first).IsForecast() {
first++
}
for first < tab.StatisticCount() &&
(tab.Statistic(first).IsPartial() ||
tab.Statistic(first).IsForecast() && !sb.evalCtx.SessionData().OptimizerUseForecasts) {
first++
}

if first >= tab.StatisticCount() {
Expand All @@ -639,6 +639,9 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
// column set. Stats are ordered with most recent first.
for i := first; i < tab.StatisticCount(); i++ {
stat := tab.Statistic(i)
if stat.IsPartial() {
continue
}
if stat.IsForecast() && !sb.evalCtx.SessionData().OptimizerUseForecasts {
continue
}
Expand Down
1 change: 0 additions & 1 deletion pkg/sql/opt/testutils/testcat/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ go_library(
deps = [
"//pkg/config/zonepb",
"//pkg/geo/geoindex",
"//pkg/jobs/jobspb",
"//pkg/roachpb",
"//pkg/security/username",
"//pkg/settings/cluster",
Expand Down
18 changes: 16 additions & 2 deletions pkg/sql/opt/testutils/testcat/test_catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import (

"github.com/cockroachdb/cockroach/pkg/config/zonepb"
"github.com/cockroachdb/cockroach/pkg/geo/geoindex"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/security/username"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
Expand Down Expand Up @@ -1294,9 +1293,24 @@ func (ts *TableStat) HistogramType() *types.T {
return ts.histogramType
}

// IsPartial is part of the cat.TableStatistic interface.
func (ts *TableStat) IsPartial() bool {
return ts.js.IsPartial()
}

// IsMerged is part of the cat.TableStatistic interface.
func (ts *TableStat) IsMerged() bool {
return ts.js.IsMerged()
}

// IsForecast is part of the cat.TableStatistic interface.
func (ts *TableStat) IsForecast() bool {
return ts.js.Name == jobspb.ForecastStatsName
return ts.js.IsForecast()
}

// IsAuto is part of the cat.TableStatistic interface.
func (ts *TableStat) IsAuto() bool {
return ts.js.IsAuto()
}

// TableStats is a slice of TableStat pointers.
Expand Down
18 changes: 16 additions & 2 deletions pkg/sql/opt_catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import (

"github.com/cockroachdb/cockroach/pkg/config"
"github.com/cockroachdb/cockroach/pkg/geo/geoindex"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/roachpb"
Expand Down Expand Up @@ -1773,9 +1772,24 @@ func (os *optTableStat) HistogramType() *types.T {
return os.stat.HistogramData.ColumnType
}

// IsPartial is part of the cat.TableStatistic interface.
func (os *optTableStat) IsPartial() bool {
return os.stat.IsPartial()
}

// IsMerged is part of the cat.TableStatistic interface.
func (os *optTableStat) IsMerged() bool {
return os.stat.IsMerged()
}

// IsForecast is part of the cat.TableStatistic interface.
func (os *optTableStat) IsForecast() bool {
return os.stat.Name == jobspb.ForecastStatsName
return os.stat.IsForecast()
}

// IsAuto is part of the cat.TableStatistic interface.
func (os *optTableStat) IsAuto() bool {
return os.stat.IsAuto()
}

// optFamily is a wrapper around descpb.ColumnFamilyDescriptor that keeps a
Expand Down
63 changes: 25 additions & 38 deletions pkg/sql/show_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import (
"context"
encjson "encoding/json"
"fmt"
"sort"

"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/sql/catalog"
Expand Down Expand Up @@ -194,10 +193,8 @@ func (p *planner) ShowTableStats(ctx context.Context, n *tree.ShowTableStats) (p

_, withMerge := opts[showTableStatsOptMerge]
_, withForecast := opts[showTableStatsOptForecast]

obsFullStats := make([]*stats.TableStatistic, 0, len(rows))
obsPartialStats := make([]*stats.TableStatistic, 0, len(rows))
if withMerge || withForecast {
statsList := make([]*stats.TableStatistic, 0, len(rows))
for _, row := range rows {
// Skip stats on dropped columns.
colIDs := row[columnIDsIdx].(*tree.DArray).Array
Expand All @@ -215,48 +212,38 @@ func (p *planner) ShowTableStats(ctx context.Context, n *tree.ShowTableStats) (p
return nil, err
}
}
if obs.PartialPredicate != "" {
obsPartialStats = append(obsPartialStats, obs)
} else {
obsFullStats = append(obsFullStats, obs)
}
statsList = append(statsList, obs)
}

// Reverse the lists to sort by CreatedAt descending.
for i := 0; i < len(obsFullStats)/2; i++ {
j := len(obsFullStats) - i - 1
obsFullStats[i], obsFullStats[j] = obsFullStats[j], obsFullStats[i]
}
for i := 0; i < len(obsPartialStats)/2; i++ {
j := len(obsPartialStats) - i - 1
obsPartialStats[i], obsPartialStats[j] = obsPartialStats[j], obsPartialStats[i]
// Reverse the list to sort by CreatedAt descending.
for i := 0; i < len(statsList)/2; i++ {
j := len(statsList) - i - 1
statsList[i], statsList[j] = statsList[j], statsList[i]
}
}

if withMerge {
merged := stats.MergedStatistics(ctx, obsPartialStats, obsFullStats)
obsFullStats = append(obsFullStats, merged...)
sort.Slice(obsFullStats, func(i, j int) bool {
return obsFullStats[i].CreatedAt.After(obsFullStats[j].CreatedAt)
})
for i := len(merged) - 1; i >= 0; i-- {
mergedRow, err := tableStatisticProtoToRow(&merged[i].TableStatisticProto, partialStatsVerActive)
if err != nil {
return nil, err
if withMerge {
merged := stats.MergedStatistics(ctx, statsList)
statsList = append(merged, statsList...)
// Iterate in reverse order to match the ORDER BY "columnIDs".
for i := len(merged) - 1; i >= 0; i-- {
mergedRow, err := tableStatisticProtoToRow(&merged[i].TableStatisticProto, partialStatsVerActive)
if err != nil {
return nil, err
}
rows = append(rows, mergedRow)
}
rows = append(rows, mergedRow)
}
}

if withForecast {
forecasts := stats.ForecastTableStatistics(ctx, obsFullStats)
// Iterate in reverse order to match the ORDER BY "columnIDs".
for i := len(forecasts) - 1; i >= 0; i-- {
forecastRow, err := tableStatisticProtoToRow(&forecasts[i].TableStatisticProto, partialStatsVerActive)
if err != nil {
return nil, err
if withForecast {
forecasts := stats.ForecastTableStatistics(ctx, statsList)
// Iterate in reverse order to match the ORDER BY "columnIDs".
for i := len(forecasts) - 1; i >= 0; i-- {
forecastRow, err := tableStatisticProtoToRow(&forecasts[i].TableStatisticProto, partialStatsVerActive)
if err != nil {
return nil, err
}
rows = append(rows, forecastRow)
}
rows = append(rows, forecastRow)
}
}

Expand Down
15 changes: 7 additions & 8 deletions pkg/sql/stats/automatic_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,7 @@ func (r *Refresher) maybeRefreshStats(
// refresh happens at exactly 2x the current average, and the average
// refresh time is calculated from the most recent 4 refreshes. See the
// comment in stats/delete_stats.go.)
maxTimeBetweenRefreshes := stat.CreatedAt.Add(2*avgRefreshTime(tableStats) + r.extraTime)
maxTimeBetweenRefreshes := stat.CreatedAt.Add(2*avgFullRefreshTime(tableStats) + r.extraTime)
if timeutil.Now().After(maxTimeBetweenRefreshes) {
mustRefresh = true
}
Expand Down Expand Up @@ -789,21 +789,20 @@ func mostRecentAutomaticStat(tableStats []*TableStatistic) *TableStatistic {
return nil
}

// avgRefreshTime returns the average time between automatic statistics
// avgFullRefreshTime returns the average time between automatic full statistics
// refreshes given a list of tableStats from one table. It does so by finding
// the most recent automatically generated statistic (identified by the name
// AutoStatsName), and then finds all previously generated automatic stats on
// those same columns. The average is calculated as the average time between
// each consecutive stat.
// the most recent automatically generated statistic and then finds all
// previously generated automatic stats on those same columns. The average is
// calculated as the average time between each consecutive stat.
//
// If there are not at least two automatically generated statistics on the same
// columns, the default value defaultAverageTimeBetweenRefreshes is returned.
func avgRefreshTime(tableStats []*TableStatistic) time.Duration {
func avgFullRefreshTime(tableStats []*TableStatistic) time.Duration {
var reference *TableStatistic
var sum time.Duration
var count int
for _, stat := range tableStats {
if stat.Name != jobspb.AutoStatsName {
if !stat.IsAuto() || stat.IsPartial() {
continue
}
if reference == nil {
Expand Down
8 changes: 4 additions & 4 deletions pkg/sql/stats/automatic_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,8 +316,8 @@ func TestAverageRefreshTime(t *testing.T) {
if err != nil {
return err
}
if actual := avgRefreshTime(stats).Round(time.Minute); actual != expected {
return fmt.Errorf("expected avgRefreshTime %s but found %s",
if actual := avgFullRefreshTime(stats).Round(time.Minute); actual != expected {
return fmt.Errorf("expected avgFullRefreshTime %s but found %s",
expected.String(), actual.String())
}
return nil
Expand Down Expand Up @@ -350,7 +350,7 @@ func TestAverageRefreshTime(t *testing.T) {
})
}

// Since there are no stats yet, avgRefreshTime should return the default
// Since there are no stats yet, avgFullRefreshTime should return the default
// value.
if err := checkAverageRefreshTime(defaultAverageTimeBetweenRefreshes); err != nil {
t.Fatal(err)
Expand Down Expand Up @@ -410,7 +410,7 @@ func TestAverageRefreshTime(t *testing.T) {
t.Fatal(err)
}

// None of the stats have the name AutoStatsName, so avgRefreshTime
// None of the stats have the name AutoStatsName, so avgFullRefreshTime
// should still return the default value.
if err := checkAverageRefreshTime(defaultAverageTimeBetweenRefreshes); err != nil {
t.Fatal(err)
Expand Down
Loading

0 comments on commit 3bc0992

Please sign in to comment.