Skip to content

Commit

Permalink
sql/stats: turn forecasting constants into cluster settings
Browse files Browse the repository at this point in the history
Turn three constants used for statistics forecasting into cluster
settings. (These must be cluster settings rather than session variables
because forecasting happens when reloading table statistics in the stats
cache, which can be independent of any session.)

Informs: #119967

Release note (sql change): Introduce three new cluster settings for
controlling table statistics forecasting:

1. `sql.stats.forecasts.min_observations` is the minimum number of
   observed statistics required to produce a forecast.

2. `sql.stats.forecasts.min_goodness_of_fit` is the minimum R² (goodness
   of fit) measurement required from all predictive models to use a
   forecast.

3. `sql.stats.forecasts.max_decrease` is the most a prediction can
   decrease, expressed as the minimum ratio of the prediction to the
   lowest prior observation.
  • Loading branch information
michae2 committed Apr 26, 2024
1 parent 78557f1 commit ae52031
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 10 deletions.
3 changes: 3 additions & 0 deletions docs/generated/settings/settings-for-tenants.txt
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,9 @@ sql.stats.cleanup.recurrence string @hourly cron-tab recurrence for SQL Stats cl
sql.stats.flush.enabled boolean true if set, SQL execution statistics are periodically flushed to disk
sql.stats.flush.interval duration 10m0s the interval at which SQL execution statistics are flushed to disk, this value must be less than or equal to 1 hour
sql.stats.forecasts.enabled boolean true when true, enables generation of statistics forecasts by default for all tables
sql.stats.forecasts.max_decrease float 0 the most a prediction is allowed to decrease, expressed as the minimum ratio of the prediction to the lowest prior observation
sql.stats.forecasts.min_goodness_of_fit float 0.95 the minimum R² (goodness of fit) measurement required from all predictive models to use a forecast
sql.stats.forecasts.min_observations integer 3 the minimum number of observed statistics required to produce a statistics forecast
sql.stats.histogram_buckets.count integer 200 maximum number of histogram buckets to build during table statistics collection
sql.stats.histogram_collection.enabled boolean true histogram collection mode
sql.stats.histogram_samples.count integer 10000 number of rows sampled for histogram construction during table statistics collection
Expand Down
3 changes: 3 additions & 0 deletions docs/generated/settings/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,9 @@
<tr><td><div id="setting-sql-stats-flush-enabled" class="anchored"><code>sql.stats.flush.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if set, SQL execution statistics are periodically flushed to disk</td></tr>
<tr><td><div id="setting-sql-stats-flush-interval" class="anchored"><code>sql.stats.flush.interval</code></div></td><td>duration</td><td><code>10m0s</code></td><td>the interval at which SQL execution statistics are flushed to disk, this value must be less than or equal to 1 hour</td></tr>
<tr><td><div id="setting-sql-stats-forecasts-enabled" class="anchored"><code>sql.stats.forecasts.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>when true, enables generation of statistics forecasts by default for all tables</td></tr>
<tr><td><div id="setting-sql-stats-forecasts-max-decrease" class="anchored"><code>sql.stats.forecasts.max_decrease</code></div></td><td>float</td><td><code>0</code></td><td>the most a prediction is allowed to decrease, expressed as the minimum ratio of the prediction to the lowest prior observation</td></tr>
<tr><td><div id="setting-sql-stats-forecasts-min-goodness-of-fit" class="anchored"><code>sql.stats.forecasts.min_goodness_of_fit</code></div></td><td>float</td><td><code>0.95</code></td><td>the minimum R² (goodness of fit) measurement required from all predictive models to use a forecast</td></tr>
<tr><td><div id="setting-sql-stats-forecasts-min-observations" class="anchored"><code>sql.stats.forecasts.min_observations</code></div></td><td>integer</td><td><code>3</code></td><td>the minimum number of observed statistics required to produce a statistics forecast</td></tr>
<tr><td><div id="setting-sql-stats-histogram-buckets-count" class="anchored"><code>sql.stats.histogram_buckets.count</code></div></td><td>integer</td><td><code>200</code></td><td>maximum number of histogram buckets to build during table statistics collection</td></tr>
<tr><td><div id="setting-sql-stats-histogram-collection-enabled" class="anchored"><code>sql.stats.histogram_collection.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>histogram collection mode</td></tr>
<tr><td><div id="setting-sql-stats-histogram-samples-count" class="anchored"><code>sql.stats.histogram_samples.count</code></div></td><td>integer</td><td><code>10000</code></td><td>number of rows sampled for histogram construction during table statistics collection</td></tr>
Expand Down
131 changes: 130 additions & 1 deletion pkg/sql/opt/exec/execbuilder/testdata/forecast
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ SHOW CLUSTER SETTING sql.stats.forecasts.enabled
statement ok
SET CLUSTER SETTING sql.stats.forecasts.enabled = true

statement ok
SET CLUSTER SETTING sql.stats.forecasts.max_decrease = 0.3333333333333333

statement ok
CREATE TABLE g (a INT PRIMARY KEY) WITH (sql_stats_automatic_collection_enabled = false)

Expand Down Expand Up @@ -2367,6 +2370,132 @@ vectorized: true
table: st@st_s_idx
spans: [/0 - /9]

# Finally, restore forecasts setting to its previous value.
# Check the effect of various cluster settings.

statement ok
CREATE TABLE v (v INT PRIMARY KEY)

statement ok
ALTER TABLE v INJECT STATISTICS '[
{
"avg_size": 1,
"columns": [
"v"
],
"created_at": "2023-04-18 00:00:00",
"distinct_count": 1,
"histo_col_type": "INT8",
"histo_version": 2,
"name": "__auto__",
"null_count": 300,
"row_count": 300
}
]'

query TTTIIII
SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size
FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST]
ORDER BY created
----
__auto__ {v} 2023-04-18 00:00:00 +0000 +0000 300 1 300 1

statement ok
SET CLUSTER SETTING sql.stats.forecasts.min_observations = 1

query TTTIIII
SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size
FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST]
ORDER BY created
----
__auto__ {v} 2023-04-18 00:00:00 +0000 +0000 300 1 300 1
__forecast__ {v} 2023-04-18 12:00:00 +0000 +0000 300 1 300 0

statement ok
RESET CLUSTER SETTING sql.stats.forecasts.min_observations

statement ok
ALTER TABLE v INJECT STATISTICS '[
{
"avg_size": 1,
"columns": [
"v"
],
"created_at": "2023-04-18 00:00:00",
"distinct_count": 1,
"histo_col_type": "INT8",
"histo_version": 2,
"name": "__auto__",
"null_count": 300,
"row_count": 300
},
{
"avg_size": 1,
"columns": [
"v"
],
"created_at": "2023-04-19 00:00:00",
"distinct_count": 1,
"histo_col_type": "INT8",
"histo_version": 2,
"name": "__auto__",
"null_count": 220,
"row_count": 220
},
{
"avg_size": 1,
"columns": [
"v"
],
"created_at": "2023-04-20 00:00:00",
"distinct_count": 1,
"histo_col_type": "INT8",
"histo_version": 2,
"name": "__auto__",
"null_count": 30,
"row_count": 30
}
]'

query TTTIIII
SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size
FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST]
ORDER BY created
----
__auto__ {v} 2023-04-18 00:00:00 +0000 +0000 300 1 300 1
__auto__ {v} 2023-04-19 00:00:00 +0000 +0000 220 1 220 1
__auto__ {v} 2023-04-20 00:00:00 +0000 +0000 30 1 30 1

statement ok
SET CLUSTER SETTING sql.stats.forecasts.min_goodness_of_fit = 0.9

query TTTIIII
SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size
FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST]
ORDER BY created
----
__auto__ {v} 2023-04-18 00:00:00 +0000 +0000 300 1 300 1
__auto__ {v} 2023-04-19 00:00:00 +0000 +0000 220 1 220 1
__auto__ {v} 2023-04-20 00:00:00 +0000 +0000 30 1 30 1
__forecast__ {v} 2023-04-21 00:00:00 +0000 +0000 10 1 10 0

statement ok
SET CLUSTER SETTING sql.stats.forecasts.max_decrease = 0.1

query TTTIIII
SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size
FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST]
ORDER BY created
----
__auto__ {v} 2023-04-18 00:00:00 +0000 +0000 300 1 300 1
__auto__ {v} 2023-04-19 00:00:00 +0000 +0000 220 1 220 1
__auto__ {v} 2023-04-20 00:00:00 +0000 +0000 30 1 30 1
__forecast__ {v} 2023-04-21 00:00:00 +0000 +0000 3 1 3 0

statement ok
RESET CLUSTER SETTING sql.stats.forecasts.max_decrease

statement ok
RESET CLUSTER SETTING sql.stats.forecasts.min_goodness_of_fit

statement ok
SET CLUSTER SETTING sql.stats.forecasts.enabled = $forecastsEnabledPrev
54 changes: 46 additions & 8 deletions pkg/sql/stats/forecast.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,34 @@ var UseStatisticsForecasts = settings.RegisterBoolSetting(
// required to produce a statistics forecast. Forecasts based on 1 or 2
// observations will always have R² = 1 (perfect goodness of fit) regardless of
// the accuracy of the forecast.
const minObservationsForForecast = 3
var minObservationsForForecast = settings.RegisterIntSetting(
	settings.TenantWritable,
	"sql.stats.forecasts.min_observations",
	"the minimum number of observed statistics required to produce a statistics forecast",
	3,
	// Validation: at least one observation is required, and the value must fit
	// in an int because callers convert the setting with int(...) before
	// comparing it against a slice length.
	func(v int64) error {
		if v < 1 || v > math.MaxInt {
			return errors.Errorf("expected value in range [%d, %d], got %d", 1, math.MaxInt, v)
		}
		return nil
	},
).WithPublic()

// minGoodnessOfFit is the minimum R² (goodness of fit) measurement all
// predictive models in a forecast must have for us to use the forecast.
const minGoodnessOfFit = 0.95
var minGoodnessOfFit = settings.RegisterFloatSetting(
	settings.TenantWritable,
	"sql.stats.forecasts.min_goodness_of_fit",
	"the minimum R² (goodness of fit) measurement required from all "+
		"predictive models to use a forecast",
	0.95,
	// The setting must lie within the closed interval [0, 1].
	func(fit float64) error {
		const lo, hi = 0.0, 1.0
		switch {
		case fit < lo, fit > hi:
			return errors.Errorf("expected value in range [%f, %f], got %f", lo, hi, fit)
		default:
			return nil
		}
	},
).WithPublic()

// maxDecrease is the minimum ratio of a prediction to the lowest prior
// observation that we allow. Predictions falling below this will be clamped to
Expand All @@ -58,7 +81,19 @@ const minGoodnessOfFit = 0.95
//
// This happens to be the same as unknownFilterSelectivity, but there's not a
// strong theoretical reason for it.
const maxDecrease = 1.0 / 3.0
var maxDecrease = settings.RegisterFloatSetting(
	settings.TenantWritable,
	"sql.stats.forecasts.max_decrease",
	"the most a prediction is allowed to decrease, expressed as the minimum "+
		"ratio of the prediction to the lowest prior observation",
	// With the default of 0 the computed lower bound is 0, so predictions are
	// only clamped at zero.
	0,
	// The ratio must lie within the closed interval [0, 1].
	func(ratio float64) error {
		const minRatio, maxRatio = 0.0, 1.0
		if tooSmall, tooLarge := ratio < minRatio, ratio > maxRatio; tooSmall || tooLarge {
			return errors.Errorf("expected value in range [%f, %f], got %f", minRatio, maxRatio, ratio)
		}
		return nil
	},
).WithPublic()

// TODO(michae2): Consider whether we need a corresponding maxIncrease.

Expand Down Expand Up @@ -124,7 +159,9 @@ func ForecastTableStatistics(
latest := observedByCols[colKey][0].CreatedAt
at := latest.Add(avgRefresh)

forecast, err := forecastColumnStatistics(ctx, sv, observedByCols[colKey], at, minGoodnessOfFit)
forecast, err := forecastColumnStatistics(
ctx, sv, observedByCols[colKey], at, minGoodnessOfFit.Get(sv),
)
if err != nil {
log.VEventf(
ctx, 2, "could not forecast statistics for table %v columns %s: %v",
Expand Down Expand Up @@ -162,7 +199,7 @@ func forecastColumnStatistics(
at time.Time,
minRequiredFit float64,
) (forecast *TableStatistic, err error) {
if len(observed) < minObservationsForForecast {
if len(observed) < int(minObservationsForForecast.Get(sv)) {
return nil, errors.New("not enough observations to forecast statistics")
}

Expand Down Expand Up @@ -220,7 +257,7 @@ func forecastColumnStatistics(
lowestObservation = yᵢ
}
}
lowerBound := math.Round(lowestObservation * maxDecrease)
lowerBound := math.Round(lowestObservation * maxDecrease.Get(sv))
lowerBound = math.Max(0, lowerBound)
if yₙ < lowerBound {
return lowerBound, nil
Expand Down Expand Up @@ -296,7 +333,7 @@ func forecastColumnStatistics(
// histogram. NOTE: If any of the observed histograms were for inverted
// indexes this will produce an incorrect histogram.
if observed[0].HistogramData != nil && observed[0].HistogramData.ColumnType != nil {
hist, err := predictHistogram(ctx, observed, forecastAt, minRequiredFit, nonNullRowCount)
hist, err := predictHistogram(ctx, sv, observed, forecastAt, minRequiredFit, nonNullRowCount)
if err != nil {
// If we did not successfully predict a histogram then copy the latest
// histogram so we can adjust it.
Expand Down Expand Up @@ -382,6 +419,7 @@ func forecastColumnStatistics(
// predictHistogram tries to predict the histogram at forecast time.
func predictHistogram(
ctx context.Context,
sv *settings.Values,
observed []*TableStatistic,
forecastAt float64,
minRequiredFit float64,
Expand Down Expand Up @@ -427,7 +465,7 @@ func predictHistogram(
quantiles = append(quantiles, q)
}

if len(quantiles) < minObservationsForForecast {
if len(quantiles) < int(minObservationsForForecast.Get(sv)) {
return histogram{}, errors.New("not enough observations to forecast histogram")
}

Expand Down
5 changes: 4 additions & 1 deletion pkg/sql/stats/forecast_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"time"

"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
"github.com/cockroachdb/cockroach/pkg/sql/sem/catid"
"github.com/cockroachdb/cockroach/pkg/sql/types"
Expand Down Expand Up @@ -605,6 +606,8 @@ func TestForecastColumnStatistics(t *testing.T) {
},
}
ctx := context.Background()
st := cluster.MakeTestingClusterSettings()
maxDecrease.Override(ctx, &st.SV, 1.0/3.0)
var fullStatID, partialStatID uint64
for i, tc := range testCases {
t.Run(strconv.Itoa(i), func(t *testing.T) {
Expand All @@ -617,7 +620,7 @@ func TestForecastColumnStatistics(t *testing.T) {
expected := tc.forecast.toTableStatistic(jobspb.ForecastStatsName, i, descpb.ColumnIDs{1}, fullStatID, partialStatID)
at := testStatTime(tc.at)

forecast, err := forecastColumnStatistics(ctx, nil /* sv */, observed, at, 1)
forecast, err := forecastColumnStatistics(ctx, &st.SV, observed, at, 1)
if err != nil {
if !tc.err {
t.Errorf("test case %d unexpected forecastColumnStatistics err: %v", i, err)
Expand Down

0 comments on commit ae52031

Please sign in to comment.