From b9e46d68b6b14c4e5a6d2a8046d65f437d86d8db Mon Sep 17 00:00:00 2001 From: Michael Erickson Date: Sun, 21 Apr 2024 23:09:14 -0700 Subject: [PATCH] sql/stats: turn forecasting constants into cluster settings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turn three constants used for statistics forecasting into cluster settings. (These must be cluster settings rather than session variables because forecasting happens when reloading table statistics in the stats cache, which can be independent of any session.) Informs: #119967 Release note (sql change): Introduce three new cluster settings for controlling table statistics forecasting: 1. `sql.stats.forecasts.min_observations` is the minimum number of observed statistics required to produce a forecast. 2. `sql.stats.forecasts.min_goodness_of_fit` is the minimum R² (goodness of fit) measurement required from all predictive models to use a forecast. 3. `sql.stats.forecasts.max_decrease` is the most a prediction can decrease, expressed as the minimum ratio of the prediction to the lowest prior observation. 
--- .../settings/settings-for-tenants.txt | 3 + docs/generated/settings/settings.html | 3 + .../opt/exec/execbuilder/testdata/forecast | 128 +++++++++++++++++- pkg/sql/stats/forecast.go | 42 ++++-- 4 files changed, 167 insertions(+), 9 deletions(-) diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt index 6b20721df9d9..1604d184afff 100644 --- a/docs/generated/settings/settings-for-tenants.txt +++ b/docs/generated/settings/settings-for-tenants.txt @@ -301,6 +301,9 @@ sql.stats.cleanup.recurrence string @hourly cron-tab recurrence for SQL Stats cl sql.stats.flush.enabled boolean true if set, SQL execution statistics are periodically flushed to disk application sql.stats.flush.interval duration 10m0s the interval at which SQL execution statistics are flushed to disk, this value must be less than or equal to 1 hour application sql.stats.forecasts.enabled boolean true when true, enables generation of statistics forecasts by default for all tables application +sql.stats.forecasts.max_decrease float 0.3333333333333333 the most a prediction is allowed to decrease, expressed as the minimum ratio of the prediction to the lowest prior observation application +sql.stats.forecasts.min_goodness_of_fit float 0.95 the minimum R² (goodness of fit) measurement required from all predictive models to use a forecast application +sql.stats.forecasts.min_observations integer 3 the minimum number of observed statistics required to produce a statistics forecast application sql.stats.histogram_buckets.count integer 200 maximum number of histogram buckets to build during table statistics collection application sql.stats.histogram_collection.enabled boolean true histogram collection mode application sql.stats.histogram_samples.count integer 10000 number of rows sampled for histogram construction during table statistics collection application diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html 
index 1104d5db0203..0dc8f1426250 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -252,6 +252,9 @@
sql.stats.flush.enabled
booleantrueif set, SQL execution statistics are periodically flushed to diskServerless/Dedicated/Self-Hosted
sql.stats.flush.interval
duration10m0sthe interval at which SQL execution statistics are flushed to disk, this value must be less than or equal to 1 hourServerless/Dedicated/Self-Hosted
sql.stats.forecasts.enabled
booleantruewhen true, enables generation of statistics forecasts by default for all tablesServerless/Dedicated/Self-Hosted +
sql.stats.forecasts.max_decrease
float0.3333333333333333the most a prediction is allowed to decrease, expressed as the minimum ratio of the prediction to the lowest prior observationServerless/Dedicated/Self-Hosted +
sql.stats.forecasts.min_goodness_of_fit
float0.95the minimum R² (goodness of fit) measurement required from all predictive models to use a forecastServerless/Dedicated/Self-Hosted +
sql.stats.forecasts.min_observations
integer3the minimum number of observed statistics required to produce a statistics forecastServerless/Dedicated/Self-Hosted
sql.stats.histogram_buckets.count
integer200maximum number of histogram buckets to build during table statistics collectionServerless/Dedicated/Self-Hosted
sql.stats.histogram_collection.enabled
booleantruehistogram collection modeServerless/Dedicated/Self-Hosted
sql.stats.histogram_samples.count
integer10000number of rows sampled for histogram construction during table statistics collectionServerless/Dedicated/Self-Hosted diff --git a/pkg/sql/opt/exec/execbuilder/testdata/forecast b/pkg/sql/opt/exec/execbuilder/testdata/forecast index 4fde961dc77d..437605fac31d 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/forecast +++ b/pkg/sql/opt/exec/execbuilder/testdata/forecast @@ -2367,6 +2367,132 @@ vectorized: true table: st@st_s_idx spans: [/0 - /9] -# Finally, restore forecasts setting to its previous value. +# Check the effect of various cluster settings. + +statement ok +CREATE TABLE v (v INT PRIMARY KEY) + +statement ok +ALTER TABLE v INJECT STATISTICS '[ + { + "avg_size": 1, + "columns": [ + "v" + ], + "created_at": "2023-04-18 00:00:00", + "distinct_count": 1, + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 300, + "row_count": 300 + } +]' + +query TTTIIII +SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size +FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST] +ORDER BY created +---- +__auto__ {v} 2023-04-18 00:00:00 +0000 UTC 300 1 300 1 + +statement ok +SET CLUSTER SETTING sql.stats.forecasts.min_observations = 1 + +query TTTIIII +SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size +FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST] +ORDER BY created +---- +__auto__ {v} 2023-04-18 00:00:00 +0000 UTC 300 1 300 1 +__forecast__ {v} 2023-04-18 12:00:00 +0000 UTC 300 1 300 0 + +statement ok +RESET CLUSTER SETTING sql.stats.forecasts.min_observations + +statement ok +ALTER TABLE v INJECT STATISTICS '[ + { + "avg_size": 1, + "columns": [ + "v" + ], + "created_at": "2023-04-18 00:00:00", + "distinct_count": 1, + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 300, + "row_count": 300 + }, + { + "avg_size": 1, + "columns": [ + "v" + ], + "created_at": "2023-04-19 00:00:00", + "distinct_count": 1, + 
"histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 220, + "row_count": 220 + }, + { + "avg_size": 1, + "columns": [ + "v" + ], + "created_at": "2023-04-20 00:00:00", + "distinct_count": 1, + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 30, + "row_count": 30 + } +]' + +query TTTIIII +SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size +FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST] +ORDER BY created +---- +__auto__ {v} 2023-04-18 00:00:00 +0000 UTC 300 1 300 1 +__auto__ {v} 2023-04-19 00:00:00 +0000 UTC 220 1 220 1 +__auto__ {v} 2023-04-20 00:00:00 +0000 UTC 30 1 30 1 + +statement ok +SET CLUSTER SETTING sql.stats.forecasts.min_goodness_of_fit = 0.9 + +query TTTIIII +SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size +FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST] +ORDER BY created +---- +__auto__ {v} 2023-04-18 00:00:00 +0000 UTC 300 1 300 1 +__auto__ {v} 2023-04-19 00:00:00 +0000 UTC 220 1 220 1 +__auto__ {v} 2023-04-20 00:00:00 +0000 UTC 30 1 30 1 +__forecast__ {v} 2023-04-21 00:00:00 +0000 UTC 10 1 10 0 + +statement ok +SET CLUSTER SETTING sql.stats.forecasts.max_decrease = 0.1 + +query TTTIIII +SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size +FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST] +ORDER BY created +---- +__auto__ {v} 2023-04-18 00:00:00 +0000 UTC 300 1 300 1 +__auto__ {v} 2023-04-19 00:00:00 +0000 UTC 220 1 220 1 +__auto__ {v} 2023-04-20 00:00:00 +0000 UTC 30 1 30 1 +__forecast__ {v} 2023-04-21 00:00:00 +0000 UTC 3 1 3 0 + +statement ok +RESET CLUSTER SETTING sql.stats.forecasts.max_decrease + +statement ok +RESET CLUSTER SETTING sql.stats.forecasts.min_goodness_of_fit + statement ok SET CLUSTER SETTING sql.stats.forecasts.enabled = $forecastsEnabledPrev diff --git a/pkg/sql/stats/forecast.go b/pkg/sql/stats/forecast.go index 
a6d657f422b6..8030af383f84 100644 --- a/pkg/sql/stats/forecast.go +++ b/pkg/sql/stats/forecast.go @@ -47,11 +47,26 @@ var UseStatisticsForecasts = settings.RegisterBoolSetting( // required to produce a statistics forecast. Forecasts based on 1 or 2 // observations will always have R² = 1 (perfect goodness of fit) regardless of // the accuracy of the forecast. -const minObservationsForForecast = 3 +var minObservationsForForecast = settings.RegisterIntSetting( + settings.ApplicationLevel, + "sql.stats.forecasts.min_observations", + "the minimum number of observed statistics required to produce a statistics forecast", + 3, + settings.WithPublic, + settings.IntInRange(1, math.MaxInt), +) // minGoodnessOfFit is the minimum R² (goodness of fit) measurement all // predictive models in a forecast must have for us to use the forecast. -const minGoodnessOfFit = 0.95 +var minGoodnessOfFit = settings.RegisterFloatSetting( + settings.ApplicationLevel, + "sql.stats.forecasts.min_goodness_of_fit", + "the minimum R² (goodness of fit) measurement required from all predictive models to use a "+ + "forecast", + 0.95, + settings.WithPublic, + settings.Fraction, +) // maxDecrease is the minimum ratio of a prediction to the lowest prior // observation that we allow. Predictions falling below this will be clamped to @@ -61,7 +76,15 @@ const minGoodnessOfFit = 0.95 // // This happens to be the same as unknownFilterSelectivity, but there's not a // strong theoretical reason for it. -const maxDecrease = 1.0 / 3.0 +var maxDecrease = settings.RegisterFloatSetting( + settings.ApplicationLevel, + "sql.stats.forecasts.max_decrease", + "the most a prediction is allowed to decrease, expressed as the minimum ratio of the prediction "+ + "to the lowest prior observation", + 1.0/3.0, + settings.WithPublic, + settings.Fraction, +) // TODO(michae2): Consider whether we need a corresponding maxIncrease. 
@@ -127,7 +150,9 @@ func ForecastTableStatistics( latest := observedByCols[colKey][0].CreatedAt at := latest.Add(avgRefresh) - forecast, err := forecastColumnStatistics(ctx, st, observedByCols[colKey], at, minGoodnessOfFit) + forecast, err := forecastColumnStatistics( + ctx, st, observedByCols[colKey], at, minGoodnessOfFit.Get(&st.SV), + ) if err != nil { log.VEventf( ctx, 2, "could not forecast statistics for table %v columns %s: %v", @@ -165,7 +190,7 @@ func forecastColumnStatistics( at time.Time, minRequiredFit float64, ) (forecast *TableStatistic, err error) { - if len(observed) < minObservationsForForecast { + if len(observed) < int(minObservationsForForecast.Get(&st.SV)) { return nil, errors.New("not enough observations to forecast statistics") } @@ -217,7 +242,7 @@ func forecastColumnStatistics( // over-estimate counts, so we pick a very conservative lowerBound of the // prior lowest observation times maxDecrease to avoid prematurely // estimating zero rows for downward-trending statistics. - lowerBound := math.Round(slices.Min(y) * maxDecrease) + lowerBound := math.Round(slices.Min(y) * maxDecrease.Get(&st.SV)) lowerBound = math.Max(0, lowerBound) if yₙ < lowerBound { return lowerBound, nil @@ -293,7 +318,7 @@ func forecastColumnStatistics( // histogram. NOTE: If any of the observed histograms were for inverted // indexes this will produce an incorrect histogram. if observed[0].HistogramData != nil && observed[0].HistogramData.ColumnType != nil { - hist, err := predictHistogram(ctx, observed, forecastAt, minRequiredFit, nonNullRowCount) + hist, err := predictHistogram(ctx, st, observed, forecastAt, minRequiredFit, nonNullRowCount) if err != nil { // If we did not successfully predict a histogram then copy the latest // histogram so we can adjust it. @@ -379,6 +404,7 @@ func forecastColumnStatistics( // predictHistogram tries to predict the histogram at forecast time. 
func predictHistogram( ctx context.Context, + st *cluster.Settings, observed []*TableStatistic, forecastAt float64, minRequiredFit float64, @@ -424,7 +450,7 @@ func predictHistogram( quantiles = append(quantiles, q) } - if len(quantiles) < minObservationsForForecast { + if len(quantiles) < int(minObservationsForForecast.Get(&st.SV)) { return histogram{}, errors.New("not enough observations to forecast histogram") }