diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt index 4187cb80b5c9..68e149ae26fa 100644 --- a/docs/generated/settings/settings-for-tenants.txt +++ b/docs/generated/settings/settings-for-tenants.txt @@ -273,6 +273,9 @@ sql.stats.cleanup.recurrence string @hourly cron-tab recurrence for SQL Stats cl sql.stats.flush.enabled boolean true if set, SQL execution statistics are periodically flushed to disk sql.stats.flush.interval duration 10m0s the interval at which SQL execution statistics are flushed to disk, this value must be less than or equal to 1 hour sql.stats.forecasts.enabled boolean true when true, enables generation of statistics forecasts by default for all tables +sql.stats.forecasts.max_decrease float 0 the most a prediction is allowed to decrease, expressed as the minimum ratio of the prediction to the lowest prior observation +sql.stats.forecasts.min_goodness_of_fit float 0.95 the minimum R² (goodness of fit) measurement required from all predictive models to use a forecast +sql.stats.forecasts.min_observations integer 3 the mimimum number of observed statistics required to produce a statistics forecast sql.stats.histogram_buckets.count integer 200 maximum number of histogram buckets to build during table statistics collection sql.stats.histogram_collection.enabled boolean true histogram collection mode sql.stats.histogram_samples.count integer 10000 number of rows sampled for histogram construction during table statistics collection diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index dfde2e05655f..b3ff176bd2e8 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -225,6 +225,9 @@
sql.stats.flush.enabled
booleantrueif set, SQL execution statistics are periodically flushed to disk
sql.stats.flush.interval
duration10m0sthe interval at which SQL execution statistics are flushed to disk, this value must be less than or equal to 1 hour
sql.stats.forecasts.enabled
booleantruewhen true, enables generation of statistics forecasts by default for all tables +
sql.stats.forecasts.max_decrease
float0the most a prediction is allowed to decrease, expressed as the minimum ratio of the prediction to the lowest prior observation +
sql.stats.forecasts.min_goodness_of_fit
float0.95the minimum R² (goodness of fit) measurement required from all predictive models to use a forecast +
sql.stats.forecasts.min_observations
integer3the mimimum number of observed statistics required to produce a statistics forecast
sql.stats.histogram_buckets.count
integer200maximum number of histogram buckets to build during table statistics collection
sql.stats.histogram_collection.enabled
booleantruehistogram collection mode
sql.stats.histogram_samples.count
integer10000number of rows sampled for histogram construction during table statistics collection diff --git a/pkg/sql/opt/exec/execbuilder/testdata/forecast b/pkg/sql/opt/exec/execbuilder/testdata/forecast index d38d1168628e..c7be3ca47f63 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/forecast +++ b/pkg/sql/opt/exec/execbuilder/testdata/forecast @@ -12,6 +12,9 @@ SHOW CLUSTER SETTING sql.stats.forecasts.enabled statement ok SET CLUSTER SETTING sql.stats.forecasts.enabled = true +statement ok +SET CLUSTER SETTING sql.stats.forecasts.max_decrease = 0.3333333333333333 + statement ok CREATE TABLE g (a INT PRIMARY KEY) WITH (sql_stats_automatic_collection_enabled = false) @@ -378,7 +381,7 @@ ORDER BY created __auto__ {b} 1988-08-05 00:00:00 +0000 +0000 9 9 0 1 __auto__ {b} 1988-08-06 00:00:00 +0000 +0000 6 6 0 1 __auto__ {b} 1988-08-07 00:00:00 +0000 +0000 3 3 0 1 -__forecast__ {b} 1988-08-08 00:00:00 +0000 +0000 0 0 0 1 +__forecast__ {b} 1988-08-08 00:00:00 +0000 +0000 1 1 0 1 query T SELECT jsonb_pretty(stat) @@ -394,12 +397,32 @@ WHERE stat->>'name' = '__forecast__' "b" ], "created_at": "1988-08-08 00:00:00", - "distinct_count": 0, + "distinct_count": 1, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 0, + "num_range": 0, + "upper_bound": "-1" + }, + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "0" + }, + { + "distinct_range": 0, + "num_eq": 0, + "num_range": 0, + "upper_bound": "1" + } + ], "histo_col_type": "INT8", "histo_version": 2, "name": "__forecast__", "null_count": 0, - "row_count": 0 + "row_count": 1 } query T @@ -421,7 +444,8 @@ scan s ├── constraint: /1: [/0 - /99] ├── cardinality: [0 - 100] ├── stats: [rows=1, distinct(1)=1, null(1)=0] - │ histogram(1)= + │ histogram(1)= 0 1 0 0 + │ <--- 0 --- 1 ├── cost: 14.02 ├── key: (1) └── distribution: test @@ -1057,7 +1081,8 @@ scan s ├── columns: b:1 ├── constraint: /1: [ - /2] ├── stats: [rows=1, distinct(1)=1, null(1)=0] - │ histogram(1)= + │ histogram(1)= 0 1 0 0 + 
│ <--- 0 --- 1 ├── cost: 19.03 ├── key: (1) └── distribution: test @@ -1850,6 +1875,627 @@ vectorized: true table: t_103958@t_103958_pkey spans: FULL SCAN -# Finally, restore forecasts setting to its previous value. +# Test for issue #119967: check that we don't prematurely estimate 0 rows from a +# downward trend. + +statement ok +CREATE TABLE st (s int, t timestamptz, INDEX (s), INDEX (t)) +WITH (sql_stats_histogram_buckets_count=4) + +# Inject statistics from: +# INSERT INTO st SELECT generate_series(0, 99999), NULL; +# and then a steady rate of updates over time filling in column t: +# UPDATE st SET t = current_timestamp() WHERE s >= $1 AND s < $1 + 10 AND t IS NULL; +statement ok +ALTER TABLE st INJECT STATISTICS '[ + { + "avg_size": 4, + "columns": [ + "s" + ], + "created_at": "2024-04-09 16:00:00.000000", + "distinct_count": 100000, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "0" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "33333" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "66666" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "99999" + } + ], + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 0, + "row_count": 100000 + }, + { + "avg_size": 0, + "columns": [ + "t" + ], + "created_at": "2024-04-09 16:00:00.000000", + "distinct_count": 1, + "histo_col_type": "TIMESTAMPTZ", + "histo_version": 2, + "name": "__auto__", + "null_count": 100000, + "row_count": 100000 + }, + { + "avg_size": 9, + "columns": [ + "rowid" + ], + "created_at": "2024-04-09 16:00:00.000000", + "distinct_count": 100000, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "958826497540358145" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "958826498669838337" + }, + { + 
"distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "958826500705484801" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "958826503229538305" + } + ], + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 0, + "row_count": 100000 + }, + { + "avg_size": 4, + "columns": [ + "s" + ], + "created_at": "2024-04-09 17:00:00.000000", + "distinct_count": 100000, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "0" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "33333" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "66666" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "99999" + } + ], + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 0, + "row_count": 100000 + }, + { + "avg_size": 3, + "columns": [ + "t" + ], + "created_at": "2024-04-09 17:00:00.000000", + "distinct_count": 2501, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "2024-04-09 16:00:00.000000+00" + }, + { + "distinct_range": 832, + "num_eq": 10, + "num_range": 8320, + "upper_bound": "2024-04-09 16:20:00.000000+00" + }, + { + "distinct_range": 832, + "num_eq": 10, + "num_range": 8320, + "upper_bound": "2024-04-09 16:40:00.000000+00" + }, + { + "distinct_range": 832, + "num_eq": 10, + "num_range": 8320, + "upper_bound": "2024-04-09 17:00:00.000000+00" + } + ], + "histo_col_type": "TIMESTAMPTZ", + "histo_version": 2, + "name": "__auto__", + "null_count": 75000, + "row_count": 100000 + }, + { + "avg_size": 9, + "columns": [ + "rowid" + ], + "created_at": "2024-04-09 17:00:00.000000", + "distinct_count": 100000, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "958826497540358145" + }, + { + 
"distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "958826498669838337" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "958826500705484801" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "958826503229538305" + } + ], + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 0, + "row_count": 100000 + }, + { + "avg_size": 4, + "columns": [ + "s" + ], + "created_at": "2024-04-09 18:00:00.000000", + "distinct_count": 100000, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "0" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "33333" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "66666" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "99999" + } + ], + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 0, + "row_count": 100000 + }, + { + "avg_size": 6, + "columns": [ + "t" + ], + "created_at": "2024-04-09 18:00:00.000000", + "distinct_count": 5001, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "2024-04-09 16:00:00.000000+00" + }, + { + "distinct_range": 1666, + "num_eq": 10, + "num_range": 16660, + "upper_bound": "2024-04-09 16:40:00.000000+00" + }, + { + "distinct_range": 1665, + "num_eq": 10, + "num_range": 16650, + "upper_bound": "2024-04-09 17:20:00.000000+00" + }, + { + "distinct_range": 1665, + "num_eq": 10, + "num_range": 16650, + "upper_bound": "2024-04-09 18:00:00.000000+00" + } + ], + "histo_col_type": "TIMESTAMPTZ", + "histo_version": 2, + "name": "__auto__", + "null_count": 50000, + "row_count": 100000 + }, + { + "avg_size": 9, + "columns": [ + "rowid" + ], + "created_at": "2024-04-09 18:00:00.000000", + "distinct_count": 100000, + "histo_buckets": [ + { + 
"distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "958826497540358145" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "958826498669838337" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "958826500705484801" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "958826503229538305" + } + ], + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 0, + "row_count": 100000 + }, + { + "avg_size": 4, + "columns": [ + "s" + ], + "created_at": "2024-04-09 19:00:00.000000", + "distinct_count": 100000, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "0" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "33333" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "66666" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "99999" + } + ], + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 0, + "row_count": 100000 + }, + { + "avg_size": 9, + "columns": [ + "t" + ], + "created_at": "2024-04-09 19:00:00.000000", + "distinct_count": 7501, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "2024-04-09 16:00:00.000000+00" + }, + { + "distinct_range": 2499, + "num_eq": 10, + "num_range": 24990, + "upper_bound": "2024-04-09 17:00:00.000000+00" + }, + { + "distinct_range": 2499, + "num_eq": 10, + "num_range": 24990, + "upper_bound": "2024-04-09 18:00:00.000000+00" + }, + { + "distinct_range": 2498, + "num_eq": 10, + "num_range": 24980, + "upper_bound": "2024-04-09 19:00:00.000000+00" + } + ], + "histo_col_type": "TIMESTAMPTZ", + "histo_version": 2, + "name": "__auto__", + "null_count": 25000, + "row_count": 100000 + }, + { + "avg_size": 9, + "columns": [ + "rowid" + 
], + "created_at": "2024-04-09 19:00:00.000000", + "distinct_count": 100000, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "958826497540358145" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "958826498669838337" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "958826500705484801" + }, + { + "distinct_range": 33332, + "num_eq": 1, + "num_range": 33332, + "upper_bound": "958826503229538305" + } + ], + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 0, + "row_count": 100000 + } +]' + +query TTTIIII +SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size +FROM [SHOW STATISTICS FOR TABLE st WITH FORECAST] +WHERE column_names = ARRAY['t'] +ORDER BY created +---- +__auto__ {t} 2024-04-09 16:00:00 +0000 +0000 100000 1 100000 0 +__auto__ {t} 2024-04-09 17:00:00 +0000 +0000 100000 2501 75000 3 +__auto__ {t} 2024-04-09 18:00:00 +0000 +0000 100000 5001 50000 6 +__auto__ {t} 2024-04-09 19:00:00 +0000 +0000 100000 7501 25000 9 +__forecast__ {t} 2024-04-09 20:00:00 +0000 +0000 100000 10001 8333 12 + +query T +EXPLAIN UPDATE st SET t = '2024-04-09 19:05:00.000000' WHERE s >= 0 AND s < 10 AND t IS NULL +---- +distribution: local +vectorized: true +· +• update +│ table: st +│ set: t +│ auto commit +│ +└── • render + │ + └── • filter + │ estimated row count: 9 + │ filter: t IS NULL + │ + └── • index join + │ estimated row count: 10 + │ table: st@st_pkey + │ + └── • scan + estimated row count: 10 (0.01% of the table; stats collected <hidden> ago; using stats forecast) + table: st@st_s_idx + spans: [/0 - /9] + +# Check the effect of various cluster settings. 
+ +statement ok +CREATE TABLE v (v INT PRIMARY KEY) + +statement ok +ALTER TABLE v INJECT STATISTICS '[ + { + "avg_size": 1, + "columns": [ + "v" + ], + "created_at": "2023-04-18 00:00:00", + "distinct_count": 1, + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 300, + "row_count": 300 + } +]' + +query TTTIIII +SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size +FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST] +ORDER BY created +---- +__auto__ {v} 2023-04-18 00:00:00 +0000 +0000 300 1 300 1 + +statement ok +SET CLUSTER SETTING sql.stats.forecasts.min_observations = 1 + +query TTTIIII +SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size +FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST] +ORDER BY created +---- +__auto__ {v} 2023-04-18 00:00:00 +0000 +0000 300 1 300 1 +__forecast__ {v} 2023-04-18 12:00:00 +0000 +0000 300 1 300 0 + +statement ok +RESET CLUSTER SETTING sql.stats.forecasts.min_observations + +statement ok +ALTER TABLE v INJECT STATISTICS '[ + { + "avg_size": 1, + "columns": [ + "v" + ], + "created_at": "2023-04-18 00:00:00", + "distinct_count": 1, + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 300, + "row_count": 300 + }, + { + "avg_size": 1, + "columns": [ + "v" + ], + "created_at": "2023-04-19 00:00:00", + "distinct_count": 1, + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 220, + "row_count": 220 + }, + { + "avg_size": 1, + "columns": [ + "v" + ], + "created_at": "2023-04-20 00:00:00", + "distinct_count": 1, + "histo_col_type": "INT8", + "histo_version": 2, + "name": "__auto__", + "null_count": 30, + "row_count": 30 + } +]' + +query TTTIIII +SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size +FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST] +ORDER BY created +---- +__auto__ {v} 2023-04-18 00:00:00 +0000 
+0000 300 1 300 1 +__auto__ {v} 2023-04-19 00:00:00 +0000 +0000 220 1 220 1 +__auto__ {v} 2023-04-20 00:00:00 +0000 +0000 30 1 30 1 + +statement ok +SET CLUSTER SETTING sql.stats.forecasts.min_goodness_of_fit = 0.9 + +query TTTIIII +SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size +FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST] +ORDER BY created +---- +__auto__ {v} 2023-04-18 00:00:00 +0000 +0000 300 1 300 1 +__auto__ {v} 2023-04-19 00:00:00 +0000 +0000 220 1 220 1 +__auto__ {v} 2023-04-20 00:00:00 +0000 +0000 30 1 30 1 +__forecast__ {v} 2023-04-21 00:00:00 +0000 +0000 10 1 10 0 + +statement ok +SET CLUSTER SETTING sql.stats.forecasts.max_decrease = 0.1 + +query TTTIIII +SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size +FROM [SHOW STATISTICS FOR TABLE v WITH FORECAST] +ORDER BY created +---- +__auto__ {v} 2023-04-18 00:00:00 +0000 +0000 300 1 300 1 +__auto__ {v} 2023-04-19 00:00:00 +0000 +0000 220 1 220 1 +__auto__ {v} 2023-04-20 00:00:00 +0000 +0000 30 1 30 1 +__forecast__ {v} 2023-04-21 00:00:00 +0000 +0000 3 1 3 0 + +statement ok +RESET CLUSTER SETTING sql.stats.forecasts.max_decrease + +statement ok +RESET CLUSTER SETTING sql.stats.forecasts.min_goodness_of_fit + statement ok SET CLUSTER SETTING sql.stats.forecasts.enabled = $forecastsEnabledPrev diff --git a/pkg/sql/stats/forecast.go b/pkg/sql/stats/forecast.go index fd6cfae44323..6e836f32a183 100644 --- a/pkg/sql/stats/forecast.go +++ b/pkg/sql/stats/forecast.go @@ -44,11 +44,58 @@ var UseStatisticsForecasts = settings.RegisterBoolSetting( // required to produce a statistics forecast. Forecasts based on 1 or 2 // observations will always have R² = 1 (perfect goodness of fit) regardless of // the accuracy of the forecast. 
-const minObservationsForForecast = 3 +var minObservationsForForecast = settings.RegisterIntSetting( + settings.TenantWritable, + "sql.stats.forecasts.min_observations", + "the mimimum number of observed statistics required to produce a statistics forecast", + 3, + func(v int64) error { + if v < 1 || v > math.MaxInt { + return errors.Errorf("expected value in range [%d, %d], got %d", 1, math.MaxInt, v) + } + return nil + }, +).WithPublic() // minGoodnessOfFit is the minimum R² (goodness of fit) measurement all // predictive models in a forecast must have for us to use the forecast. -const minGoodnessOfFit = 0.95 +var minGoodnessOfFit = settings.RegisterFloatSetting( + settings.TenantWritable, + "sql.stats.forecasts.min_goodness_of_fit", + "the minimum R² (goodness of fit) measurement required from all predictive models to use a "+ + "forecast", + 0.95, + func(v float64) error { + if v < 0 || v > 1 { + return errors.Errorf("expected value in range [%f, %f], got %f", 0.0, 1.0, v) + } + return nil + }, +).WithPublic() + +// maxDecrease is the minimum ratio of a prediction to the lowest prior +// observation that we allow. Predictions falling below this will be clamped to +// the lower bound calculated from this ratio. This lower bound is needed to +// prevent forecasting zero rows for downward-trending statistics, which can +// cause bad plans when the forecast is initially used. +// +// The default of 0 keeps the lower bound at zero. The 1/3 used in tests happens +// to match unknownFilterSelectivity, but not for any strong theoretical reason.
+var maxDecrease = settings.RegisterFloatSetting( + settings.TenantWritable, + "sql.stats.forecasts.max_decrease", + "the most a prediction is allowed to decrease, expressed as the minimum ratio of the prediction "+ + "to the lowest prior observation", + 0, + func(v float64) error { + if v < 0 || v > 1 { + return errors.Errorf("expected value in range [%f, %f], got %f", 0.0, 1.0, v) + } + return nil + }, +).WithPublic() + +// TODO(michae2): Consider whether we need a corresponding maxIncrease. // ForecastTableStatistics produces zero or more statistics forecasts based on // the given observed statistics. The observed statistics must be ordered by @@ -112,7 +159,9 @@ func ForecastTableStatistics( latest := observedByCols[colKey][0].CreatedAt at := latest.Add(avgRefresh) - forecast, err := forecastColumnStatistics(ctx, sv, observedByCols[colKey], at, minGoodnessOfFit) + forecast, err := forecastColumnStatistics( + ctx, sv, observedByCols[colKey], at, minGoodnessOfFit.Get(sv), + ) if err != nil { log.VEventf( ctx, 2, "could not forecast statistics for table %v columns %s: %v", @@ -150,7 +199,7 @@ func forecastColumnStatistics( at time.Time, minRequiredFit float64, ) (forecast *TableStatistic, err error) { - if len(observed) < minObservationsForForecast { + if len(observed) < int(minObservationsForForecast.Get(sv)) { return nil, errors.New("not enough observations to forecast statistics") } @@ -197,9 +246,21 @@ func forecastColumnStatistics( "predicted %v R² %v below min required R² %v", name, r2, minRequiredFit, ) } - // Clamp the predicted value to [0, MaxInt64] and round to nearest integer. - if yₙ < 0 { - return 0, nil + // Clamp the predicted value to [lowerBound, MaxInt64] and round to nearest + // integer. 
In general, it is worse to under-estimate counts than to + // over-estimate counts, so we pick a very conservative lowerBound of the + // prior lowest observation times maxDecrease to avoid prematurely + // estimating zero rows for downward-trending statistics. + lowestObservation := math.MaxFloat64 + for _, yᵢ := range y { + if yᵢ < lowestObservation { + lowestObservation = yᵢ + } + } + lowerBound := math.Round(lowestObservation * maxDecrease.Get(sv)) + lowerBound = math.Max(0, lowerBound) + if yₙ < lowerBound { + return lowerBound, nil } if yₙ > math.MaxInt64 { return math.MaxInt64, nil @@ -272,7 +333,7 @@ func forecastColumnStatistics( // histogram. NOTE: If any of the observed histograms were for inverted // indexes this will produce an incorrect histogram. if observed[0].HistogramData != nil && observed[0].HistogramData.ColumnType != nil { - hist, err := predictHistogram(ctx, observed, forecastAt, minRequiredFit, nonNullRowCount) + hist, err := predictHistogram(ctx, sv, observed, forecastAt, minRequiredFit, nonNullRowCount) if err != nil { // If we did not successfully predict a histogram then copy the latest // histogram so we can adjust it. @@ -358,6 +419,7 @@ func forecastColumnStatistics( // predictHistogram tries to predict the histogram at forecast time. 
func predictHistogram( ctx context.Context, + sv *settings.Values, observed []*TableStatistic, forecastAt float64, minRequiredFit float64, @@ -403,7 +465,7 @@ func predictHistogram( quantiles = append(quantiles, q) } - if len(quantiles) < minObservationsForForecast { + if len(quantiles) < int(minObservationsForForecast.Get(sv)) { return histogram{}, errors.New("not enough observations to forecast histogram") } diff --git a/pkg/sql/stats/forecast_test.go b/pkg/sql/stats/forecast_test.go index 7ac9e132e4c2..03a8f9eaa3a9 100644 --- a/pkg/sql/stats/forecast_test.go +++ b/pkg/sql/stats/forecast_test.go @@ -18,6 +18,7 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" "github.com/cockroachdb/cockroach/pkg/sql/sem/catid" "github.com/cockroachdb/cockroach/pkg/sql/types" @@ -186,7 +187,7 @@ func TestForecastColumnStatistics(t *testing.T) { {at: 7, row: 25, dist: 5, null: 0, size: 1}, }, at: 11, - forecast: &testStat{at: 11, row: 25, dist: 1, null: 0, size: 1}, + forecast: &testStat{at: 11, row: 25, dist: 2, null: 0, size: 1}, }, // Growing AvgSize { @@ -206,7 +207,7 @@ func TestForecastColumnStatistics(t *testing.T) { {at: 6, row: 10, dist: 8, null: 0, size: 10}, }, at: 9, - forecast: &testStat{at: 9, row: 10, dist: 8, null: 0, size: 0}, + forecast: &testStat{at: 9, row: 10, dist: 8, null: 0, size: 3}, }, // Growing from empty table { @@ -434,8 +435,8 @@ func TestForecastColumnStatistics(t *testing.T) { }, at: 11, forecast: &testStat{ - at: 11, row: 25, dist: 1, null: 0, size: 1, - hist: testHistogram{{25, 0, 0, 404}}, + at: 11, row: 25, dist: 2, null: 0, size: 1, + hist: testHistogram{{13, 0, 0, 404}, {0, 12, 1, 500}}, }, }, // Histogram, growing from empty table @@ -605,6 +606,8 @@ func TestForecastColumnStatistics(t *testing.T) { }, } ctx := context.Background() + st := cluster.MakeTestingClusterSettings() + maxDecrease.Override(ctx, 
&st.SV, 1.0/3.0) var fullStatID, partialStatID uint64 for i, tc := range testCases { t.Run(strconv.Itoa(i), func(t *testing.T) { @@ -617,7 +620,7 @@ func TestForecastColumnStatistics(t *testing.T) { expected := tc.forecast.toTableStatistic(jobspb.ForecastStatsName, i, descpb.ColumnIDs{1}, fullStatID, partialStatID) at := testStatTime(tc.at) - forecast, err := forecastColumnStatistics(ctx, nil /* sv */, observed, at, 1) + forecast, err := forecastColumnStatistics(ctx, &st.SV, observed, at, 1) if err != nil { if !tc.err { t.Errorf("test case %d unexpected forecastColumnStatistics err: %v", i, err)