From 5c178e53d17ce33fcb7c7126e7603f7bfb63b399 Mon Sep 17 00:00:00 2001 From: Rebecca Taft Date: Tue, 7 Mar 2023 19:58:27 -0600 Subject: [PATCH] opt: add setting to always use histograms to calculate stats Informs #64570 Release note (sql change): Added a new session setting, optimizer_always_use_histograms, which ensures that the optimizer always uses histograms when available to calculate the statistics of every plan that it explores. Enabling this setting can prevent the optimizer from choosing a suboptimal index when statistics for a table are stale. --- pkg/sql/exec_util.go | 4 + .../testdata/logic_test/information_schema | 1 + .../logictest/testdata/logic_test/pg_catalog | 3 + .../logictest/testdata/logic_test/show_source | 1 + pkg/sql/opt/memo/memo.go | 5 +- pkg/sql/opt/memo/memo_test.go | 6 + pkg/sql/opt/memo/statistics_builder.go | 3 + pkg/sql/opt/memo/testdata/stats/scan | 327 ++++++++++++++++++ .../local_only_session_data.proto | 3 + pkg/sql/vars.go | 17 + 10 files changed, 369 insertions(+), 1 deletion(-) diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go index 0d0321548a62..47331e13a1f5 100644 --- a/pkg/sql/exec_util.go +++ b/pkg/sql/exec_util.go @@ -3298,6 +3298,10 @@ func (m *sessionDataMutator) SetOptimizerUseImprovedDisjunctionStats(val bool) { m.data.OptimizerUseImprovedDisjunctionStats = val } +func (m *sessionDataMutator) SetOptimizerAlwaysUseHistograms(val bool) { + m.data.OptimizerAlwaysUseHistograms = val +} + // Utility functions related to scrubbing sensitive information on SQL Stats. // quantizeCounts ensures that the Count field in the diff --git a/pkg/sql/logictest/testdata/logic_test/information_schema b/pkg/sql/logictest/testdata/logic_test/information_schema index 1c1da2946eda..c46b01b15c87 100644 --- a/pkg/sql/logictest/testdata/logic_test/information_schema +++ b/pkg/sql/logictest/testdata/logic_test/information_schema @@ -4773,6 +4773,7 @@ null_ordered_last off on_update_rehome_row_enabled on opt_split_scan_limit 2048 optimizer on +optimizer_always_use_histograms off optimizer_use_histograms on optimizer_use_improved_disjunction_stats off optimizer_use_multicol_stats on diff --git a/pkg/sql/logictest/testdata/logic_test/pg_catalog b/pkg/sql/logictest/testdata/logic_test/pg_catalog index 1db1e8dff3fa..9703823f0d1f 100644 --- a/pkg/sql/logictest/testdata/logic_test/pg_catalog +++ b/pkg/sql/logictest/testdata/logic_test/pg_catalog @@ -4204,6 +4204,7 @@ node_id 1 NULL null_ordered_last off NULL NULL NULL string on_update_rehome_row_enabled on NULL NULL NULL string opt_split_scan_limit 2048 NULL NULL NULL string +optimizer_always_use_histograms off NULL NULL NULL string optimizer_use_histograms on NULL NULL NULL string optimizer_use_improved_disjunction_stats off NULL NULL NULL string optimizer_use_multicol_stats on NULL NULL NULL string @@ -4335,6 +4336,7 @@ node_id 1 NULL null_ordered_last off NULL user NULL off off on_update_rehome_row_enabled on NULL user NULL on on opt_split_scan_limit 2048 NULL user NULL 2048 2048 +optimizer_always_use_histograms off NULL user NULL off off optimizer_use_histograms on NULL user NULL on on optimizer_use_improved_disjunction_stats off NULL user NULL off off optimizer_use_multicol_stats on NULL user NULL on on @@ -4462,6 +4464,7 @@ null_ordered_last NULL NULL NULL on_update_rehome_row_enabled NULL NULL NULL NULL NULL opt_split_scan_limit NULL NULL NULL NULL NULL optimizer NULL NULL NULL NULL NULL +optimizer_always_use_histograms NULL NULL NULL NULL NULL optimizer_use_histograms NULL NULL NULL NULL NULL optimizer_use_improved_disjunction_stats NULL NULL NULL NULL NULL optimizer_use_multicol_stats NULL NULL NULL NULL NULL diff --git a/pkg/sql/logictest/testdata/logic_test/show_source b/pkg/sql/logictest/testdata/logic_test/show_source index 92b99b0cc68f..2df8990bd5ee 100644 --- a/pkg/sql/logictest/testdata/logic_test/show_source +++ b/pkg/sql/logictest/testdata/logic_test/show_source @@ -102,6 +102,7 @@ node_id 1 null_ordered_last off on_update_rehome_row_enabled on opt_split_scan_limit 2048 +optimizer_always_use_histograms off optimizer_use_histograms on optimizer_use_improved_disjunction_stats off optimizer_use_multicol_stats on diff --git a/pkg/sql/opt/memo/memo.go b/pkg/sql/opt/memo/memo.go index d181ad4a3ec6..ffac0d2ddb09 100644 --- a/pkg/sql/opt/memo/memo.go +++ b/pkg/sql/opt/memo/memo.go @@ -156,6 +156,7 @@ type Memo struct { testingOptimizerCostPerturbation float64 testingOptimizerDisableRuleProbability float64 useImprovedDisjunctionStats bool + alwaysUseHistograms bool // curRank is the highest currently in-use scalar expression rank. curRank opt.ScalarRank @@ -207,6 +208,7 @@ func (m *Memo) Init(evalCtx *tree.EvalContext) { testingOptimizerCostPerturbation: evalCtx.SessionData().TestingOptimizerCostPerturbation, testingOptimizerDisableRuleProbability: evalCtx.SessionData().TestingOptimizerDisableRuleProbability, useImprovedDisjunctionStats: evalCtx.SessionData().OptimizerUseImprovedDisjunctionStats, + alwaysUseHistograms: evalCtx.SessionData().OptimizerAlwaysUseHistograms, } m.metadata.Init() m.logPropsBuilder.init(evalCtx, m) @@ -335,7 +337,8 @@ func (m *Memo) IsStale( m.testingOptimizerRandomSeed != evalCtx.SessionData().TestingOptimizerRandomSeed || m.testingOptimizerCostPerturbation != evalCtx.SessionData().TestingOptimizerCostPerturbation || m.testingOptimizerDisableRuleProbability != evalCtx.SessionData().TestingOptimizerDisableRuleProbability || - m.useImprovedDisjunctionStats != evalCtx.SessionData().OptimizerUseImprovedDisjunctionStats { + m.useImprovedDisjunctionStats != evalCtx.SessionData().OptimizerUseImprovedDisjunctionStats || + m.alwaysUseHistograms != evalCtx.SessionData().OptimizerAlwaysUseHistograms { return true, nil } diff --git a/pkg/sql/opt/memo/memo_test.go b/pkg/sql/opt/memo/memo_test.go index 8d74f02610d4..e50d48bed69d 100644 --- a/pkg/sql/opt/memo/memo_test.go +++ b/pkg/sql/opt/memo/memo_test.go @@ -302,6 +302,12 @@ func TestMemoIsStale(t *testing.T) { evalCtx.SessionData().OptimizerUseImprovedDisjunctionStats = false notStale() + // Stale optimizer_always_use_histograms. + evalCtx.SessionData().OptimizerAlwaysUseHistograms = true + stale() + evalCtx.SessionData().OptimizerAlwaysUseHistograms = false + notStale() + // Stale data sources and schema. Create new catalog so that data sources are // recreated and can be modified independently. catalog = testcat.New() diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go index ff7464372062..68422b7feb39 100644 --- a/pkg/sql/opt/memo/statistics_builder.go +++ b/pkg/sql/opt/memo/statistics_builder.go @@ -2740,6 +2740,9 @@ func (sb *statisticsBuilder) finalizeFromRowCountAndDistinctCounts( } func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational) bool { + if sb.evalCtx.SessionData().OptimizerAlwaysUseHistograms { + return true + } // If we know that the cardinality is below a certain threshold (e.g., due to // a constraint on a key column), don't bother adding the overhead of // creating a histogram. diff --git a/pkg/sql/opt/memo/testdata/stats/scan b/pkg/sql/opt/memo/testdata/stats/scan index 26d7b9e8ca76..b9d993437cae 100644 --- a/pkg/sql/opt/memo/testdata/stats/scan +++ b/pkg/sql/opt/memo/testdata/stats/scan @@ -3083,3 +3083,330 @@ top-k │ <---- 3 -------- 5 ---- ├── key: (6,7) └── fd: ()-->(2,3,5), (6,7)-->(1,4) + +# Tests for when stats are stale. + +exec-ddl +CREATE TABLE stale ( + w STRING PRIMARY KEY, + x STRING, + y STRING, + z STRING, + UNIQUE (x, y), + INDEX (x, z) +) +---- + +exec-ddl +ALTER TABLE stale INJECT STATISTICS '[ + { + "avg_size": 7, + "columns": [ + "x" + ], + "created_at": "2023-03-08 01:51:41.258198", + "distinct_count": 10, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo1" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo10" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo2" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo3" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo4" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo5" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo6" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo7" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo8" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo9" + } + ], + "histo_col_type": "STRING", + "histo_version": 2, + "null_count": 0, + "row_count": 110 + }, + { + "avg_size": 16, + "columns": [ + "x", + "y" + ], + "created_at": "2023-03-08 01:51:41.258198", + "distinct_count": 110, + "histo_col_type": "", + "null_count": 0, + "row_count": 110 + }, + { + "avg_size": 13, + "columns": [ + "x", + "z" + ], + "created_at": "2023-03-08 01:51:41.258198", + "distinct_count": 10, + "histo_col_type": "", + "null_count": 0, + "row_count": 110 + }, + { + "avg_size": 9, + "columns": [ + "y" + ], + "created_at": "2023-03-08 01:51:41.258198", + "distinct_count": 11, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1000" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1001" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1002" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1003" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1004" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1005" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1006" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1007" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1008" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1009" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1010" + } + ], + "histo_col_type": "STRING", + "histo_version": 2, + "null_count": 0, + "row_count": 110 + }, + { + "avg_size": 7, + "columns": [ + "z" + ], + "created_at": "2023-03-08 01:51:41.258198", + "distinct_count": 10, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz1" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz10" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz2" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz3" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz4" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz5" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz6" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz7" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz8" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz9" + } + ], + "histo_col_type": "STRING", + "histo_version": 2, + "null_count": 0, + "row_count": 110 + } +]' +---- + +# When optimizer_always_use_histograms is disabled, we may choose the non-unique +# index. +opt set=optimizer_always_use_histograms=false +SELECT * FROM stale WHERE x = 'bar1' AND (y = 'bar1000' OR y = 'bar1001') LIMIT 1 +---- +limit + ├── columns: w:1(string!null) x:2(string!null) y:3(string!null) z:4(string) + ├── cardinality: [0 - 1] + ├── stats: [rows=1] + ├── key: () + ├── fd: ()-->(1-4) + ├── select + │ ├── columns: w:1(string!null) x:2(string!null) y:3(string!null) z:4(string) + │ ├── cardinality: [0 - 2] + │ ├── stats: [rows=2, distinct(2)=1, null(2)=0, distinct(3)=2, null(3)=0, distinct(2,3)=2, null(2,3)=0] + │ ├── key: (1) + │ ├── fd: ()-->(2), (1)-->(3,4), (3)-->(1,4) + │ ├── limit hint: 1.00 + │ ├── index-join stale + │ │ ├── columns: w:1(string!null) x:2(string) y:3(string) z:4(string) + │ │ ├── stats: [rows=2.2e-08] + │ │ ├── key: (1) + │ │ ├── fd: ()-->(2), (1)-->(3,4), (2,3)~~>(1,4) + │ │ ├── limit hint: 0.00 + │ │ └── scan stale@stale_x_z_idx + │ │ ├── columns: w:1(string!null) x:2(string!null) z:4(string) + │ │ ├── constraint: /2/4/1: [/'bar1' - /'bar1'] + │ │ ├── stats: [rows=2.2e-08, distinct(2)=2.2e-08, null(2)=0] + │ │ │ histogram(2)= + │ │ ├── key: (1) + │ │ ├── fd: ()-->(2), (1)-->(4) + │ │ └── limit hint: 0.00 + │ └── filters + │ └── (y:3 = 'bar1000') OR (y:3 = 'bar1001') [type=bool, outer=(3), constraints=(/3: [/'bar1000' - /'bar1000'] [/'bar1001' - /'bar1001']; tight)] + └── 1 [type=int] + +# When optimizer_always_use_histograms is enabled, we should choose the unique +# index. +opt set=optimizer_always_use_histograms=true +SELECT * FROM stale WHERE x = 'bar1' AND (y = 'bar1000' OR y = 'bar1001') LIMIT 1 +---- +index-join stale + ├── columns: w:1(string!null) x:2(string!null) y:3(string!null) z:4(string) + ├── cardinality: [0 - 1] + ├── stats: [rows=2.2e-08] + ├── key: () + ├── fd: ()-->(1-4) + └── scan stale@stale_x_y_key + ├── columns: w:1(string!null) x:2(string!null) y:3(string!null) + ├── constraint: /2/3 + │ ├── [/'bar1'/'bar1000' - /'bar1'/'bar1000'] + │ └── [/'bar1'/'bar1001' - /'bar1'/'bar1001'] + ├── limit: 1 + ├── stats: [rows=2.2e-08] + ├── key: () + └── fd: ()-->(1-3) diff --git a/pkg/sql/sessiondatapb/local_only_session_data.proto b/pkg/sql/sessiondatapb/local_only_session_data.proto index ea9062df8e41..082e3dfc8b1e 100644 --- a/pkg/sql/sessiondatapb/local_only_session_data.proto +++ b/pkg/sql/sessiondatapb/local_only_session_data.proto @@ -279,6 +279,9 @@ message LocalOnlySessionData { // CopyFromRetriesEnabled controls whether retries should be internally // attempted for retriable errors. bool copy_from_retries_enabled = 89; + // OptimizerAlwaysUseHistograms, when true, ensures that the optimizer + // always uses histograms to calculate statistics if available. + bool optimizer_always_use_histograms = 94; /////////////////////////////////////////////////////////////////////////// // WARNING: consider whether a session parameter you're adding needs to // diff --git a/pkg/sql/vars.go b/pkg/sql/vars.go index b973f4127346..326f9e2431f1 100644 --- a/pkg/sql/vars.go +++ b/pkg/sql/vars.go @@ -2272,6 +2272,23 @@ var varGen = map[string]sessionVar{ }, GlobalDefault: globalFalse, }, + + // CockroachDB extension. + `optimizer_always_use_histograms`: { + GetStringVal: makePostgresBoolGetStringValFn(`optimizer_always_use_histograms`), + Set: func(_ context.Context, m sessionDataMutator, s string) error { + b, err := paramparse.ParseBoolVar("optimizer_always_use_histograms", s) + if err != nil { + return err + } + m.SetOptimizerAlwaysUseHistograms(b) + return nil + }, + Get: func(evalCtx *extendedEvalContext) (string, error) { + return formatBoolAsPostgresSetting(evalCtx.SessionData().OptimizerAlwaysUseHistograms), nil + }, + GlobalDefault: globalFalse, + }, } const compatErrMsg = "this parameter is currently recognized only for compatibility and has no effect in CockroachDB."