From 5826f28bea10d41a776c51d946a5643fe5087ad8 Mon Sep 17 00:00:00 2001 From: Rebecca Taft Date: Tue, 7 Mar 2023 19:58:27 -0600 Subject: [PATCH] opt: add setting to always use histograms to calculate stats Informs #64570 Release note (sql change): Added a new session setting, optimizer_always_use_histograms, which ensures that the optimizer always uses histograms when available to calculate the statistics of every plan that it explores. Enabling this setting can prevent the optimizer from choosing a suboptimal index when statistics for a table are stale. --- pkg/sql/exec_util.go | 4 + .../testdata/logic_test/information_schema | 1 + .../logictest/testdata/logic_test/pg_catalog | 3 + .../logictest/testdata/logic_test/show_source | 1 + pkg/sql/opt/memo/memo.go | 5 +- pkg/sql/opt/memo/memo_test.go | 6 + pkg/sql/opt/memo/statistics_builder.go | 3 + pkg/sql/opt/memo/testdata/stats/scan | 327 ++++++++++++++++++ .../local_only_session_data.proto | 3 + pkg/sql/vars.go | 17 + 10 files changed, 369 insertions(+), 1 deletion(-) diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go index 1ee1786cded7..201e156a779f 100644 --- a/pkg/sql/exec_util.go +++ b/pkg/sql/exec_util.go @@ -3391,6 +3391,10 @@ func (m *sessionDataMutator) SetOptimizerUseImprovedSplitDisjunctionForJoins(val m.data.OptimizerUseImprovedSplitDisjunctionForJoins = val } +func (m *sessionDataMutator) SetOptimizerAlwaysUseHistograms(val bool) { + m.data.OptimizerAlwaysUseHistograms = val +} + // Utility functions related to scrubbing sensitive information on SQL Stats. // quantizeCounts ensures that the Count field in the diff --git a/pkg/sql/logictest/testdata/logic_test/information_schema b/pkg/sql/logictest/testdata/logic_test/information_schema index 10b0105300d6..65351c012753 100644 --- a/pkg/sql/logictest/testdata/logic_test/information_schema +++ b/pkg/sql/logictest/testdata/logic_test/information_schema @@ -4773,6 +4773,7 @@ null_ordered_last off on_update_rehome_row_enabled on opt_split_scan_limit 2048 optimizer on +optimizer_always_use_histograms off optimizer_use_forecasts on optimizer_use_histograms on optimizer_use_improved_disjunction_stats off diff --git a/pkg/sql/logictest/testdata/logic_test/pg_catalog b/pkg/sql/logictest/testdata/logic_test/pg_catalog index 77b1e6e8f3e6..3f0a4b1a5da6 100644 --- a/pkg/sql/logictest/testdata/logic_test/pg_catalog +++ b/pkg/sql/logictest/testdata/logic_test/pg_catalog @@ -2806,6 +2806,7 @@ node_id 1 NULL null_ordered_last off NULL NULL NULL string on_update_rehome_row_enabled on NULL NULL NULL string opt_split_scan_limit 2048 NULL NULL NULL string +optimizer_always_use_histograms off NULL NULL NULL string optimizer_use_forecasts on NULL NULL NULL string optimizer_use_histograms on NULL NULL NULL string optimizer_use_improved_disjunction_stats off NULL NULL NULL string @@ -2947,6 +2948,7 @@ node_id 1 NULL null_ordered_last off NULL user NULL off off on_update_rehome_row_enabled on NULL user NULL on on opt_split_scan_limit 2048 NULL user NULL 2048 2048 +optimizer_always_use_histograms off NULL user NULL off off optimizer_use_forecasts on NULL user NULL on on optimizer_use_histograms on NULL user NULL on on optimizer_use_improved_disjunction_stats off NULL user NULL off off @@ -3086,6 +3088,7 @@ null_ordered_last NULL NULL NULL on_update_rehome_row_enabled NULL NULL NULL NULL NULL opt_split_scan_limit NULL NULL NULL NULL NULL optimizer NULL NULL NULL NULL NULL +optimizer_always_use_histograms NULL NULL NULL NULL NULL optimizer_use_forecasts NULL NULL NULL NULL NULL optimizer_use_histograms NULL NULL NULL NULL NULL optimizer_use_improved_disjunction_stats NULL NULL NULL NULL NULL diff --git a/pkg/sql/logictest/testdata/logic_test/show_source b/pkg/sql/logictest/testdata/logic_test/show_source index b1e7c3232324..8e57879ed891 100644 --- a/pkg/sql/logictest/testdata/logic_test/show_source +++ b/pkg/sql/logictest/testdata/logic_test/show_source @@ -105,6 +105,7 @@ node_id 1 null_ordered_last off on_update_rehome_row_enabled on opt_split_scan_limit 2048 +optimizer_always_use_histograms off optimizer_use_forecasts on optimizer_use_histograms on optimizer_use_improved_disjunction_stats off diff --git a/pkg/sql/opt/memo/memo.go b/pkg/sql/opt/memo/memo.go index a528fbaee4c6..63e6e9d0c412 100644 --- a/pkg/sql/opt/memo/memo.go +++ b/pkg/sql/opt/memo/memo.go @@ -161,6 +161,7 @@ type Memo struct { useImprovedDisjunctionStats bool useLimitOrderingForStreamingGroupBy bool useImprovedSplitDisjunctionForJoins bool + alwaysUseHistograms bool // curRank is the highest currently in-use scalar expression rank. curRank opt.ScalarRank @@ -217,6 +218,7 @@ func (m *Memo) Init(evalCtx *eval.Context) { useImprovedDisjunctionStats: evalCtx.SessionData().OptimizerUseImprovedDisjunctionStats, useLimitOrderingForStreamingGroupBy: evalCtx.SessionData().OptimizerUseLimitOrderingForStreamingGroupBy, useImprovedSplitDisjunctionForJoins: evalCtx.SessionData().OptimizerUseImprovedSplitDisjunctionForJoins, + alwaysUseHistograms: evalCtx.SessionData().OptimizerAlwaysUseHistograms, } m.metadata.Init() m.logPropsBuilder.init(evalCtx, m) @@ -356,7 +358,8 @@ func (m *Memo) IsStale( m.variableInequalityLookupJoinEnabled != evalCtx.SessionData().VariableInequalityLookupJoinEnabled || m.useImprovedDisjunctionStats != evalCtx.SessionData().OptimizerUseImprovedDisjunctionStats || m.useLimitOrderingForStreamingGroupBy != evalCtx.SessionData().OptimizerUseLimitOrderingForStreamingGroupBy || - m.useImprovedSplitDisjunctionForJoins != evalCtx.SessionData().OptimizerUseImprovedSplitDisjunctionForJoins { + m.useImprovedSplitDisjunctionForJoins != evalCtx.SessionData().OptimizerUseImprovedSplitDisjunctionForJoins || + m.alwaysUseHistograms != evalCtx.SessionData().OptimizerAlwaysUseHistograms { return true, nil } diff --git a/pkg/sql/opt/memo/memo_test.go b/pkg/sql/opt/memo/memo_test.go index 4b56d66dd6ef..c5e38a84b614 100644 --- a/pkg/sql/opt/memo/memo_test.go +++ b/pkg/sql/opt/memo/memo_test.go @@ -334,6 +334,12 @@ func TestMemoIsStale(t *testing.T) { evalCtx.SessionData().OptimizerUseImprovedDisjunctionStats = false notStale() + // Stale optimizer_always_use_histograms. + evalCtx.SessionData().OptimizerAlwaysUseHistograms = true + stale() + evalCtx.SessionData().OptimizerAlwaysUseHistograms = false + notStale() + // Stale data sources and schema. Create new catalog so that data sources are // recreated and can be modified independently. catalog = testcat.New() diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go index fd94f13cefa3..c227f2df01dd 100644 --- a/pkg/sql/opt/memo/statistics_builder.go +++ b/pkg/sql/opt/memo/statistics_builder.go @@ -2808,6 +2808,9 @@ func (sb *statisticsBuilder) finalizeFromRowCountAndDistinctCounts( } func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational) bool { + if sb.evalCtx.SessionData().OptimizerAlwaysUseHistograms { + return true + } // If we know that the cardinality is below a certain threshold (e.g., due to // a constraint on a key column), don't bother adding the overhead of // creating a histogram. diff --git a/pkg/sql/opt/memo/testdata/stats/scan b/pkg/sql/opt/memo/testdata/stats/scan index 14c153131f93..b46abc7add2e 100644 --- a/pkg/sql/opt/memo/testdata/stats/scan +++ b/pkg/sql/opt/memo/testdata/stats/scan @@ -3139,3 +3139,330 @@ project │ └── fd: (1)-->(2,3) └── filters └── x:1 < 10 [type=bool, outer=(1), constraints=(/1: (/NULL - /9]; tight)] + +# Tests for when stats are stale. + +exec-ddl +CREATE TABLE stale ( + w STRING PRIMARY KEY, + x STRING, + y STRING, + z STRING, + UNIQUE (x, y), + INDEX (x, z) +) +---- + +exec-ddl +ALTER TABLE stale INJECT STATISTICS '[ + { + "avg_size": 7, + "columns": [ + "x" + ], + "created_at": "2023-03-08 01:51:41.258198", + "distinct_count": 10, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo1" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo10" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo2" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo3" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo4" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo5" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo6" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo7" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo8" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "foo9" + } + ], + "histo_col_type": "STRING", + "histo_version": 2, + "null_count": 0, + "row_count": 110 + }, + { + "avg_size": 16, + "columns": [ + "x", + "y" + ], + "created_at": "2023-03-08 01:51:41.258198", + "distinct_count": 110, + "histo_col_type": "", + "null_count": 0, + "row_count": 110 + }, + { + "avg_size": 13, + "columns": [ + "x", + "z" + ], + "created_at": "2023-03-08 01:51:41.258198", + "distinct_count": 10, + "histo_col_type": "", + "null_count": 0, + "row_count": 110 + }, + { + "avg_size": 9, + "columns": [ + "y" + ], + "created_at": "2023-03-08 01:51:41.258198", + "distinct_count": 11, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1000" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1001" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1002" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1003" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1004" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1005" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1006" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1007" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1008" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1009" + }, + { + "distinct_range": 0, + "num_eq": 10, + "num_range": 0, + "upper_bound": "bar1010" + } + ], + "histo_col_type": "STRING", + "histo_version": 2, + "null_count": 0, + "row_count": 110 + }, + { + "avg_size": 7, + "columns": [ + "z" + ], + "created_at": "2023-03-08 01:51:41.258198", + "distinct_count": 10, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz1" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz10" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz2" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz3" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz4" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz5" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz6" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz7" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz8" + }, + { + "distinct_range": 0, + "num_eq": 11, + "num_range": 0, + "upper_bound": "baz9" + } + ], + "histo_col_type": "STRING", + "histo_version": 2, + "null_count": 0, + "row_count": 110 + } +]' +---- + +# When optimizer_always_use_histograms is disabled, we may choose the non-unique +# index. +opt set=optimizer_always_use_histograms=false +SELECT * FROM stale WHERE x = 'bar1' AND (y = 'bar1000' OR y = 'bar1001') LIMIT 1 +---- +limit + ├── columns: w:1(string!null) x:2(string!null) y:3(string!null) z:4(string) + ├── cardinality: [0 - 1] + ├── stats: [rows=1] + ├── key: () + ├── fd: ()-->(1-4) + ├── select + │ ├── columns: w:1(string!null) x:2(string!null) y:3(string!null) z:4(string) + │ ├── cardinality: [0 - 2] + │ ├── stats: [rows=2, distinct(2)=1, null(2)=0, distinct(3)=2, null(3)=0, distinct(2,3)=2, null(2,3)=0] + │ ├── key: (1) + │ ├── fd: ()-->(2), (1)-->(3,4), (3)-->(1,4) + │ ├── limit hint: 1.00 + │ ├── index-join stale + │ │ ├── columns: w:1(string!null) x:2(string) y:3(string) z:4(string) + │ │ ├── stats: [rows=2.2e-08] + │ │ ├── key: (1) + │ │ ├── fd: ()-->(2), (1)-->(3,4), (2,3)~~>(1,4) + │ │ ├── limit hint: 0.00 + │ │ └── scan stale@stale_x_z_idx + │ │ ├── columns: w:1(string!null) x:2(string!null) z:4(string) + │ │ ├── constraint: /2/4/1: [/'bar1' - /'bar1'] + │ │ ├── stats: [rows=2.2e-08, distinct(2)=2.2e-08, null(2)=0] + │ │ │ histogram(2)= + │ │ ├── key: (1) + │ │ ├── fd: ()-->(2), (1)-->(4) + │ │ └── limit hint: 0.00 + │ └── filters + │ └── (y:3 = 'bar1000') OR (y:3 = 'bar1001') [type=bool, outer=(3), constraints=(/3: [/'bar1000' - /'bar1000'] [/'bar1001' - /'bar1001']; tight)] + └── 1 [type=int] + +# When optimizer_always_use_histograms is enabled, we should choose the unique +# index. +opt set=optimizer_always_use_histograms=true +SELECT * FROM stale WHERE x = 'bar1' AND (y = 'bar1000' OR y = 'bar1001') LIMIT 1 +---- +index-join stale + ├── columns: w:1(string!null) x:2(string!null) y:3(string!null) z:4(string) + ├── cardinality: [0 - 1] + ├── stats: [rows=2.2e-08] + ├── key: () + ├── fd: ()-->(1-4) + └── scan stale@stale_x_y_key + ├── columns: w:1(string!null) x:2(string!null) y:3(string!null) + ├── constraint: /2/3 + │ ├── [/'bar1'/'bar1000' - /'bar1'/'bar1000'] + │ └── [/'bar1'/'bar1001' - /'bar1'/'bar1001'] + ├── limit: 1 + ├── stats: [rows=2.2e-08] + ├── key: () + └── fd: ()-->(1-3) diff --git a/pkg/sql/sessiondatapb/local_only_session_data.proto b/pkg/sql/sessiondatapb/local_only_session_data.proto index be1a0206c33a..f2578d5a67c1 100644 --- a/pkg/sql/sessiondatapb/local_only_session_data.proto +++ b/pkg/sql/sessiondatapb/local_only_session_data.proto @@ -318,6 +318,9 @@ message LocalOnlySessionData { // inner, semi, and anti joins will be split. If false, only disjunctions // potentially containing an equijoin condition will be split. bool optimizer_use_improved_split_disjunction_for_joins = 91; + // OptimizerAlwaysUseHistograms, when true, ensures that the optimizer + // always uses histograms to calculate statistics if available. + bool optimizer_always_use_histograms = 94; /////////////////////////////////////////////////////////////////////////// // WARNING: consider whether a session parameter you're adding needs to // diff --git a/pkg/sql/vars.go b/pkg/sql/vars.go index 7544a0385f78..1bebf12164c5 100644 --- a/pkg/sql/vars.go +++ b/pkg/sql/vars.go @@ -2399,6 +2399,23 @@ var varGen = map[string]sessionVar{ }, GlobalDefault: globalFalse, }, + + // CockroachDB extension. + `optimizer_always_use_histograms`: { + GetStringVal: makePostgresBoolGetStringValFn(`optimizer_always_use_histograms`), + Set: func(_ context.Context, m sessionDataMutator, s string) error { + b, err := paramparse.ParseBoolVar("optimizer_always_use_histograms", s) + if err != nil { + return err + } + m.SetOptimizerAlwaysUseHistograms(b) + return nil + }, + Get: func(evalCtx *extendedEvalContext, _ *kv.Txn) (string, error) { + return formatBoolAsPostgresSetting(evalCtx.SessionData().OptimizerAlwaysUseHistograms), nil + }, + GlobalDefault: globalFalse, + }, } // We want test coverage for this on and off so make it metamorphic.