Skip to content

Commit

Permalink
opt: add setting to always use histograms to calculate stats
Browse files Browse the repository at this point in the history
Informs cockroachdb#64570

Release note (sql change): Added a new session setting,
optimizer_always_use_histograms, which ensures that the optimizer
always uses histograms when available to calculate the statistics
of every plan that it explores. Enabling this setting can prevent
the optimizer from choosing a suboptimal index when statistics for
a table are stale.
  • Loading branch information
rytaft committed Mar 8, 2023
1 parent cbdd452 commit 19971b2
Show file tree
Hide file tree
Showing 10 changed files with 394 additions and 26 deletions.
4 changes: 4 additions & 0 deletions pkg/sql/exec_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -3298,6 +3298,10 @@ func (m *sessionDataMutator) SetOptimizerUseImprovedDisjunctionStats(val bool) {
m.data.OptimizerUseImprovedDisjunctionStats = val
}

// SetOptimizerAlwaysUseHistograms stores val in the session data's
// OptimizerAlwaysUseHistograms field, which backs the
// optimizer_always_use_histograms session setting.
func (m *sessionDataMutator) SetOptimizerAlwaysUseHistograms(val bool) {
m.data.OptimizerAlwaysUseHistograms = val
}

// Utility functions related to scrubbing sensitive information on SQL Stats.

// quantizeCounts ensures that the Count field in the
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/logictest/testdata/logic_test/information_schema
Original file line number Diff line number Diff line change
Expand Up @@ -4773,6 +4773,7 @@ null_ordered_last off
on_update_rehome_row_enabled on
opt_split_scan_limit 2048
optimizer on
optimizer_always_use_histograms off
optimizer_use_histograms on
optimizer_use_improved_disjunction_stats off
optimizer_use_multicol_stats on
Expand Down
3 changes: 3 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/pg_catalog
Original file line number Diff line number Diff line change
Expand Up @@ -4204,6 +4204,7 @@ node_id 1 NULL
null_ordered_last off NULL NULL NULL string
on_update_rehome_row_enabled on NULL NULL NULL string
opt_split_scan_limit 2048 NULL NULL NULL string
optimizer_always_use_histograms off NULL NULL NULL string
optimizer_use_histograms on NULL NULL NULL string
optimizer_use_improved_disjunction_stats off NULL NULL NULL string
optimizer_use_multicol_stats on NULL NULL NULL string
Expand Down Expand Up @@ -4335,6 +4336,7 @@ node_id 1 NULL
null_ordered_last off NULL user NULL off off
on_update_rehome_row_enabled on NULL user NULL on on
opt_split_scan_limit 2048 NULL user NULL 2048 2048
optimizer_always_use_histograms off NULL user NULL off off
optimizer_use_histograms on NULL user NULL on on
optimizer_use_improved_disjunction_stats off NULL user NULL off off
optimizer_use_multicol_stats on NULL user NULL on on
Expand Down Expand Up @@ -4462,6 +4464,7 @@ null_ordered_last NULL NULL NULL
on_update_rehome_row_enabled NULL NULL NULL NULL NULL
opt_split_scan_limit NULL NULL NULL NULL NULL
optimizer NULL NULL NULL NULL NULL
optimizer_always_use_histograms NULL NULL NULL NULL NULL
optimizer_use_histograms NULL NULL NULL NULL NULL
optimizer_use_improved_disjunction_stats NULL NULL NULL NULL NULL
optimizer_use_multicol_stats NULL NULL NULL NULL NULL
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/logictest/testdata/logic_test/show_source
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ node_id 1
null_ordered_last off
on_update_rehome_row_enabled on
opt_split_scan_limit 2048
optimizer_always_use_histograms off
optimizer_use_histograms on
optimizer_use_improved_disjunction_stats off
optimizer_use_multicol_stats on
Expand Down
55 changes: 29 additions & 26 deletions pkg/sql/opt/memo/memo.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ import (
// called groups where each group contains a set of logically equivalent
// expressions. Two expressions are considered logically equivalent if:
//
// 1. They return the same number and data type of columns. However, order and
// naming of columns doesn't matter.
// 2. They return the same number of rows, with the same values in each row.
// However, order of rows doesn't matter.
// 1. They return the same number and data type of columns. However, order and
// naming of columns doesn't matter.
// 2. They return the same number of rows, with the same values in each row.
// However, order of rows doesn't matter.
//
// The different expressions in a single group are called memo expressions
// (memo-ized expressions). The children of a memo expression can themselves be
Expand Down Expand Up @@ -72,17 +72,17 @@ import (
// in-memory instance. This allows interned expressions to be checked for
// equivalence by simple pointer comparison. For example:
//
// SELECT * FROM a, b WHERE a.x = b.x
// SELECT * FROM a, b WHERE a.x = b.x
//
// After insertion into the memo, the memo would contain these six groups, with
// numbers substituted for pointers to the normalized expression in each group:
//
// G6: [inner-join [G1 G2 G5]]
// G5: [eq [G3 G4]]
// G4: [variable b.x]
// G3: [variable a.x]
// G2: [scan b]
// G1: [scan a]
// G6: [inner-join [G1 G2 G5]]
// G5: [eq [G3 G4]]
// G4: [variable b.x]
// G3: [variable a.x]
// G2: [scan b]
// G1: [scan a]
//
// Each leaf expression is interned by hashing its operator type and any
// private field values. Expressions higher in the tree can then rely on the
Expand All @@ -98,12 +98,12 @@ import (
// added by the factory. For example, the join commutativity transformation
// expands the memo like this:
//
// G6: [inner-join [G1 G2 G5]] [inner-join [G2 G1 G5]]
// G5: [eq [G3 G4]]
// G4: [variable b.x]
// G3: [variable a.x]
// G2: [scan b]
// G1: [scan a]
// G6: [inner-join [G1 G2 G5]] [inner-join [G2 G1 G5]]
// G5: [eq [G3 G4]]
// G4: [variable b.x]
// G3: [variable a.x]
// G2: [scan b]
// G1: [scan a]
//
// See the comments in explorer.go for more details.
type Memo struct {
Expand Down Expand Up @@ -156,6 +156,7 @@ type Memo struct {
testingOptimizerCostPerturbation float64
testingOptimizerDisableRuleProbability float64
useImprovedDisjunctionStats bool
alwaysUseHistograms bool

// curRank is the highest currently in-use scalar expression rank.
curRank opt.ScalarRank
Expand Down Expand Up @@ -207,6 +208,7 @@ func (m *Memo) Init(evalCtx *tree.EvalContext) {
testingOptimizerCostPerturbation: evalCtx.SessionData().TestingOptimizerCostPerturbation,
testingOptimizerDisableRuleProbability: evalCtx.SessionData().TestingOptimizerDisableRuleProbability,
useImprovedDisjunctionStats: evalCtx.SessionData().OptimizerUseImprovedDisjunctionStats,
alwaysUseHistograms: evalCtx.SessionData().OptimizerAlwaysUseHistograms,
}
m.metadata.Init()
m.logPropsBuilder.init(evalCtx, m)
Expand Down Expand Up @@ -298,14 +300,14 @@ func (m *Memo) HasPlaceholders() bool {
// that takes into account the changes. IsStale checks the following
// dependencies:
//
// 1. Current database: this can change name resolution.
// 2. Current search path: this can change name resolution.
// 3. Current location: this determines time zone, and can change how time-
// related types are constructed and compared.
// 4. Data source schema: this determines most aspects of how the query is
// compiled.
// 5. Data source privileges: current user may no longer have access to one or
// more data sources.
// 1. Current database: this can change name resolution.
// 2. Current search path: this can change name resolution.
// 3. Current location: this determines time zone, and can change how time-
// related types are constructed and compared.
// 4. Data source schema: this determines most aspects of how the query is
// compiled.
// 5. Data source privileges: current user may no longer have access to one or
// more data sources.
//
// This function cannot swallow errors and return only a boolean, as it may
// perform KV operations on behalf of the transaction associated with the
Expand Down Expand Up @@ -335,7 +337,8 @@ func (m *Memo) IsStale(
m.testingOptimizerRandomSeed != evalCtx.SessionData().TestingOptimizerRandomSeed ||
m.testingOptimizerCostPerturbation != evalCtx.SessionData().TestingOptimizerCostPerturbation ||
m.testingOptimizerDisableRuleProbability != evalCtx.SessionData().TestingOptimizerDisableRuleProbability ||
m.useImprovedDisjunctionStats != evalCtx.SessionData().OptimizerUseImprovedDisjunctionStats {
m.useImprovedDisjunctionStats != evalCtx.SessionData().OptimizerUseImprovedDisjunctionStats ||
m.alwaysUseHistograms != evalCtx.SessionData().OptimizerAlwaysUseHistograms {
return true, nil
}

Expand Down
6 changes: 6 additions & 0 deletions pkg/sql/opt/memo/memo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,12 @@ func TestMemoIsStale(t *testing.T) {
evalCtx.SessionData().OptimizerUseImprovedDisjunctionStats = false
notStale()

// Stale optimizer_always_use_histograms.
evalCtx.SessionData().OptimizerAlwaysUseHistograms = true
stale()
evalCtx.SessionData().OptimizerAlwaysUseHistograms = false
notStale()

// Stale data sources and schema. Create new catalog so that data sources are
// recreated and can be modified independently.
catalog = testcat.New()
Expand Down
3 changes: 3 additions & 0 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -2740,6 +2740,9 @@ func (sb *statisticsBuilder) finalizeFromRowCountAndDistinctCounts(
}

func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational) bool {
if sb.evalCtx.SessionData().OptimizerAlwaysUseHistograms {
return true
}
// If we know that the cardinality is below a certain threshold (e.g., due to
// a constraint on a key column), don't bother adding the overhead of
// creating a histogram.
Expand Down
Loading

0 comments on commit 19971b2

Please sign in to comment.