Skip to content

Commit

Permalink
Merge #122442
Browse files Browse the repository at this point in the history
122442: opt: consider limit hint when costing distinct-on expressions r=mgartner a=mgartner

#### opt: add coster tests for distinct-on expressions

This commit adds coster tests for distinct-on expressions. There were no
such tests before.

Release note: None

#### opt: consider limit hint when costing distinct-on expressions

Limit hints are now considered when costing DistinctOn expressions,
similarly to streaming GroupBy expressions. This matches the behavior of
the execution logic, where a DistinctOn emits a row with distinct
grouping columns as soon as it is read from its input. It does not need
to read all of its input to begin emitting rows.

Release note (performance improvement): The optimizer now costs
distinct-on operators more accurately. It may produce more efficient
query plans in some cases.

Epic: CRDB-37714

#### opt: add optimizer_use_improved_distinct_on_limit_hint_costing

The `optimizer_use_improved_distinct_on_limit_hint_costing` session
setting has been added which enables the improved costing of DistinctOn
expressions with limit hints added in the previous commit. It is enabled
by default.

Release note: None


Co-authored-by: Marcus Gartner <[email protected]>
  • Loading branch information
craig[bot] and mgartner committed Apr 18, 2024
2 parents 859b6b1 + af62526 commit d17ebfe
Show file tree
Hide file tree
Showing 15 changed files with 179 additions and 32 deletions.
4 changes: 4 additions & 0 deletions pkg/sql/exec_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -3753,6 +3753,10 @@ func (m *sessionDataMutator) SetOptimizerUseTrigramSimilarityOptimization(val bo
m.data.OptimizerUseTrigramSimilarityOptimization = val
}

// SetOptimizerUseImprovedDistinctOnLimitHintCosting stores val in the session
// data's OptimizerUseImprovedDistinctOnLimitHintCosting field, which backs the
// optimizer_use_improved_distinct_on_limit_hint_costing session setting.
func (m *sessionDataMutator) SetOptimizerUseImprovedDistinctOnLimitHintCosting(val bool) {
m.data.OptimizerUseImprovedDistinctOnLimitHintCosting = val
}

// Utility functions related to scrubbing sensitive information on SQL Stats.

// quantizeCounts ensures that the Count field in the
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/logictest/testdata/logic_test/information_schema
Original file line number Diff line number Diff line change
Expand Up @@ -6166,6 +6166,7 @@ optimizer_use_forecasts on
optimizer_use_histograms on
optimizer_use_improved_computed_column_filters_derivation on
optimizer_use_improved_disjunction_stats on
optimizer_use_improved_distinct_on_limit_hint_costing on
optimizer_use_improved_join_elimination on
optimizer_use_improved_split_disjunction_for_joins on
optimizer_use_limit_ordering_for_streaming_group_by on
Expand Down
3 changes: 3 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/pg_catalog
Original file line number Diff line number Diff line change
Expand Up @@ -2892,6 +2892,7 @@ optimizer_use_forecasts on N
optimizer_use_histograms on NULL NULL NULL string
optimizer_use_improved_computed_column_filters_derivation on NULL NULL NULL string
optimizer_use_improved_disjunction_stats on NULL NULL NULL string
optimizer_use_improved_distinct_on_limit_hint_costing on NULL NULL NULL string
optimizer_use_improved_join_elimination on NULL NULL NULL string
optimizer_use_improved_split_disjunction_for_joins on NULL NULL NULL string
optimizer_use_limit_ordering_for_streaming_group_by on NULL NULL NULL string
Expand Down Expand Up @@ -3071,6 +3072,7 @@ optimizer_use_forecasts on N
optimizer_use_histograms on NULL user NULL on on
optimizer_use_improved_computed_column_filters_derivation on NULL user NULL on on
optimizer_use_improved_disjunction_stats on NULL user NULL on on
optimizer_use_improved_distinct_on_limit_hint_costing on NULL user NULL on on
optimizer_use_improved_join_elimination on NULL user NULL on on
optimizer_use_improved_split_disjunction_for_joins on NULL user NULL on on
optimizer_use_limit_ordering_for_streaming_group_by on NULL user NULL on on
Expand Down Expand Up @@ -3249,6 +3251,7 @@ optimizer_use_forecasts NULL NULL NULL
optimizer_use_histograms NULL NULL NULL NULL NULL
optimizer_use_improved_computed_column_filters_derivation NULL NULL NULL NULL NULL
optimizer_use_improved_disjunction_stats NULL NULL NULL NULL NULL
optimizer_use_improved_distinct_on_limit_hint_costing NULL NULL NULL NULL NULL
optimizer_use_improved_join_elimination NULL NULL NULL NULL NULL
optimizer_use_improved_split_disjunction_for_joins NULL NULL NULL NULL NULL
optimizer_use_limit_ordering_for_streaming_group_by NULL NULL NULL NULL NULL
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/logictest/testdata/logic_test/show_source
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ optimizer_use_forecasts on
optimizer_use_histograms on
optimizer_use_improved_computed_column_filters_derivation on
optimizer_use_improved_disjunction_stats on
optimizer_use_improved_distinct_on_limit_hint_costing on
optimizer_use_improved_join_elimination on
optimizer_use_improved_split_disjunction_for_joins on
optimizer_use_limit_ordering_for_streaming_group_by on
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/opt/bench/bench_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,7 @@ func newHarness(tb testing.TB, query benchQuery, schemas []string) *harness {
h.evalCtx.SessionData().VariableInequalityLookupJoinEnabled = true
h.evalCtx.SessionData().OptimizerUseVirtualComputedColumnStats = true
h.evalCtx.SessionData().OptimizerUseTrigramSimilarityOptimization = true
h.evalCtx.SessionData().OptimizerUseImprovedDistinctOnLimitHintCosting = true

// Set up the test catalog.
h.testCat = testcat.New()
Expand Down
3 changes: 3 additions & 0 deletions pkg/sql/opt/memo/memo.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ type Memo struct {
plpgsqlUseStrictInto bool
useVirtualComputedColumnStats bool
useTrigramSimilarityOptimization bool
useImprovedDistinctOnLimitHintCosting bool
trigramSimilarityThreshold float64
splitScanLimit int32

Expand Down Expand Up @@ -268,6 +269,7 @@ func (m *Memo) Init(ctx context.Context, evalCtx *eval.Context) {
plpgsqlUseStrictInto: evalCtx.SessionData().PLpgSQLUseStrictInto,
useVirtualComputedColumnStats: evalCtx.SessionData().OptimizerUseVirtualComputedColumnStats,
useTrigramSimilarityOptimization: evalCtx.SessionData().OptimizerUseTrigramSimilarityOptimization,
useImprovedDistinctOnLimitHintCosting: evalCtx.SessionData().OptimizerUseImprovedDistinctOnLimitHintCosting,
trigramSimilarityThreshold: evalCtx.SessionData().TrigramSimilarityThreshold,
splitScanLimit: evalCtx.SessionData().OptSplitScanLimit,
txnIsoLevel: evalCtx.TxnIsoLevel,
Expand Down Expand Up @@ -424,6 +426,7 @@ func (m *Memo) IsStale(
m.plpgsqlUseStrictInto != evalCtx.SessionData().PLpgSQLUseStrictInto ||
m.useVirtualComputedColumnStats != evalCtx.SessionData().OptimizerUseVirtualComputedColumnStats ||
m.useTrigramSimilarityOptimization != evalCtx.SessionData().OptimizerUseTrigramSimilarityOptimization ||
m.useImprovedDistinctOnLimitHintCosting != evalCtx.SessionData().OptimizerUseImprovedDistinctOnLimitHintCosting ||
m.trigramSimilarityThreshold != evalCtx.SessionData().TrigramSimilarityThreshold ||
m.splitScanLimit != evalCtx.SessionData().OptSplitScanLimit ||
m.txnIsoLevel != evalCtx.TxnIsoLevel {
Expand Down
6 changes: 6 additions & 0 deletions pkg/sql/opt/memo/memo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,12 @@ func TestMemoIsStale(t *testing.T) {
evalCtx.SessionData().OptimizerUseTrigramSimilarityOptimization = false
notStale()

// Stale optimizer_use_improved_distinct_on_limit_hint_costing.
evalCtx.SessionData().OptimizerUseImprovedDistinctOnLimitHintCosting = true
stale()
evalCtx.SessionData().OptimizerUseImprovedDistinctOnLimitHintCosting = false
notStale()

// Stale pg_trgm.similarity_threshold.
evalCtx.SessionData().TrigramSimilarityThreshold = 0.5
stale()
Expand Down
18 changes: 4 additions & 14 deletions pkg/sql/opt/memo/testdata/stats/select
Original file line number Diff line number Diff line change
Expand Up @@ -2573,7 +2573,6 @@ limit
├── distinct-on
│ ├── columns: k:1(int!null) a:2(string) b:3(string) c:4(string)
│ ├── grouping columns: k:1(int!null)
│ ├── internal-ordering: +1
│ ├── stats: [rows=1.9995]
│ ├── key: (1)
│ ├── fd: (1)-->(2-4)
Expand All @@ -2583,33 +2582,28 @@ limit
│ │ ├── left columns: k:7(int) a:8(string) b:9(string) c:10(string)
│ │ ├── right columns: k:13(int) a:14(string) b:15(string) c:16(string)
│ │ ├── stats: [rows=2]
│ │ ├── ordering: +1
│ │ ├── index-join disjunction
│ │ │ ├── columns: k:7(int!null) a:8(string!null) b:9(string) c:10(string)
│ │ │ ├── stats: [rows=1, distinct(8)=1, null(8)=0]
│ │ │ ├── key: (7)
│ │ │ ├── fd: ()-->(8), (7)-->(9,10)
│ │ │ ├── ordering: +7 opt(8) [actual: +7]
│ │ │ └── scan disjunction@a_idx
│ │ │ ├── columns: k:7(int!null) a:8(string!null)
│ │ │ ├── constraint: /8/7: [/'foo' - /'foo']
│ │ │ ├── stats: [rows=1, distinct(8)=1, null(8)=0]
│ │ │ ├── key: (7)
│ │ │ ├── fd: ()-->(8)
│ │ │ └── ordering: +7 opt(8) [actual: +7]
│ │ │ └── fd: ()-->(8)
│ │ └── index-join disjunction
│ │ ├── columns: k:13(int!null) a:14(string) b:15(string!null) c:16(string)
│ │ ├── stats: [rows=1, distinct(15)=1, null(15)=0]
│ │ ├── key: (13)
│ │ ├── fd: ()-->(15), (13)-->(14,16)
│ │ ├── ordering: +13 opt(15) [actual: +13]
│ │ └── scan disjunction@b_idx
│ │ ├── columns: k:13(int!null) b:15(string!null)
│ │ ├── constraint: /15/13: [/'foo' - /'foo']
│ │ ├── stats: [rows=1, distinct(15)=1, null(15)=0]
│ │ ├── key: (13)
│ │ ├── fd: ()-->(15)
│ │ └── ordering: +13 opt(15) [actual: +13]
│ │ └── fd: ()-->(15)
│ └── aggregations
│ ├── const-agg [as=a:2, type=string, outer=(2)]
│ │ └── a:2 [type=string]
Expand All @@ -2632,7 +2626,6 @@ limit
├── distinct-on
│ ├── columns: k:1(int!null) a:2(string) b:3(string) c:4(string)
│ ├── grouping columns: k:1(int!null)
│ ├── internal-ordering: +1
│ ├── stats: [rows=2.998501]
│ ├── key: (1)
│ ├── fd: (1)-->(2-4)
Expand All @@ -2642,27 +2635,24 @@ limit
│ │ ├── left columns: k:7(int) a:8(string) b:9(string) c:10(string)
│ │ ├── right columns: k:13(int) a:14(string) b:15(string) c:16(string)
│ │ ├── stats: [rows=2.999501]
│ │ ├── ordering: +1
│ │ ├── index-join disjunction
│ │ │ ├── columns: k:7(int!null) a:8(string!null) b:9(string) c:10(string)
│ │ │ ├── stats: [rows=1, distinct(8)=1, null(8)=0]
│ │ │ ├── key: (7)
│ │ │ ├── fd: ()-->(8), (7)-->(9,10)
│ │ │ ├── ordering: +7 opt(8) [actual: +7]
│ │ │ └── scan disjunction@a_idx
│ │ │ ├── columns: k:7(int!null) a:8(string!null)
│ │ │ ├── constraint: /8/7: [/'foo' - /'foo']
│ │ │ ├── stats: [rows=1, distinct(8)=1, null(8)=0]
│ │ │ ├── key: (7)
│ │ │ ├── fd: ()-->(8)
│ │ │ └── ordering: +7 opt(8) [actual: +7]
│ │ │ └── fd: ()-->(8)
│ │ └── distinct-on
│ │ ├── columns: k:13(int!null) a:14(string) b:15(string) c:16(string)
│ │ ├── grouping columns: k:13(int!null)
│ │ ├── internal-ordering: +13
│ │ ├── stats: [rows=1.9995]
│ │ ├── key: (13)
│ │ ├── fd: (13)-->(14-16)
│ │ ├── ordering: +13
│ │ ├── union-all
│ │ │ ├── columns: k:13(int!null) a:14(string) b:15(string) c:16(string)
│ │ │ ├── left columns: k:19(int) a:20(string) b:21(string) c:22(string)
Expand Down
18 changes: 10 additions & 8 deletions pkg/sql/opt/norm/testdata/rules/limit
Original file line number Diff line number Diff line change
Expand Up @@ -258,15 +258,16 @@ project
│ ├── cardinality: [0 - 5]
│ ├── key: (2,3)
│ ├── ordering: +3
│ ├── sort
│ ├── distinct-on
│ │ ├── columns: i:2 f:3
│ │ ├── grouping columns: i:2 f:3
│ │ ├── key: (2,3)
│ │ ├── ordering: +3
│ │ ├── limit hint: 5.00
│ │ └── distinct-on
│ │ └── sort
│ │ ├── columns: i:2 f:3
│ │ ├── grouping columns: i:2 f:3
│ │ ├── key: (2,3)
│ │ ├── ordering: +3
│ │ ├── limit hint: 6.02
│ │ └── scan a
│ │ └── columns: i:2 f:3
│ └── 5
Expand Down Expand Up @@ -449,15 +450,16 @@ project
│ │ ├── cardinality: [0 - 15]
│ │ ├── key: (2,3)
│ │ ├── ordering: +3
│ │ ├── sort
│ │ ├── distinct-on
│ │ │ ├── columns: i:2 f:3
│ │ │ ├── grouping columns: i:2 f:3
│ │ │ ├── key: (2,3)
│ │ │ ├── ordering: +3
│ │ │ ├── limit hint: 15.00
│ │ │ └── distinct-on
│ │ │ └── sort
│ │ │ ├── columns: i:2 f:3
│ │ │ ├── grouping columns: i:2 f:3
│ │ │ ├── key: (2,3)
│ │ │ ├── ordering: +3
│ │ │ ├── limit hint: 18.16
│ │ │ └── scan a
│ │ │ └── columns: i:2 f:3
│ │ └── 15
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/opt/testutils/opttester/opt_tester.go
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@ func New(catalog cat.Catalog, sql string) *OptTester {
ot.evalCtx.SessionData().OptimizerMergeJoinsEnabled = true
ot.evalCtx.SessionData().OptimizerUseVirtualComputedColumnStats = true
ot.evalCtx.SessionData().OptimizerUseTrigramSimilarityOptimization = true
ot.evalCtx.SessionData().OptimizerUseImprovedDistinctOnLimitHintCosting = true
ot.evalCtx.SessionData().TrigramSimilarityThreshold = 0.3

return ot
Expand Down
16 changes: 11 additions & 5 deletions pkg/sql/opt/xform/coster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1405,12 +1405,18 @@ func (c *coster) computeGroupingCost(grouping memo.RelExpr, required *physical.R
// Normally, a grouping expression must process each input row once.
inputRowCount := grouping.Child(0).(memo.RelExpr).Relational().Statistics().RowCount

// If this is a streaming GroupBy with a limit hint, l, we only need to
// process enough input rows to output l rows.
// If this is a streaming GroupBy or a DistinctOn with a limit hint, l, we
// only need to process enough input rows to output l rows.
streamingType := private.GroupingOrderType(&required.Ordering)
if (streamingType != memo.NoStreaming) && grouping.Op() == opt.GroupByOp && required.LimitHint > 0 {
inputRowCount = streamingGroupByInputLimitHint(inputRowCount, outputRowCount, required.LimitHint)
outputRowCount = math.Min(outputRowCount, required.LimitHint)
if required.LimitHint > 0 {
if grouping.Op() == opt.GroupByOp && streamingType != memo.NoStreaming {
inputRowCount = streamingGroupByInputLimitHint(inputRowCount, outputRowCount, required.LimitHint)
outputRowCount = math.Min(outputRowCount, required.LimitHint)
} else if grouping.Op() == opt.DistinctOnOp &&
c.evalCtx.SessionData().OptimizerUseImprovedDistinctOnLimitHintCosting {
inputRowCount = distinctOnLimitHint(outputRowCount, required.LimitHint)
outputRowCount = math.Min(outputRowCount, required.LimitHint)
}
}

// Cost per row depends on the number of grouping columns and the number of
Expand Down
107 changes: 107 additions & 0 deletions pkg/sql/opt/xform/testdata/coster/groupby
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,113 @@ limit
│ └── count-rows [as=count_rows:7]
└── 10

opt
SELECT DISTINCT a FROM b
----
distinct-on
├── columns: a:2
├── grouping columns: a:2
├── internal-ordering: +2
├── stats: [rows=100, distinct(2)=100, null(2)=1]
├── cost: 1079.45
├── key: (2)
└── scan b@b_a_b_idx
├── columns: a:2
├── stats: [rows=1000, distinct(2)=100, null(2)=10]
├── cost: 1068.42
└── ordering: +2

opt
SELECT DISTINCT a FROM b LIMIT 10
----
limit
├── columns: a:2
├── cardinality: [0 - 10]
├── stats: [rows=10]
├── cost: 32.5488337
├── key: (2)
├── distinct-on
│ ├── columns: a:2
│ ├── grouping columns: a:2
│ ├── internal-ordering: +2
│ ├── stats: [rows=100, distinct(2)=100, null(2)=1]
│ ├── cost: 32.4388337
│ ├── key: (2)
│ ├── limit hint: 10.00
│ └── scan b@b_a_b_idx
│ ├── columns: a:2
│ ├── stats: [rows=1000, distinct(2)=100, null(2)=10]
│ ├── cost: 31.281321
│ ├── ordering: +2
│ └── limit hint: 12.75
└── 10

opt set=(optimizer_use_improved_distinct_on_limit_hint_costing=false)
SELECT DISTINCT a FROM b LIMIT 10
----
limit
├── columns: a:2
├── cardinality: [0 - 10]
├── stats: [rows=10]
├── cost: 42.421321
├── key: (2)
├── distinct-on
│ ├── columns: a:2
│ ├── grouping columns: a:2
│ ├── internal-ordering: +2
│ ├── stats: [rows=100, distinct(2)=100, null(2)=1]
│ ├── cost: 42.311321
│ ├── key: (2)
│ ├── limit hint: 10.00
│ └── scan b@b_a_b_idx
│ ├── columns: a:2
│ ├── stats: [rows=1000, distinct(2)=100, null(2)=10]
│ ├── cost: 31.281321
│ ├── ordering: +2
│ └── limit hint: 12.75
└── 10

opt
SELECT DISTINCT a, b FROM b
----
distinct-on
├── columns: a:2 b:3
├── grouping columns: a:2 b:3
├── internal-ordering: +2,+3
├── stats: [rows=1000, distinct(2,3)=1000, null(2,3)=0.1]
├── cost: 1108.55
├── key: (2,3)
└── scan b@b_a_b_idx
├── columns: a:2 b:3
├── stats: [rows=1000, distinct(2,3)=1000, null(2,3)=0.1]
├── cost: 1078.52
└── ordering: +2,+3

opt
SELECT DISTINCT a, b FROM b LIMIT 10
----
limit
├── columns: a:2 b:3
├── cardinality: [0 - 10]
├── stats: [rows=10]
├── cost: 41.0744319
├── key: (2,3)
├── distinct-on
│ ├── columns: a:2 b:3
│ ├── grouping columns: a:2 b:3
│ ├── internal-ordering: +2,+3
│ ├── stats: [rows=1000, distinct(2,3)=1000, null(2,3)=0.1]
│ ├── cost: 40.9644319
│ ├── key: (2,3)
│ ├── limit hint: 10.00
│ └── scan b@b_a_b_idx
│ ├── columns: a:2 b:3
│ ├── stats: [rows=1000, distinct(2,3)=1000, null(2,3)=0.1]
│ ├── cost: 30.6930407
│ ├── ordering: +2,+3
│ └── limit hint: 12.07
└── 10

# Partially ordered group by with a limit hint.
opt
SELECT a, c, count(*) FROM c GROUP BY a, c LIMIT 10
Expand Down
Loading

0 comments on commit d17ebfe

Please sign in to comment.