Merge #122442

122442: opt: consider limit hint when costing distinct-on expressions r=mgartner a=mgartner #### opt: add coster tests for distinct-on expressions This commit adds coster tests for distinct-on expressions. There were no such tests before. Release note: None #### opt: consider limit hint when costing distinct-on expressions Limit hints are now considered when costing DistinctOn expressions, similarly to streaming GroupBy expressions. This matches the behavior of the execution logic, where a DistinctOn emits a row with distinct grouping columns as soon as it is read from its input. It does not need to read all of its input to begin emitting rows. Release note (performance improvement): The optimizer now costs distinct-on operators more accurately. It may produce more efficient query plans in some cases. Epic: CRDB-37714 #### opt: add optimizer_use_improved_distinct_on_limit_hint_costing The `optimizer_use_improved_distinct_on_limit_hint_costing` session setting has been added which enables the improved costing of DistinctOn expressions with limit hints added in the previous commit. It is enabled by default. Release note: None Co-authored-by: Marcus Gartner <[email protected]>
cockroachdb · Apr 18, 2024 · d17ebfe · d17ebfe
2 parents 859b6b1 + af62526
commit d17ebfe
Show file tree

Hide file tree

Showing 15 changed files with 179 additions and 32 deletions.
diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go
@@ -3753,6 +3753,10 @@ func (m *sessionDataMutator) SetOptimizerUseTrigramSimilarityOptimization(val bo
 	m.data.OptimizerUseTrigramSimilarityOptimization = val
 }
 
+func (m *sessionDataMutator) SetOptimizerUseImprovedDistinctOnLimitHintCosting(val bool) {
+	m.data.OptimizerUseImprovedDistinctOnLimitHintCosting = val
+}
+
 // Utility functions related to scrubbing sensitive information on SQL Stats.
 
 // quantizeCounts ensures that the Count field in the

diff --git a/pkg/sql/logictest/testdata/logic_test/information_schema b/pkg/sql/logictest/testdata/logic_test/information_schema
@@ -6166,6 +6166,7 @@ optimizer_use_forecasts                                    on
 optimizer_use_histograms                                   on
 optimizer_use_improved_computed_column_filters_derivation  on
 optimizer_use_improved_disjunction_stats                   on
+optimizer_use_improved_distinct_on_limit_hint_costing      on
 optimizer_use_improved_join_elimination                    on
 optimizer_use_improved_split_disjunction_for_joins         on
 optimizer_use_limit_ordering_for_streaming_group_by        on

diff --git a/pkg/sql/logictest/testdata/logic_test/pg_catalog b/pkg/sql/logictest/testdata/logic_test/pg_catalog
@@ -2892,6 +2892,7 @@ optimizer_use_forecasts                                    on                  N
 optimizer_use_histograms                                   on                  NULL      NULL        NULL        string
 optimizer_use_improved_computed_column_filters_derivation  on                  NULL      NULL        NULL        string
 optimizer_use_improved_disjunction_stats                   on                  NULL      NULL        NULL        string
+optimizer_use_improved_distinct_on_limit_hint_costing      on                  NULL      NULL        NULL        string
 optimizer_use_improved_join_elimination                    on                  NULL      NULL        NULL        string
 optimizer_use_improved_split_disjunction_for_joins         on                  NULL      NULL        NULL        string
 optimizer_use_limit_ordering_for_streaming_group_by        on                  NULL      NULL        NULL        string
@@ -3071,6 +3072,7 @@ optimizer_use_forecasts                                    on                  N
 optimizer_use_histograms                                   on                  NULL  user     NULL      on                  on
 optimizer_use_improved_computed_column_filters_derivation  on                  NULL  user     NULL      on                  on
 optimizer_use_improved_disjunction_stats                   on                  NULL  user     NULL      on                  on
+optimizer_use_improved_distinct_on_limit_hint_costing      on                  NULL  user     NULL      on                  on
 optimizer_use_improved_join_elimination                    on                  NULL  user     NULL      on                  on
 optimizer_use_improved_split_disjunction_for_joins         on                  NULL  user     NULL      on                  on
 optimizer_use_limit_ordering_for_streaming_group_by        on                  NULL  user     NULL      on                  on
@@ -3249,6 +3251,7 @@ optimizer_use_forecasts                                    NULL    NULL     NULL
 optimizer_use_histograms                                   NULL    NULL     NULL     NULL        NULL
 optimizer_use_improved_computed_column_filters_derivation  NULL    NULL     NULL     NULL        NULL
 optimizer_use_improved_disjunction_stats                   NULL    NULL     NULL     NULL        NULL
+optimizer_use_improved_distinct_on_limit_hint_costing      NULL    NULL     NULL     NULL        NULL
 optimizer_use_improved_join_elimination                    NULL    NULL     NULL     NULL        NULL
 optimizer_use_improved_split_disjunction_for_joins         NULL    NULL     NULL     NULL        NULL
 optimizer_use_limit_ordering_for_streaming_group_by        NULL    NULL     NULL     NULL        NULL

diff --git a/pkg/sql/logictest/testdata/logic_test/show_source b/pkg/sql/logictest/testdata/logic_test/show_source
@@ -129,6 +129,7 @@ optimizer_use_forecasts                                    on
 optimizer_use_histograms                                   on
 optimizer_use_improved_computed_column_filters_derivation  on
 optimizer_use_improved_disjunction_stats                   on
+optimizer_use_improved_distinct_on_limit_hint_costing      on
 optimizer_use_improved_join_elimination                    on
 optimizer_use_improved_split_disjunction_for_joins         on
 optimizer_use_limit_ordering_for_streaming_group_by        on

diff --git a/pkg/sql/opt/bench/bench_test.go b/pkg/sql/opt/bench/bench_test.go
@@ -732,6 +732,7 @@ func newHarness(tb testing.TB, query benchQuery, schemas []string) *harness {
 	h.evalCtx.SessionData().VariableInequalityLookupJoinEnabled = true
 	h.evalCtx.SessionData().OptimizerUseVirtualComputedColumnStats = true
 	h.evalCtx.SessionData().OptimizerUseTrigramSimilarityOptimization = true
+	h.evalCtx.SessionData().OptimizerUseImprovedDistinctOnLimitHintCosting = true
 
 	// Set up the test catalog.
 	h.testCat = testcat.New()

diff --git a/pkg/sql/opt/memo/memo.go b/pkg/sql/opt/memo/memo.go
@@ -190,6 +190,7 @@ type Memo struct {
 	plpgsqlUseStrictInto                       bool
 	useVirtualComputedColumnStats              bool
 	useTrigramSimilarityOptimization           bool
+	useImprovedDistinctOnLimitHintCosting      bool
 	trigramSimilarityThreshold                 float64
 	splitScanLimit                             int32
 
@@ -268,6 +269,7 @@ func (m *Memo) Init(ctx context.Context, evalCtx *eval.Context) {
 		plpgsqlUseStrictInto:                       evalCtx.SessionData().PLpgSQLUseStrictInto,
 		useVirtualComputedColumnStats:              evalCtx.SessionData().OptimizerUseVirtualComputedColumnStats,
 		useTrigramSimilarityOptimization:           evalCtx.SessionData().OptimizerUseTrigramSimilarityOptimization,
+		useImprovedDistinctOnLimitHintCosting:      evalCtx.SessionData().OptimizerUseImprovedDistinctOnLimitHintCosting,
 		trigramSimilarityThreshold:                 evalCtx.SessionData().TrigramSimilarityThreshold,
 		splitScanLimit:                             evalCtx.SessionData().OptSplitScanLimit,
 		txnIsoLevel:                                evalCtx.TxnIsoLevel,
@@ -424,6 +426,7 @@ func (m *Memo) IsStale(
 		m.plpgsqlUseStrictInto != evalCtx.SessionData().PLpgSQLUseStrictInto ||
 		m.useVirtualComputedColumnStats != evalCtx.SessionData().OptimizerUseVirtualComputedColumnStats ||
 		m.useTrigramSimilarityOptimization != evalCtx.SessionData().OptimizerUseTrigramSimilarityOptimization ||
+		m.useImprovedDistinctOnLimitHintCosting != evalCtx.SessionData().OptimizerUseImprovedDistinctOnLimitHintCosting ||
 		m.trigramSimilarityThreshold != evalCtx.SessionData().TrigramSimilarityThreshold ||
 		m.splitScanLimit != evalCtx.SessionData().OptSplitScanLimit ||
 		m.txnIsoLevel != evalCtx.TxnIsoLevel {

diff --git a/pkg/sql/opt/memo/memo_test.go b/pkg/sql/opt/memo/memo_test.go
@@ -448,6 +448,12 @@ func TestMemoIsStale(t *testing.T) {
 	evalCtx.SessionData().OptimizerUseTrigramSimilarityOptimization = false
 	notStale()
 
+	// Stale optimizer_use_improved_distinct_on_limit_hint_costing.
+	evalCtx.SessionData().OptimizerUseImprovedDistinctOnLimitHintCosting = true
+	stale()
+	evalCtx.SessionData().OptimizerUseImprovedDistinctOnLimitHintCosting = false
+	notStale()
+
 	// Stale pg_trgm.similarity_threshold.
 	evalCtx.SessionData().TrigramSimilarityThreshold = 0.5
 	stale()

diff --git a/pkg/sql/opt/memo/testdata/stats/select b/pkg/sql/opt/memo/testdata/stats/select
@@ -2573,7 +2573,6 @@ limit
  ├── distinct-on
  │    ├── columns: k:1(int!null) a:2(string) b:3(string) c:4(string)
  │    ├── grouping columns: k:1(int!null)
- │    ├── internal-ordering: +1
  │    ├── stats: [rows=1.9995]
  │    ├── key: (1)
  │    ├── fd: (1)-->(2-4)
@@ -2583,33 +2582,28 @@ limit
  │    │    ├── left columns: k:7(int) a:8(string) b:9(string) c:10(string)
  │    │    ├── right columns: k:13(int) a:14(string) b:15(string) c:16(string)
  │    │    ├── stats: [rows=2]
- │    │    ├── ordering: +1
  │    │    ├── index-join disjunction
  │    │    │    ├── columns: k:7(int!null) a:8(string!null) b:9(string) c:10(string)
  │    │    │    ├── stats: [rows=1, distinct(8)=1, null(8)=0]
  │    │    │    ├── key: (7)
  │    │    │    ├── fd: ()-->(8), (7)-->(9,10)
- │    │    │    ├── ordering: +7 opt(8) [actual: +7]
  │    │    │    └── scan disjunction@a_idx
  │    │    │         ├── columns: k:7(int!null) a:8(string!null)
  │    │    │         ├── constraint: /8/7: [/'foo' - /'foo']
  │    │    │         ├── stats: [rows=1, distinct(8)=1, null(8)=0]
  │    │    │         ├── key: (7)
- │    │    │         ├── fd: ()-->(8)
- │    │    │         └── ordering: +7 opt(8) [actual: +7]
+ │    │    │         └── fd: ()-->(8)
  │    │    └── index-join disjunction
  │    │         ├── columns: k:13(int!null) a:14(string) b:15(string!null) c:16(string)
  │    │         ├── stats: [rows=1, distinct(15)=1, null(15)=0]
  │    │         ├── key: (13)
  │    │         ├── fd: ()-->(15), (13)-->(14,16)
- │    │         ├── ordering: +13 opt(15) [actual: +13]
  │    │         └── scan disjunction@b_idx
  │    │              ├── columns: k:13(int!null) b:15(string!null)
  │    │              ├── constraint: /15/13: [/'foo' - /'foo']
  │    │              ├── stats: [rows=1, distinct(15)=1, null(15)=0]
  │    │              ├── key: (13)
- │    │              ├── fd: ()-->(15)
- │    │              └── ordering: +13 opt(15) [actual: +13]
+ │    │              └── fd: ()-->(15)
  │    └── aggregations
  │         ├── const-agg [as=a:2, type=string, outer=(2)]
  │         │    └── a:2 [type=string]
@@ -2632,7 +2626,6 @@ limit
  ├── distinct-on
  │    ├── columns: k:1(int!null) a:2(string) b:3(string) c:4(string)
  │    ├── grouping columns: k:1(int!null)
- │    ├── internal-ordering: +1
  │    ├── stats: [rows=2.998501]
  │    ├── key: (1)
  │    ├── fd: (1)-->(2-4)
@@ -2642,27 +2635,24 @@ limit
  │    │    ├── left columns: k:7(int) a:8(string) b:9(string) c:10(string)
  │    │    ├── right columns: k:13(int) a:14(string) b:15(string) c:16(string)
  │    │    ├── stats: [rows=2.999501]
- │    │    ├── ordering: +1
  │    │    ├── index-join disjunction
  │    │    │    ├── columns: k:7(int!null) a:8(string!null) b:9(string) c:10(string)
  │    │    │    ├── stats: [rows=1, distinct(8)=1, null(8)=0]
  │    │    │    ├── key: (7)
  │    │    │    ├── fd: ()-->(8), (7)-->(9,10)
- │    │    │    ├── ordering: +7 opt(8) [actual: +7]
  │    │    │    └── scan disjunction@a_idx
  │    │    │         ├── columns: k:7(int!null) a:8(string!null)
  │    │    │         ├── constraint: /8/7: [/'foo' - /'foo']
  │    │    │         ├── stats: [rows=1, distinct(8)=1, null(8)=0]
  │    │    │         ├── key: (7)
- │    │    │         ├── fd: ()-->(8)
- │    │    │         └── ordering: +7 opt(8) [actual: +7]
+ │    │    │         └── fd: ()-->(8)
  │    │    └── distinct-on
  │    │         ├── columns: k:13(int!null) a:14(string) b:15(string) c:16(string)
  │    │         ├── grouping columns: k:13(int!null)
+ │    │         ├── internal-ordering: +13
  │    │         ├── stats: [rows=1.9995]
  │    │         ├── key: (13)
  │    │         ├── fd: (13)-->(14-16)
- │    │         ├── ordering: +13
  │    │         ├── union-all
  │    │         │    ├── columns: k:13(int!null) a:14(string) b:15(string) c:16(string)
  │    │         │    ├── left columns: k:19(int) a:20(string) b:21(string) c:22(string)

diff --git a/pkg/sql/opt/norm/testdata/rules/limit b/pkg/sql/opt/norm/testdata/rules/limit
@@ -258,15 +258,16 @@ project
  │    ├── cardinality: [0 - 5]
  │    ├── key: (2,3)
  │    ├── ordering: +3
- │    ├── sort
+ │    ├── distinct-on
  │    │    ├── columns: i:2 f:3
+ │    │    ├── grouping columns: i:2 f:3
  │    │    ├── key: (2,3)
  │    │    ├── ordering: +3
  │    │    ├── limit hint: 5.00
- │    │    └── distinct-on
+ │    │    └── sort
  │    │         ├── columns: i:2 f:3
- │    │         ├── grouping columns: i:2 f:3
- │    │         ├── key: (2,3)
+ │    │         ├── ordering: +3
+ │    │         ├── limit hint: 6.02
  │    │         └── scan a
  │    │              └── columns: i:2 f:3
  │    └── 5
@@ -449,15 +450,16 @@ project
  │    │    ├── cardinality: [0 - 15]
  │    │    ├── key: (2,3)
  │    │    ├── ordering: +3
- │    │    ├── sort
+ │    │    ├── distinct-on
  │    │    │    ├── columns: i:2 f:3
+ │    │    │    ├── grouping columns: i:2 f:3
  │    │    │    ├── key: (2,3)
  │    │    │    ├── ordering: +3
  │    │    │    ├── limit hint: 15.00
- │    │    │    └── distinct-on
+ │    │    │    └── sort
  │    │    │         ├── columns: i:2 f:3
- │    │    │         ├── grouping columns: i:2 f:3
- │    │    │         ├── key: (2,3)
+ │    │    │         ├── ordering: +3
+ │    │    │         ├── limit hint: 18.16
  │    │    │         └── scan a
  │    │    │              └── columns: i:2 f:3
  │    │    └── 15

diff --git a/pkg/sql/opt/testutils/opttester/opt_tester.go b/pkg/sql/opt/testutils/opttester/opt_tester.go
@@ -315,6 +315,7 @@ func New(catalog cat.Catalog, sql string) *OptTester {
 	ot.evalCtx.SessionData().OptimizerMergeJoinsEnabled = true
 	ot.evalCtx.SessionData().OptimizerUseVirtualComputedColumnStats = true
 	ot.evalCtx.SessionData().OptimizerUseTrigramSimilarityOptimization = true
+	ot.evalCtx.SessionData().OptimizerUseImprovedDistinctOnLimitHintCosting = true
 	ot.evalCtx.SessionData().TrigramSimilarityThreshold = 0.3
 
 	return ot

diff --git a/pkg/sql/opt/xform/coster.go b/pkg/sql/opt/xform/coster.go
@@ -1405,12 +1405,18 @@ func (c *coster) computeGroupingCost(grouping memo.RelExpr, required *physical.R
 	// Normally, a grouping expression must process each input row once.
 	inputRowCount := grouping.Child(0).(memo.RelExpr).Relational().Statistics().RowCount
 
-	// If this is a streaming GroupBy with a limit hint, l, we only need to
-	// process enough input rows to output l rows.
+	// If this is a streaming GroupBy or a DistinctOn with a limit hint, l, we
+	// only need to process enough input rows to output l rows.
 	streamingType := private.GroupingOrderType(&required.Ordering)
-	if (streamingType != memo.NoStreaming) && grouping.Op() == opt.GroupByOp && required.LimitHint > 0 {
-		inputRowCount = streamingGroupByInputLimitHint(inputRowCount, outputRowCount, required.LimitHint)
-		outputRowCount = math.Min(outputRowCount, required.LimitHint)
+	if required.LimitHint > 0 {
+		if grouping.Op() == opt.GroupByOp && streamingType != memo.NoStreaming {
+			inputRowCount = streamingGroupByInputLimitHint(inputRowCount, outputRowCount, required.LimitHint)
+			outputRowCount = math.Min(outputRowCount, required.LimitHint)
+		} else if grouping.Op() == opt.DistinctOnOp &&
+			c.evalCtx.SessionData().OptimizerUseImprovedDistinctOnLimitHintCosting {
+			inputRowCount = distinctOnLimitHint(outputRowCount, required.LimitHint)
+			outputRowCount = math.Min(outputRowCount, required.LimitHint)
+		}
 	}
 
 	// Cost per row depends on the number of grouping columns and the number of

diff --git a/pkg/sql/opt/xform/testdata/coster/groupby b/pkg/sql/opt/xform/testdata/coster/groupby
@@ -129,6 +129,113 @@ limit
  │         └── count-rows [as=count_rows:7]
  └── 10
 
+opt
+SELECT DISTINCT a FROM b
+----
+distinct-on
+ ├── columns: a:2
+ ├── grouping columns: a:2
+ ├── internal-ordering: +2
+ ├── stats: [rows=100, distinct(2)=100, null(2)=1]
+ ├── cost: 1079.45
+ ├── key: (2)
+ └── scan b@b_a_b_idx
+      ├── columns: a:2
+      ├── stats: [rows=1000, distinct(2)=100, null(2)=10]
+      ├── cost: 1068.42
+      └── ordering: +2
+
+opt
+SELECT DISTINCT a FROM b LIMIT 10
+----
+limit
+ ├── columns: a:2
+ ├── cardinality: [0 - 10]
+ ├── stats: [rows=10]
+ ├── cost: 32.5488337
+ ├── key: (2)
+ ├── distinct-on
+ │    ├── columns: a:2
+ │    ├── grouping columns: a:2
+ │    ├── internal-ordering: +2
+ │    ├── stats: [rows=100, distinct(2)=100, null(2)=1]
+ │    ├── cost: 32.4388337
+ │    ├── key: (2)
+ │    ├── limit hint: 10.00
+ │    └── scan b@b_a_b_idx
+ │         ├── columns: a:2
+ │         ├── stats: [rows=1000, distinct(2)=100, null(2)=10]
+ │         ├── cost: 31.281321
+ │         ├── ordering: +2
+ │         └── limit hint: 12.75
+ └── 10
+
+opt set=(optimizer_use_improved_distinct_on_limit_hint_costing=false)
+SELECT DISTINCT a FROM b LIMIT 10
+----
+limit
+ ├── columns: a:2
+ ├── cardinality: [0 - 10]
+ ├── stats: [rows=10]
+ ├── cost: 42.421321
+ ├── key: (2)
+ ├── distinct-on
+ │    ├── columns: a:2
+ │    ├── grouping columns: a:2
+ │    ├── internal-ordering: +2
+ │    ├── stats: [rows=100, distinct(2)=100, null(2)=1]
+ │    ├── cost: 42.311321
+ │    ├── key: (2)
+ │    ├── limit hint: 10.00
+ │    └── scan b@b_a_b_idx
+ │         ├── columns: a:2
+ │         ├── stats: [rows=1000, distinct(2)=100, null(2)=10]
+ │         ├── cost: 31.281321
+ │         ├── ordering: +2
+ │         └── limit hint: 12.75
+ └── 10
+
+opt
+SELECT DISTINCT a, b FROM b
+----
+distinct-on
+ ├── columns: a:2 b:3
+ ├── grouping columns: a:2 b:3
+ ├── internal-ordering: +2,+3
+ ├── stats: [rows=1000, distinct(2,3)=1000, null(2,3)=0.1]
+ ├── cost: 1108.55
+ ├── key: (2,3)
+ └── scan b@b_a_b_idx
+      ├── columns: a:2 b:3
+      ├── stats: [rows=1000, distinct(2,3)=1000, null(2,3)=0.1]
+      ├── cost: 1078.52
+      └── ordering: +2,+3
+
+opt
+SELECT DISTINCT a, b FROM b LIMIT 10
+----
+limit
+ ├── columns: a:2 b:3
+ ├── cardinality: [0 - 10]
+ ├── stats: [rows=10]
+ ├── cost: 41.0744319
+ ├── key: (2,3)
+ ├── distinct-on
+ │    ├── columns: a:2 b:3
+ │    ├── grouping columns: a:2 b:3
+ │    ├── internal-ordering: +2,+3
+ │    ├── stats: [rows=1000, distinct(2,3)=1000, null(2,3)=0.1]
+ │    ├── cost: 40.9644319
+ │    ├── key: (2,3)
+ │    ├── limit hint: 10.00
+ │    └── scan b@b_a_b_idx
+ │         ├── columns: a:2 b:3
+ │         ├── stats: [rows=1000, distinct(2,3)=1000, null(2,3)=0.1]
+ │         ├── cost: 30.6930407
+ │         ├── ordering: +2,+3
+ │         └── limit hint: 12.07
+ └── 10
+
 # Partially ordered group by with a limit hint.
 opt
 SELECT a, c, count(*) FROM c GROUP BY a, c LIMIT 10