Skip to content

Commit

Permalink
opt: hoist uncorrelated equality subqueries
Browse files Browse the repository at this point in the history
Subqueries that are in equality expressions with a variable are now
hoisted. When these expressions exist in a filter, hoisting the subquery
can allow the main query to plan a lookup join, rather than an
inefficient full-table scan.

For example, consider the table and query:

    CREATE TABLE t (
      a INT,
      INDEX (a)
    );

    SELECT * FROM t WHERE a = (SELECT max(a) FROM t);

Prior to this commit, the query plan for this query required a full
table scan:

    select
     ├── columns: a:1
     ├── scan t@t_a_idx
     │    ├── columns: a:1
     │    └── constraint: /1/2: (/NULL - ]
     └── filters
          └── eq
               ├── a:1
               └── subquery
                    └── scalar-group-by
                         ├── columns: max:9
                         ├── scan t@t_a_idx,rev
                         │    ├── columns: a:5
                         │    ├── constraint: /5/6: (/NULL - ]
                         │    └── limit: 1(rev)
                         └── aggregations
                              └── const-agg [as=max:9, outer=(5)]
                                   └── a:5

By hoisting the subquery, the full table scan is replaced with a lookup
join:

    project
     ├── columns: a:1
     └── inner-join (lookup t@t_a_idx)
          ├── columns: a:1 max:9
          ├── key columns: [9] = [1]
          ├── scalar-group-by
          │    ├── columns: max:9
          │    ├── scan t@t_a_idx,rev
          │    │    ├── columns: a:5
          │    │    ├── constraint: /5/6: (/NULL - ]
          │    │    └── limit: 1(rev)
          │    └── aggregations
          │         └── const-agg [as=max:9, outer=(5)]
          │              └── a:5
          └── filters (true)

This hoisting is disabled by default, but can be enabled by setting the
`optimizer_hoist_uncorrelated_equality_subqueries` session setting to
`true`.

Fixes cockroachdb#83392
Informs cockroachdb#51820
Informs cockroachdb#93829
Informs cockroachdb#100855

Release note (performance improvement): Queries that have subqueries in
equality expressions are now more efficiently planned by the optimizer when
`optimizer_hoist_uncorrelated_equality_subqueries` is set to `true`.
  • Loading branch information
mgartner committed Apr 18, 2023
1 parent 5b26e53 commit 4627cbe
Show file tree
Hide file tree
Showing 12 changed files with 292 additions and 11 deletions.
4 changes: 4 additions & 0 deletions pkg/sql/exec_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -3394,6 +3394,10 @@ func (m *sessionDataMutator) SetOptimizerAlwaysUseHistograms(val bool) {
m.data.OptimizerAlwaysUseHistograms = val
}

func (m *sessionDataMutator) SetOptimizerHoistUncorrelatedEqualitySubqueries(val bool) {
m.data.OptimizerHoistUncorrelatedEqualitySubqueries = val
}

func (m *sessionDataMutator) SetPreparedStatementsCacheSize(val int64) {
m.data.PreparedStatementsCacheSize = val
}
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/logictest/testdata/logic_test/information_schema
Original file line number Diff line number Diff line change
Expand Up @@ -4785,6 +4785,7 @@ on_update_rehome_row_enabled on
opt_split_scan_limit 2048
optimizer on
optimizer_always_use_histograms off
optimizer_hoist_uncorrelated_equality_subqueries off
optimizer_use_forecasts on
optimizer_use_histograms on
optimizer_use_improved_disjunction_stats off
Expand Down
3 changes: 3 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/pg_catalog
Original file line number Diff line number Diff line change
Expand Up @@ -2807,6 +2807,7 @@ null_ordered_last off NULL
on_update_rehome_row_enabled on NULL NULL NULL string
opt_split_scan_limit 2048 NULL NULL NULL string
optimizer_always_use_histograms off NULL NULL NULL string
optimizer_hoist_uncorrelated_equality_subqueries off NULL NULL NULL string
optimizer_use_forecasts on NULL NULL NULL string
optimizer_use_histograms on NULL NULL NULL string
optimizer_use_improved_disjunction_stats off NULL NULL NULL string
Expand Down Expand Up @@ -2952,6 +2953,7 @@ null_ordered_last off NULL
on_update_rehome_row_enabled on NULL user NULL on on
opt_split_scan_limit 2048 NULL user NULL 2048 2048
optimizer_always_use_histograms off NULL user NULL off off
optimizer_hoist_uncorrelated_equality_subqueries off NULL user NULL on on
optimizer_use_forecasts on NULL user NULL on on
optimizer_use_histograms on NULL user NULL on on
optimizer_use_improved_disjunction_stats off NULL user NULL off off
Expand Down Expand Up @@ -3095,6 +3097,7 @@ on_update_rehome_row_enabled NULL NULL NULL
opt_split_scan_limit NULL NULL NULL NULL NULL
optimizer NULL NULL NULL NULL NULL
optimizer_always_use_histograms NULL NULL NULL NULL NULL
optimizer_hoist_uncorrelated_equality_subqueries NULL NULL NULL NULL NULL
optimizer_use_forecasts NULL NULL NULL NULL NULL
optimizer_use_histograms NULL NULL NULL NULL NULL
optimizer_use_improved_disjunction_stats NULL NULL NULL NULL NULL
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/logictest/testdata/logic_test/show_source
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ null_ordered_last off
on_update_rehome_row_enabled on
opt_split_scan_limit 2048
optimizer_always_use_histograms off
optimizer_hoist_uncorrelated_equality_subqueries off
optimizer_use_forecasts on
optimizer_use_histograms on
optimizer_use_improved_disjunction_stats off
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/opt/exec/execbuilder/testdata/udf
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ query T kvtrace
SELECT fetch_a_of_2_strict(1, NULL::INT)
----


# Regression test for #88259. Arguments to UDFs should be formatted correctly in
# EXPLAIN output.
statement ok
Expand Down
5 changes: 4 additions & 1 deletion pkg/sql/opt/memo/memo.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ type Memo struct {
useLimitOrderingForStreamingGroupBy bool
useImprovedSplitDisjunctionForJoins bool
alwaysUseHistograms bool
hoistUncorrelatedEqualitySubqueries bool

// curRank is the highest currently in-use scalar expression rank.
curRank opt.ScalarRank
Expand Down Expand Up @@ -219,6 +220,7 @@ func (m *Memo) Init(evalCtx *eval.Context) {
useLimitOrderingForStreamingGroupBy: evalCtx.SessionData().OptimizerUseLimitOrderingForStreamingGroupBy,
useImprovedSplitDisjunctionForJoins: evalCtx.SessionData().OptimizerUseImprovedSplitDisjunctionForJoins,
alwaysUseHistograms: evalCtx.SessionData().OptimizerAlwaysUseHistograms,
hoistUncorrelatedEqualitySubqueries: evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries,
}
m.metadata.Init()
m.logPropsBuilder.init(evalCtx, m)
Expand Down Expand Up @@ -359,7 +361,8 @@ func (m *Memo) IsStale(
m.useImprovedDisjunctionStats != evalCtx.SessionData().OptimizerUseImprovedDisjunctionStats ||
m.useLimitOrderingForStreamingGroupBy != evalCtx.SessionData().OptimizerUseLimitOrderingForStreamingGroupBy ||
m.useImprovedSplitDisjunctionForJoins != evalCtx.SessionData().OptimizerUseImprovedSplitDisjunctionForJoins ||
m.alwaysUseHistograms != evalCtx.SessionData().OptimizerAlwaysUseHistograms {
m.alwaysUseHistograms != evalCtx.SessionData().OptimizerAlwaysUseHistograms ||
m.hoistUncorrelatedEqualitySubqueries != evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries {
return true, nil
}

Expand Down
6 changes: 6 additions & 0 deletions pkg/sql/opt/memo/memo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,12 @@ func TestMemoIsStale(t *testing.T) {
evalCtx.SessionData().OptimizerAlwaysUseHistograms = false
notStale()

// Stale optimizer_hoist_uncorrelated_equality_subqueries.
evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries = true
stale()
evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries = false
notStale()

// Stale data sources and schema. Create new catalog so that data sources are
// recreated and can be modified independently.
catalog = testcat.New()
Expand Down
23 changes: 22 additions & 1 deletion pkg/sql/opt/norm/decorrelate_funcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,20 @@ func (c *CustomFuncs) deriveHasHoistableSubquery(scalar opt.ScalarExpr) bool {
case *memo.UDFExpr:
// Do not attempt to hoist UDFs.
return false

case *memo.EqExpr:
// Hoist subqueries in expressions like (Eq (Variable) (Subquery)) if
// the corresponding session setting is enabled.
// TODO(mgartner): We could hoist if we have an IS NOT DISTINCT FROM
// expression. But it won't currently lead to a lookup join due to
// #100855 and the plan could be worse, so we avoid it for now.
if c.f.evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries {
_, isLeftVar := scalar.Child(0).(*memo.VariableExpr)
_, isRightSubquery := scalar.Child(1).(*memo.SubqueryExpr)
if isLeftVar && isRightSubquery {
return true
}
}
}

// If HasHoistableSubquery is true for any child, then it's true for this
Expand Down Expand Up @@ -796,7 +810,14 @@ func (r *subqueryHoister) hoistAll(scalar opt.ScalarExpr) opt.ScalarExpr {
switch scalar.Op() {
case opt.SubqueryOp, opt.ExistsOp, opt.AnyOp, opt.ArrayFlattenOp:
subquery := scalar.Child(0).(memo.RelExpr)
if subquery.Relational().OuterCols.Empty() {
// According to the implementation of deriveHasHoistableSubquery,
// Exists, Any, and ArrayFlatten expressions are only hoistable if they
// are correlated. Uncorrelated subquery expressions are hoistable if
// the corresponding session setting is enabled and they are part of an
// equality expression with a variable.
uncorrelatedHoistAllowed := scalar.Op() == opt.SubqueryOp &&
r.f.evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries
if subquery.Relational().OuterCols.Empty() && !uncorrelatedHoistAllowed {
break
}

Expand Down
235 changes: 226 additions & 9 deletions pkg/sql/opt/norm/testdata/rules/decorrelate
Original file line number Diff line number Diff line change
Expand Up @@ -4585,6 +4585,128 @@ project
│ └── 1
└── true_agg:17 IS NOT NULL

# Hoist an uncorrelated equality subquery.
norm expect=HoistSelectSubquery set=optimizer_hoist_uncorrelated_equality_subqueries=on
SELECT * FROM a WHERE k = (SELECT max(x) FROM xy)
----
project
├── columns: k:1!null i:2 f:3 s:4 j:5
├── cardinality: [0 - 1]
├── key: ()
├── fd: ()-->(1-5)
└── inner-join (hash)
├── columns: k:1!null i:2 f:3 s:4 j:5 max:12!null
├── cardinality: [0 - 1]
├── multiplicity: left-rows(zero-or-one), right-rows(zero-or-one)
├── key: ()
├── fd: ()-->(1-5,12), (12)==(1), (1)==(12)
├── scan a
│ ├── columns: k:1!null i:2 f:3 s:4 j:5
│ ├── key: (1)
│ └── fd: (1)-->(2-5)
├── scalar-group-by
│ ├── columns: max:12
│ ├── cardinality: [1 - 1]
│ ├── key: ()
│ ├── fd: ()-->(12)
│ ├── scan xy
│ │ ├── columns: x:8!null
│ │ └── key: (8)
│ └── aggregations
│ └── max [as=max:12, outer=(8)]
│ └── x:8
└── filters
└── k:1 = max:12 [outer=(1,12), constraints=(/1: (/NULL - ]; /12: (/NULL - ]), fd=(1)==(12), (12)==(1)]

# Hoist an uncorrelated equality subquery that could return multiple rows.
norm expect=HoistSelectSubquery set=optimizer_hoist_uncorrelated_equality_subqueries=on
SELECT * FROM a WHERE k = (SELECT x FROM xy)
----
project
├── columns: k:1!null i:2 f:3 s:4 j:5
├── cardinality: [0 - 1]
├── key: ()
├── fd: ()-->(1-5)
└── inner-join (hash)
├── columns: k:1!null i:2 f:3 s:4 j:5 x:8!null
├── cardinality: [0 - 1]
├── multiplicity: left-rows(zero-or-one), right-rows(zero-or-one)
├── key: ()
├── fd: ()-->(1-5,8), (8)==(1), (1)==(8)
├── scan a
│ ├── columns: k:1!null i:2 f:3 s:4 j:5
│ ├── key: (1)
│ └── fd: (1)-->(2-5)
├── max1-row
│ ├── columns: x:8!null
│ ├── error: "more than one row returned by a subquery used as an expression"
│ ├── cardinality: [0 - 1]
│ ├── key: ()
│ ├── fd: ()-->(8)
│ └── scan xy
│ ├── columns: x:8!null
│ └── key: (8)
└── filters
└── k:1 = x:8 [outer=(1,8), constraints=(/1: (/NULL - ]; /8: (/NULL - ]), fd=(1)==(8), (8)==(1)]

# Do not hoist an uncorrelated equality subquery if the corresponding session
# setting is disabled.
norm expect-not=HoistSelectSubquery
SELECT * FROM a WHERE k = (SELECT max(x) FROM xy)
----
select
├── columns: k:1!null i:2 f:3 s:4 j:5
├── key: (1)
├── fd: (1)-->(2-5)
├── scan a
│ ├── columns: k:1!null i:2 f:3 s:4 j:5
│ ├── key: (1)
│ └── fd: (1)-->(2-5)
└── filters
└── eq [outer=(1), subquery, constraints=(/1: (/NULL - ])]
├── k:1
└── subquery
└── scalar-group-by
├── columns: max:12
├── cardinality: [1 - 1]
├── key: ()
├── fd: ()-->(12)
├── scan xy
│ ├── columns: x:8!null
│ └── key: (8)
└── aggregations
└── max [as=max:12, outer=(8)]
└── x:8

# Do not hoist an uncorrelated inequality subquery. We have not yet proven that
# it will lead to a better plan.
norm expect-not=HoistSelectSubquery set=optimizer_hoist_uncorrelated_equality_subqueries=on
SELECT * FROM a WHERE k < (SELECT max(x) FROM xy)
----
select
├── columns: k:1!null i:2 f:3 s:4 j:5
├── key: (1)
├── fd: (1)-->(2-5)
├── scan a
│ ├── columns: k:1!null i:2 f:3 s:4 j:5
│ ├── key: (1)
│ └── fd: (1)-->(2-5)
└── filters
└── lt [outer=(1), subquery, constraints=(/1: (/NULL - ])]
├── k:1
└── subquery
└── scalar-group-by
├── columns: max:12
├── cardinality: [1 - 1]
├── key: ()
├── fd: ()-->(12)
├── scan xy
│ ├── columns: x:8!null
│ └── key: (8)
└── aggregations
└── max [as=max:12, outer=(8)]
└── x:8

# --------------------------------------------------
# HoistProjectSubquery
# --------------------------------------------------
Expand Down Expand Up @@ -4896,19 +5018,114 @@ values
│ └── y:9 = i:2 [outer=(2,9), constraints=(/2: (/NULL - ]; /9: (/NULL - ]), fd=(2)==(9), (9)==(2)]
└── 1

# Don't hoist uncorrelated subquery.
norm
SELECT i < ANY(SELECT y FROM xy) AS r FROM a
# Hoist an uncorrelated equality subquery.
norm expect=HoistProjectSubquery set=optimizer_hoist_uncorrelated_equality_subqueries=on
SELECT k = (SELECT max(x) FROM xy) FROM a
----
project
├── columns: r:12
├── columns: "?column?":13
├── inner-join (cross)
│ ├── columns: k:1!null max:12
│ ├── multiplicity: left-rows(exactly-one), right-rows(zero-or-more)
│ ├── key: (1)
│ ├── fd: ()-->(12)
│ ├── scan a
│ │ ├── columns: k:1!null
│ │ └── key: (1)
│ ├── scalar-group-by
│ │ ├── columns: max:12
│ │ ├── cardinality: [1 - 1]
│ │ ├── key: ()
│ │ ├── fd: ()-->(12)
│ │ ├── scan xy
│ │ │ ├── columns: x:8!null
│ │ │ └── key: (8)
│ │ └── aggregations
│ │ └── max [as=max:12, outer=(8)]
│ │ └── x:8
│ └── filters (true)
└── projections
└── k:1 = max:12 [as="?column?":13, outer=(1,12)]

# Hoist an uncorrelated equality subquery that could return multiple rows.
norm expect=HoistProjectSubquery set=optimizer_hoist_uncorrelated_equality_subqueries=on
SELECT k = (SELECT x FROM xy) FROM a
----
project
├── columns: "?column?":12
├── left-join (cross)
│ ├── columns: k:1!null x:8
│ ├── multiplicity: left-rows(exactly-one), right-rows(zero-or-more)
│ ├── key: (1)
│ ├── fd: (1)-->(8)
│ ├── scan a
│ │ ├── columns: k:1!null
│ │ └── key: (1)
│ ├── max1-row
│ │ ├── columns: x:8!null
│ │ ├── error: "more than one row returned by a subquery used as an expression"
│ │ ├── cardinality: [0 - 1]
│ │ ├── key: ()
│ │ ├── fd: ()-->(8)
│ │ └── scan xy
│ │ ├── columns: x:8!null
│ │ └── key: (8)
│ └── filters (true)
└── projections
└── k:1 = x:8 [as="?column?":12, outer=(1,8)]

# Do not hoist an uncorrelated equality subquery if the corresponding session
# setting is disabled.
norm expect-not=HoistProjectSubquery
SELECT k = (SELECT max(x) FROM xy) FROM a
----
project
├── columns: "?column?":13
├── scan a
│ └── columns: i:2
│ ├── columns: k:1!null
│ └── key: (1)
└── projections
└── eq [as="?column?":13, outer=(1), subquery]
├── k:1
└── subquery
└── scalar-group-by
├── columns: max:12
├── cardinality: [1 - 1]
├── key: ()
├── fd: ()-->(12)
├── scan xy
│ ├── columns: x:8!null
│ └── key: (8)
└── aggregations
└── max [as=max:12, outer=(8)]
└── x:8

# Do not hoist an uncorrelated inequality subquery. We have not yet proven that
# it will lead to a better plan.
norm expect-not=HoistProjectSubquery set=optimizer_hoist_uncorrelated_equality_subqueries=on
SELECT k < (SELECT max(x) FROM xy) FROM a
----
project
├── columns: "?column?":13
├── scan a
│ ├── columns: k:1!null
│ └── key: (1)
└── projections
└── any: lt [as=r:12, outer=(2), correlated-subquery]
├── scan xy
│ └── columns: y:9
└── i:2
└── lt [as="?column?":13, outer=(1), subquery]
├── k:1
└── subquery
└── scalar-group-by
├── columns: max:12
├── cardinality: [1 - 1]
├── key: ()
├── fd: ()-->(12)
├── scan xy
│ ├── columns: x:8!null
│ └── key: (8)
└── aggregations
└── max [as=max:12, outer=(8)]
└── x:8


# --------------------------------------------------
# HoistJoinSubquery
Expand Down
2 changes: 2 additions & 0 deletions pkg/sql/opt/testutils/opttester/opt_tester.go
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,8 @@ func New(catalog cat.Catalog, sql string) *OptTester {
ot.evalCtx.SessionData().OptSplitScanLimit = tabledesc.MaxBucketAllowed
ot.evalCtx.SessionData().VariableInequalityLookupJoinEnabled = true
ot.evalCtx.SessionData().OptimizerUseImprovedSplitDisjunctionForJoins = true
ot.evalCtx.SessionData().OptimizerAlwaysUseHistograms = true
// ot.evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries = true

return ot
}
Expand Down
Loading

0 comments on commit 4627cbe

Please sign in to comment.