Skip to content

Commit

Permalink
opt: hoist uncorrelated subqueries at most once
Browse files Browse the repository at this point in the history
Since #100881, the optimizer has hoisted uncorrelated subqueries used in
an equality expression (only when the
`optimizer_hoist_uncorrelated_equality_subqueries` session setting is
enabled). This can cause problems when the hoisted subquery has been
duplicated in the expression tree, e.g., when pushing a filter into both
sides of a join.

A subquery is a scalar expression, so the columns of its child
expression are never emitted from the subquery. This makes it safe
duplicate a subquery in an expression tree. However, when a subquery is
hoisted, it is transformed into a join which can produce the columns of
the child expression. Hoisting the same subquery multiple times can
produce query plans with duplicate column IDs in two logically different
expressions. This can lead to incorrect query plans (see the comment for
`opt.Metadata`), as well as produce expressions with children that have
intersecting column IDs (after additional normalization rules fire).

To avoid these dangers, this commit ensures that each unique subquery is
hoisted at most once. This will prevent bad plans, but it may not
inhibit the optimizer from finding optimal plans. In the future, it may
be possible to lift this restriction by generating new column ideas for
uncorrelated subqueries each time they are hoisted.

Fixes #114703

There is no release note because the session setting enabling this bug
is disabled by default, and because the possible correctness bug is
theoretical - we have not found a reproduction of a correctness bug, but
it could exist in theory.

Release note: None
  • Loading branch information
mgartner committed Nov 27, 2023
1 parent 078c69a commit bf39fbf
Show file tree
Hide file tree
Showing 3 changed files with 216 additions and 10 deletions.
22 changes: 22 additions & 0 deletions pkg/sql/opt/metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ type Metadata struct {
// mutation operators, used to determine the logical properties of WithScan.
withBindings map[WithID]Expr

// hoistedUncorrelatedSubqueries is used to track uncorrelated subqueries
// that have been hoisted.
hoistedUncorrelatedSubqueries map[Expr]struct{}

// dataSourceDeps stores each data source object that the query depends on.
dataSourceDeps map[cat.StableID]cat.DataSource

Expand Down Expand Up @@ -1062,6 +1066,24 @@ func (md *Metadata) ForEachWithBinding(fn func(WithID, Expr)) {
}
}

// AddHoistedUncorrelatedSubquery marks the given uncorrelated subquery
// expression as hoisted. It is used to prevent hoisting the same uncorrelated
// subquery twice because that may cause two children of an expression to have
// intersecting columns (see #114703).
func (md *Metadata) AddHoistedUncorrelatedSubquery(subquery Expr) {
if md.hoistedUncorrelatedSubqueries == nil {
md.hoistedUncorrelatedSubqueries = make(map[Expr]struct{})
}
md.hoistedUncorrelatedSubqueries[subquery] = struct{}{}
}

// IsHoistedUncorrelatedSubquery returns true if the given subquery was
// previously marked as hoisted with AddHoistedUncorrelatedSubquery.
func (md *Metadata) IsHoistedUncorrelatedSubquery(subquery Expr) bool {
_, ok := md.hoistedUncorrelatedSubqueries[subquery]
return ok
}

// TestingDataSourceDeps exposes the dataSourceDeps for testing.
func (md *Metadata) TestingDataSourceDeps() map[cat.StableID]cat.DataSource {
return md.dataSourceDeps
Expand Down
33 changes: 23 additions & 10 deletions pkg/sql/opt/norm/decorrelate_funcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,22 @@ func (c *CustomFuncs) deriveHasHoistableSubquery(scalar opt.ScalarExpr) bool {
return false

case *memo.EqExpr:
// Hoist subqueries in expressions like (Eq (Variable) (Subquery)) if
// the corresponding session setting is enabled.
// Hoist subqueries in expressions like (Eq (Variable) (Subquery)) if:
//
// 1. The corresponding session setting is enabled.
// 2. And, the subquery has not already been hoisted elsewhere in the
// expression tree. Hoisting the same subquery twice could result
// in query plans where two children of an expression have
// intersecting columns (see #114703).
//
// TODO(mgartner): We could hoist if we have an IS NOT DISTINCT FROM
// expression. But it won't currently lead to a lookup join due to
// #100855 and the plan could be worse, so we avoid it for now.
if c.f.evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries {
_, isLeftVar := scalar.Child(0).(*memo.VariableExpr)
_, isRightSubquery := scalar.Child(1).(*memo.SubqueryExpr)
if isLeftVar && isRightSubquery {
subquery, isRightSubquery := scalar.Child(1).(*memo.SubqueryExpr)
if isLeftVar && isRightSubquery &&
!c.f.Metadata().IsHoistedUncorrelatedSubquery(subquery) {
return true
}
}
Expand Down Expand Up @@ -824,12 +831,18 @@ func (r *subqueryHoister) hoistAll(scalar opt.ScalarExpr) opt.ScalarExpr {
// According to the implementation of deriveHasHoistableSubquery,
// Exists, Any, and ArrayFlatten expressions are only hoistable if they
// are correlated. Uncorrelated subquery expressions are hoistable if
// the corresponding session setting is enabled and they are part of an
// equality expression with a variable.
uncorrelatedHoistAllowed := scalar.Op() == opt.SubqueryOp &&
r.f.evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries
if subquery.Relational().OuterCols.Empty() && !uncorrelatedHoistAllowed {
break
// the corresponding session setting is enabled, they are part of an
// equality expression with a variable, and they have not already been
// hoisted elsewhere in the expression tree.
if subquery.Relational().OuterCols.Empty() {
uncorrelatedHoistAllowed := scalar.Op() == opt.SubqueryOp &&
r.f.evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries &&
!r.f.Metadata().IsHoistedUncorrelatedSubquery(scalar)
if !uncorrelatedHoistAllowed {
break
}
// Mark the subquery as being hoisted.
r.f.Metadata().AddHoistedUncorrelatedSubquery(scalar)
}

switch t := scalar.(type) {
Expand Down
171 changes: 171 additions & 0 deletions pkg/sql/opt/norm/testdata/rules/decorrelate
Original file line number Diff line number Diff line change
Expand Up @@ -4847,6 +4847,177 @@ select
└── max [as=max:12, outer=(8)]
└── x:8

# The subquery should only be hoisted once to avoid creating an expression with
# children that have intersecting columns. See #114703.
norm expect=HoistSelectSubquery
SELECT NULL
FROM a AS t1
JOIN a AS t2 ON t1.i = t2.i
WHERE t2.i = (SELECT 0 FROM a)
----
project
├── columns: "?column?":23
├── fd: ()-->(23)
├── inner-join (hash)
│ ├── columns: t1.i:2!null t2.i:9!null "?column?":22!null
│ ├── fd: ()-->(2,9,22), (2)==(9,22), (22)==(2,9), (9)==(2,22)
│ ├── inner-join (hash)
│ │ ├── columns: t1.i:2!null "?column?":22!null
│ │ ├── multiplicity: left-rows(zero-or-one), right-rows(zero-or-more)
│ │ ├── fd: ()-->(2,22), (2)==(22), (22)==(2)
│ │ ├── scan a [as=t1]
│ │ │ └── columns: t1.i:2
│ │ ├── max1-row
│ │ │ ├── columns: "?column?":22!null
│ │ │ ├── error: "more than one row returned by a subquery used as an expression"
│ │ │ ├── cardinality: [0 - 1]
│ │ │ ├── key: ()
│ │ │ ├── fd: ()-->(22)
│ │ │ └── project
│ │ │ ├── columns: "?column?":22!null
│ │ │ ├── fd: ()-->(22)
│ │ │ ├── scan a
│ │ │ └── projections
│ │ │ └── 0 [as="?column?":22]
│ │ └── filters
│ │ └── t1.i:2 = "?column?":22 [outer=(2,22), constraints=(/2: (/NULL - ]; /22: (/NULL - ]), fd=(2)==(22), (22)==(2)]
│ ├── select
│ │ ├── columns: t2.i:9!null
│ │ ├── scan a [as=t2]
│ │ │ └── columns: t2.i:9
│ │ └── filters
│ │ └── eq [outer=(9), subquery, constraints=(/9: (/NULL - ])]
│ │ ├── t2.i:9
│ │ └── subquery
│ │ └── max1-row
│ │ ├── columns: "?column?":22!null
│ │ ├── error: "more than one row returned by a subquery used as an expression"
│ │ ├── cardinality: [0 - 1]
│ │ ├── key: ()
│ │ ├── fd: ()-->(22)
│ │ └── project
│ │ ├── columns: "?column?":22!null
│ │ ├── fd: ()-->(22)
│ │ ├── scan a
│ │ └── projections
│ │ └── 0 [as="?column?":22]
│ └── filters
│ └── t1.i:2 = t2.i:9 [outer=(2,9), constraints=(/2: (/NULL - ]; /9: (/NULL - ]), fd=(2)==(9), (9)==(2)]
└── projections
└── NULL [as="?column?":23]

# Each subquery should only be hoisted once.
norm
SELECT NULL
FROM a AS t1
JOIN a AS t2 ON t1.i = t2.i
WHERE t2.i = (SELECT 0 FROM a) AND t1.i = (SELECT 0 FROM a)
----
project
├── columns: "?column?":31
├── fd: ()-->(31)
├── inner-join (hash)
│ ├── columns: t1.i:2!null t2.i:9!null "?column?":22!null "?column?":30!null
│ ├── fd: ()-->(2,9,22,30), (2)==(9,22,30), (30)==(2,9,22), (22)==(2,9,30), (9)==(2,22,30)
│ ├── inner-join (hash)
│ │ ├── columns: t1.i:2!null "?column?":22!null "?column?":30!null
│ │ ├── multiplicity: left-rows(zero-or-one), right-rows(zero-or-more)
│ │ ├── fd: ()-->(2,22,30), (2)==(22,30), (30)==(2,22), (22)==(2,30)
│ │ ├── inner-join (hash)
│ │ │ ├── columns: t1.i:2!null "?column?":30!null
│ │ │ ├── multiplicity: left-rows(zero-or-one), right-rows(zero-or-more)
│ │ │ ├── fd: ()-->(2,30), (2)==(30), (30)==(2)
│ │ │ ├── scan a [as=t1]
│ │ │ │ └── columns: t1.i:2
│ │ │ ├── max1-row
│ │ │ │ ├── columns: "?column?":30!null
│ │ │ │ ├── error: "more than one row returned by a subquery used as an expression"
│ │ │ │ ├── cardinality: [0 - 1]
│ │ │ │ ├── key: ()
│ │ │ │ ├── fd: ()-->(30)
│ │ │ │ └── project
│ │ │ │ ├── columns: "?column?":30!null
│ │ │ │ ├── fd: ()-->(30)
│ │ │ │ ├── scan a
│ │ │ │ └── projections
│ │ │ │ └── 0 [as="?column?":30]
│ │ │ └── filters
│ │ │ └── t1.i:2 = "?column?":30 [outer=(2,30), constraints=(/2: (/NULL - ]; /30: (/NULL - ]), fd=(2)==(30), (30)==(2)]
│ │ ├── select
│ │ │ ├── columns: "?column?":22!null
│ │ │ ├── cardinality: [0 - 1]
│ │ │ ├── key: ()
│ │ │ ├── fd: ()-->(22)
│ │ │ ├── max1-row
│ │ │ │ ├── columns: "?column?":22!null
│ │ │ │ ├── error: "more than one row returned by a subquery used as an expression"
│ │ │ │ ├── cardinality: [0 - 1]
│ │ │ │ ├── key: ()
│ │ │ │ ├── fd: ()-->(22)
│ │ │ │ └── project
│ │ │ │ ├── columns: "?column?":22!null
│ │ │ │ ├── fd: ()-->(22)
│ │ │ │ ├── scan a
│ │ │ │ └── projections
│ │ │ │ └── 0 [as="?column?":22]
│ │ │ └── filters
│ │ │ └── eq [outer=(22), subquery, constraints=(/22: (/NULL - ])]
│ │ │ ├── "?column?":22
│ │ │ └── subquery
│ │ │ └── max1-row
│ │ │ ├── columns: "?column?":30!null
│ │ │ ├── error: "more than one row returned by a subquery used as an expression"
│ │ │ ├── cardinality: [0 - 1]
│ │ │ ├── key: ()
│ │ │ ├── fd: ()-->(30)
│ │ │ └── project
│ │ │ ├── columns: "?column?":30!null
│ │ │ ├── fd: ()-->(30)
│ │ │ ├── scan a
│ │ │ └── projections
│ │ │ └── 0 [as="?column?":30]
│ │ └── filters
│ │ └── t1.i:2 = "?column?":22 [outer=(2,22), constraints=(/2: (/NULL - ]; /22: (/NULL - ]), fd=(2)==(22), (22)==(2)]
│ ├── select
│ │ ├── columns: t2.i:9!null
│ │ ├── scan a [as=t2]
│ │ │ └── columns: t2.i:9
│ │ └── filters
│ │ ├── eq [outer=(9), subquery, constraints=(/9: (/NULL - ])]
│ │ │ ├── t2.i:9
│ │ │ └── subquery
│ │ │ └── max1-row
│ │ │ ├── columns: "?column?":22!null
│ │ │ ├── error: "more than one row returned by a subquery used as an expression"
│ │ │ ├── cardinality: [0 - 1]
│ │ │ ├── key: ()
│ │ │ ├── fd: ()-->(22)
│ │ │ └── project
│ │ │ ├── columns: "?column?":22!null
│ │ │ ├── fd: ()-->(22)
│ │ │ ├── scan a
│ │ │ └── projections
│ │ │ └── 0 [as="?column?":22]
│ │ └── eq [outer=(9), subquery, constraints=(/9: (/NULL - ])]
│ │ ├── t2.i:9
│ │ └── subquery
│ │ └── max1-row
│ │ ├── columns: "?column?":30!null
│ │ ├── error: "more than one row returned by a subquery used as an expression"
│ │ ├── cardinality: [0 - 1]
│ │ ├── key: ()
│ │ ├── fd: ()-->(30)
│ │ └── project
│ │ ├── columns: "?column?":30!null
│ │ ├── fd: ()-->(30)
│ │ ├── scan a
│ │ └── projections
│ │ └── 0 [as="?column?":30]
│ └── filters
│ └── t1.i:2 = t2.i:9 [outer=(2,9), constraints=(/2: (/NULL - ]; /9: (/NULL - ]), fd=(2)==(9), (9)==(2)]
└── projections
└── NULL [as="?column?":31]


# --------------------------------------------------
# HoistProjectSubquery
Expand Down

0 comments on commit bf39fbf

Please sign in to comment.