Skip to content

Commit

Permalink
opt: Hoist Exists operator and try to decorrelate
Browse files Browse the repository at this point in the history
Transform EXISTS clause in WHERE clauses into a SemiJoinApply or
AntiSemiJoinApply operator. Additional rules will attempt to decorrelate
the right operand of the Apply by pushing the Apply down through any Select
operator. Example:

  SELECT * FROM a WHERE EXISTS(SELECT * FROM b WHERE a.x=b.x)
  =>
  SELECT * FROM a SEMI JOIN APPLY (SELECT * FROM b WHERE a.x=b.x)
  =>
  SELECT * FROM a SEMI JOIN b WHERE a.x=b.x

Release note: None
  • Loading branch information
andy-kimball committed Apr 24, 2018
1 parent b3c4d9e commit d3c6e99
Show file tree
Hide file tree
Showing 12 changed files with 1,137 additions and 153 deletions.
3 changes: 1 addition & 2 deletions pkg/sql/opt/memo/logical_props_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,8 +249,7 @@ func (f logicalPropsFactory) constructGroupByProps(ev ExprView) LogicalProps {

// Any outer columns from aggregation expressions that are not bound by the
// input columns are outer columns.
props.Relational.OuterCols = aggProps.OuterCols.Copy()
props.Relational.OuterCols.DifferenceWith(inputProps.OutputCols)
props.Relational.OuterCols = aggProps.OuterCols.Difference(inputProps.OutputCols)
props.Relational.OuterCols.UnionWith(inputProps.OuterCols)

// Scalar group by has no grouping columns and always a single row.
Expand Down
130 changes: 92 additions & 38 deletions pkg/sql/opt/norm/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,25 @@ func (f *Factory) listOnlyHasNulls(list memo.ListID) bool {
return true
}

// removeListItem returns a new interned list that is a copy of the given list
// with the first occurrence of the search item removed. Later occurrences of
// the same item, if any, are kept, and the relative order of the remaining
// items is preserved.
//
// The caller must guarantee that the list contains the search item;
// removeListItem panics if it does not (this includes the empty list).
func (f *Factory) removeListItem(list memo.ListID, search memo.GroupID) memo.ListID {
	existingList := f.mem.LookupList(list)
	for i, item := range existingList {
		if item != search {
			continue
		}

		// Build the result in a fresh slice so that the slice returned by
		// LookupList is never modified.
		newList := make([]memo.GroupID, 0, len(existingList)-1)
		newList = append(newList, existingList[:i]...)
		newList = append(newList, existingList[i+1:]...)
		return f.mem.InternList(newList)
	}

	// Fail with an explicit message rather than relying on an incidental
	// out-of-range slice access to signal the broken precondition.
	panic("removeListItem: list does not contain the search item")
}

// isSortedUniqueList returns true if the list is in sorted order, with no
// duplicates. See the comment for listSorter.compare for comparison rule
// details.
Expand Down Expand Up @@ -336,14 +355,17 @@ func (f *Factory) outerCols(group memo.GroupID) opt.ColSet {
return f.lookupLogical(group).OuterCols()
}

// synthesizedCols returns the set of columns which have been added by the given
// Project operator to its input columns. For example, the "x+1" column is a
// synthesized column in "SELECT x, x+1 FROM a".
func (f *Factory) synthesizedCols(project memo.GroupID) opt.ColSet {
synth := f.outputCols(project).Copy()
input := f.mem.NormExpr(project).AsProject().Input()
synth.DifferenceWith(f.outputCols(input))
return synth
// hasOuterCols reports whether the given group's set of outer columns is
// non-empty, meaning that it references at least one variable that is not
// bound within its own scope. For example:
//
//   SELECT * FROM a WHERE EXISTS(SELECT * FROM b WHERE b.x = a.x)
//
// Inside the EXISTS subquery, a.x refers to a column that comes from outside
// the subquery's scope, which makes it an "outer column" of the subquery (the
// comment on RelationalProps.OuterCols describes outer columns in more
// detail).
func (f *Factory) hasOuterCols(group memo.GroupID) bool {
	outer := f.outerCols(group)
	return !outer.Empty()
}

// onlyConstants returns true if the scalar expression is a "constant
Expand Down Expand Up @@ -371,6 +393,13 @@ func (f *Factory) hasSubsetCols(left, right memo.GroupID) bool {
return f.outputCols(left).SubsetOf(f.outputCols(right))
}

// isScalarGroupBy reports whether the given grouping columns belong to a
// "scalar" GroupBy operator, which is the case when the grouping column set is
// empty. A scalar GroupBy always returns exactly one row, with any aggregate
// functions operating over the entire input expression.
func (f *Factory) isScalarGroupBy(groupingCols memo.PrivateID) bool {
	// The private value of the grouping columns is stored as an opt.ColSet.
	cols := f.mem.LookupPrivate(groupingCols).(opt.ColSet)
	return cols.Empty()
}

// ----------------------------------------------------------------------
//
// Project Rules
Expand Down Expand Up @@ -578,14 +607,8 @@ func (f *Factory) offsetNoCycle(input, limit memo.GroupID, ordering memo.Private
//
// ----------------------------------------------------------------------

// emptyGroupingCols reports whether the grouping column set of a GroupBy
// operator, identified by the given private ID, contains no columns.
func (f *Factory) emptyGroupingCols(cols memo.PrivateID) bool {
	// The private value of the grouping columns is stored as an opt.ColSet.
	groupingCols := f.mem.LookupPrivate(cols).(opt.ColSet)
	return groupingCols.Empty()
}

// isCorrelated returns true if variables in the source expression reference
// columns in the destination expression. For example:
// isCorrelated returns true if any variable in the source expression references
// a column from the destination expression. For example:
// (InnerJoin
// (Scan a)
// (Scan b)
Expand All @@ -599,22 +622,25 @@ func (f *Factory) isCorrelated(src, dst memo.GroupID) bool {
return f.outerCols(src).Intersects(f.outputCols(dst))
}

// isCorrelatedCols is similar to isCorrelated, except that it checks whether
// variables in the given expression reference any of the given columns. This:
// isBoundBy returns true if all outer references in the source expression are
// bound by the destination expression. For example:
//
// (IsCorrelated $src $dst)
//
// is equivalent to this:
//
// (IsCorrelatedCols $src (OutputCols $dst))
// (InnerJoin
// (Scan a)
// (Scan b)
// (Eq (Variable a.x) (Const 1))
// )
//
func (f *Factory) isCorrelatedCols(group memo.GroupID, cols opt.ColSet) bool {
return f.outerCols(group).Intersects(cols)
// The (Eq) expression is fully bound by the (Scan a) expression because all of
// its outer references are satisfied by the columns produced by the Scan.
func (f *Factory) isBoundBy(src, dst memo.GroupID) bool {
	// src is bound by dst when every outer column of src is one of the
	// columns that dst outputs.
	outerRefs := f.outerCols(src)
	return outerRefs.SubsetOf(f.outputCols(dst))
}

// extractCorrelatedConditions returns a new list containing only those
// expressions from the given list that are correlated with the given set of
// columns. For example:
// extractBoundConditions returns a new list containing only those expressions
// from the given list that are fully bound by the given expression (i.e. all
// outer references are satisfied by it). For example:
//
// (InnerJoin
// (Scan a)
// (Scan b)
Expand All @@ -624,26 +650,27 @@ func (f *Factory) isCorrelatedCols(group memo.GroupID, cols opt.ColSet) bool {
// ])
// )
//
// Calling extractCorrelatedConditions with the filter conditions list and the
// output columns of (Scan b) would extract the (Eq) expression, since it
// references columns from b.
func (f *Factory) extractCorrelatedConditions(list memo.ListID, cols opt.ColSet) memo.ListID {
// Calling extractBoundConditions with the filter conditions list and the
// (Scan a) expression would extract the (Gt) expression, since its outer
// references only reference columns from a.
func (f *Factory) extractBoundConditions(list memo.ListID, group memo.GroupID) memo.ListID {
extracted := make([]memo.GroupID, 0, list.Length)
for _, item := range f.mem.LookupList(list) {
if f.isCorrelatedCols(item, cols) {
if f.isBoundBy(item, group) {
extracted = append(extracted, item)
}
}
return f.mem.InternList(extracted)
}

// extractUncorrelatedConditions is the inverse of extractCorrelatedConditions.
// Instead of extracting correlated expressions, it extracts list expressions
// that are *not* correlated with the destination.
func (f *Factory) extractUncorrelatedConditions(list memo.ListID, cols opt.ColSet) memo.ListID {
// extractUnboundConditions is the inverse of extractBoundConditions. Instead of
// extracting expressions that are bound by the given expression, it extracts
// list expressions that have at least one outer reference that is *not* bound
// by the given expression (i.e. it has a "free" variable).
func (f *Factory) extractUnboundConditions(list memo.ListID, group memo.GroupID) memo.ListID {
extracted := make([]memo.GroupID, 0, list.Length)
for _, item := range f.mem.LookupList(list) {
if !f.isCorrelatedCols(item, cols) {
if !f.isBoundBy(item, group) {
extracted = append(extracted, item)
}
}
Expand Down Expand Up @@ -713,6 +740,33 @@ func (f *Factory) colsAreKey(cols memo.PrivateID, group memo.GroupID) bool {
return false
}

// ----------------------------------------------------------------------
//
// Join Rules
// Custom match and replace functions used with join.opt rules.
//
// ----------------------------------------------------------------------

// removeApply constructs the non-apply join operator that corresponds to the
// given apply join operator type (e.g. InnerJoinApply => InnerJoin), passing
// the left, right and filter operands through unchanged. This is used when
// decorrelating subqueries. It panics if op is not one of the apply join
// operator types handled below.
func (f *Factory) removeApply(op opt.Operator, left, right, filter memo.GroupID) memo.GroupID {
	switch op {
	case opt.InnerJoinApplyOp:
		return f.ConstructInnerJoin(left, right, filter)

	case opt.LeftJoinApplyOp:
		return f.ConstructLeftJoin(left, right, filter)

	case opt.RightJoinApplyOp:
		return f.ConstructRightJoin(left, right, filter)

	case opt.FullJoinApplyOp:
		return f.ConstructFullJoin(left, right, filter)

	case opt.SemiJoinApplyOp:
		return f.ConstructSemiJoin(left, right, filter)

	case opt.AntiJoinApplyOp:
		return f.ConstructAntiJoin(left, right, filter)

	default:
		panic(fmt.Sprintf("unexpected join operator: %v", op))
	}
}

// ----------------------------------------------------------------------
//
// Boolean Rules
Expand Down
10 changes: 8 additions & 2 deletions pkg/sql/opt/norm/rules/citations.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,11 @@ further information, and in some cases proofs, can be found.
ACM Trans. Database Syst.. 22. 43-73. 10.1145/244810.244812.
https://www.researchgate.net/publication/220225172_Outerjoin_Simplification_and_Reordering_for_Query_Optimization

[2] M. M. Joshi and C. A. Galindo-Legaria. Properties of the GroupBy/Aggregate
relational operator. Technical report, Microsoft, 2001. MSR-TR-2001-13.
[2] M. M. Joshi and C. A. Galindo-Legaria.
Properties of the GroupBy/Aggregate relational operator.
Technical report, Microsoft, 2001. MSR-TR-2001-13.

[3] Galindo-Legaria, C.A. & Joshi, Milind. (2001).
Orthogonal Optimization of Subqueries and Aggregation.
Sigmod Record. 30. 571-581. 10.1145/375663.375748.
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.563.8492&rep=rep1&type=pdf
80 changes: 72 additions & 8 deletions pkg/sql/opt/norm/rules/join.opt
Original file line number Diff line number Diff line change
Expand Up @@ -49,38 +49,102 @@
# -- But if the filter is incorrectly pushed down, then no row is returned.
# SELECT * FROM (SELECT * FROM a WHERE a.y < 0) a LEFT JOIN b ON a.x=b.x
#
# In addition, AntiJoin is not eligible for this rule, as illustrated by this
# example:
#
# -- A row is returned for a.y=2.
# SELECT * FROM a ANTI JOIN b ON a.y < 0
#
# -- But if the filter is incorrectly pushed down, then no row is returned.
# SELECT * FROM (SELECT * FROM a WHERE a.y < 0) a ANTI JOIN b ON True
#
# Citations: [1]
[PushFilterIntoJoinLeft, Normalize]
(InnerJoin | InnerJoinApply | RightJoin | RightJoinApply
(InnerJoin | InnerJoinApply | RightJoin | RightJoinApply | SemiJoin | SemiJoinApply
$left:*
$right:*
$on:(Filters $list:[ ... $condition:* & ^(IsCorrelated $condition $right) ... ])
$on:(Filters $list:[ ... $condition:* & (IsBoundBy $condition $left) ... ])
)
=>
((OpName)
(Select
$left
(Filters (ExtractUncorrelatedConditions $list (OutputCols $right)))
(Filters (ExtractBoundConditions $list $left))
)
$right
(Filters (ExtractCorrelatedConditions $list (OutputCols $right)))
(Filters (ExtractUnboundConditions $list $left))
)

# PushFilterIntoJoinRight is symmetric with PushFilterIntoJoinLeft. It pushes
# Join filter conditions into the right side of the join rather than into the
# left side. See that rule's comments for more details.
#
# This rule triggers a cycle with the TryDecorrelateSelect rule. That rule has
# the DetectCycle tag to break the cycle.
[PushFilterIntoJoinRight, Normalize]
(InnerJoin | InnerJoinApply | LeftJoin | LeftJoinApply
(InnerJoin | InnerJoinApply | LeftJoin | LeftJoinApply |
SemiJoin | SemiJoinApply | AntiJoin | AntiJoinApply
$left:*
$right:*
$on:(Filters $list:[ ... $condition:* & ^(IsCorrelated $condition $left) ... ])
$on:(Filters $list:[ ... $condition:* & (IsBoundBy $condition $right) ... ])
)
=>
((OpName)
$left
(Select
$right
(Filters (ExtractUncorrelatedConditions $list (OutputCols $left)))
(Filters (ExtractBoundConditions $list $right))
)
(Filters (ExtractCorrelatedConditions $list (OutputCols $left)))
(Filters (ExtractUnboundConditions $list $right))
)

# DecorrelateJoin maps an apply join into the corresponding join without an
# apply if the right side of the join is not correlated with the left side
# (i.e. the IsCorrelated check of the right input against the left input
# fails). This allows the optimizer to consider additional physical join
# operators that are unable to handle correlated inputs.
#
# The replacement is built by the RemoveApply custom function, which receives
# the matched operator name along with the unchanged left, right, and on
# operands.
#
# NOTE: Keep this before other decorrelation patterns. Removing the apply as
#       soon as the join is no longer correlated avoids unnecessarily matching
#       other patterns that only exist to get to this pattern.
#
# Citations: [3]
[DecorrelateJoin, Normalize]
(JoinApply
$left:*
$right:* & ^(IsCorrelated $right $left)
$on:*
)
=>
(RemoveApply (OpName) $left $right $on)

# TryDecorrelateSelect "pushes down" the join apply into the select operator,
# in order to eliminate any correlation between the select filter list and the
# left side of the join, and also to keep "digging" down to find and eliminate
# other unnecessary correlation. Eventually, the hope is to trigger the
# DecorrelateJoin pattern to turn the JoinApply operator into a non-apply Join
# operator.
#
# Concretely, the rule matches an InnerJoinApply, LeftJoinApply, SemiJoinApply,
# or AntiJoinApply whose right input is a Select that has outer columns. It
# replaces the Select with the Select's own input, and concatenates the
# Select's filter onto the join's On condition. The join operator type is left
# unchanged.
#
# This rule triggers a cycle with the PushFilterIntoJoinRight rule. Because this
# rule has the DetectCycle tag, it is skipped once a cycle is detected. This
# gives the PushFilterIntoJoinRight rule one last chance to push Select into the
# right input, and therefore causes the normal form to be (Join (Select)) rather
# than (Select (Join)) when the choice is ambiguous.
#
# Note that citation [3] doesn't directly contain this identity, since it
# assumes that the Select will be hoisted above the Join rather than becoming
# part of its On condition. PushFilterIntoJoinRight allows the condition to be
# pushed down, so this rule can correctly pull it up.
#
# Citations: [3]
[TryDecorrelateSelect, Normalize, DetectCycle]
(InnerJoinApply | LeftJoinApply | SemiJoinApply | AntiJoinApply
$left:*
$right:(Select $input:* $filter:*) & (HasOuterCols $right)
$on:*
)
=>
((OpName)
$left
$input
(ConcatFilters $on $filter)
)
21 changes: 21 additions & 0 deletions pkg/sql/opt/norm/rules/scalar.opt
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,24 @@
)
=>
(Null (BoolType))

# EliminateExistsProject discards a Project input to the Exists operator, so
# that Exists is applied directly to the Project's own input. The Project
# operator never changes the row cardinality of its input, and row cardinality
# is the only thing that Exists cares about, so Project is a no-op here.
[EliminateExistsProject, Normalize]
(Exists (Project $input:*)) => (Exists $input)

# EliminateExistsGroupBy discards a non-scalar GroupBy input to the Exists
# operator, so that Exists is applied directly to the GroupBy's own input.
# While non-scalar GroupBy can change row cardinality, it always returns a
# non-empty set if its input is non-empty. Similarly, if its input is empty,
# then it returns the empty set. Therefore, it's a no-op for Exists.
#
# The rule only matches when the GroupBy's grouping columns fail the
# IsScalarGroupBy check. A scalar GroupBy is excluded because it always returns
# exactly one row, so removing it could change the result of Exists.
[EliminateExistsGroupBy, Normalize]
(Exists
(GroupBy
$input:*
*
$groupingCols:* & ^(IsScalarGroupBy $groupingCols)
)
)
=>
(Exists $input)
Loading

0 comments on commit d3c6e99

Please sign in to comment.