Skip to content

Commit

Permalink
opt: move self-join equality inference to join elimination rules
Browse files Browse the repository at this point in the history
PR cockroachdb#105214 added logic to infer equivalences for a join that is performing
a self-join with an equality between the same columns, and those columns
form a key over both inputs. This missed the fact that, for nullable columns
from the self-join table, the inferred equivalences are actually *lax*, as
opposed to strict.

A strict equivalence captures the behavior of the SQL `=` operator; strictly
equivalent columns have the same value for every row and no NULL values.
A lax equivalence is similar, but without the restriction on NULL values.
If one column is NULL for a given row, the other must still be NULL at that
row.

Adding all the inferred equivalences as strict could cause other rules that
depend on functional dependencies to perform incorrect transformations; e.g.
adding an invalid filter. This patch moves the equivalence-inference logic
to the join-elimination rules. This is valid because column remapping only
cares about the "col1 and col2 have the same values for every row" semantics
of lax equivalences. If some of those values are NULL, the remapping is
still valid. This ensures that the inferred lax equivalences are only seen
by the join-elimination rules, which know how to handle them.

Fixes cockroachdb#105608

Release note: None
  • Loading branch information
DrewKimball committed Jun 29, 2023
1 parent fa47de0 commit 845db73
Show file tree
Hide file tree
Showing 11 changed files with 375 additions and 306 deletions.
148 changes: 100 additions & 48 deletions pkg/sql/opt/exec/execbuilder/testdata/sql_activity_stats_compaction

Large diffs are not rendered by default.

64 changes: 0 additions & 64 deletions pkg/sql/opt/memo/logical_props_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/intsets"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
)
Expand Down Expand Up @@ -2514,69 +2513,6 @@ func (h *joinPropsHelper) setFuncDeps(rel *props.Relational) {
// created new possibilities for simplifying removed columns.
rel.FuncDeps.ProjectCols(rel.OutputCols)
}
h.addSelfJoinImpliedFDs(rel)
}

// addSelfJoinImpliedFDs adds any extra equality FDs that are implied by a self
// join equality between key columns on a table.
func (h *joinPropsHelper) addSelfJoinImpliedFDs(rel *props.Relational) {
md := h.join.Memo().Metadata()
leftCols, rightCols := h.leftProps.OutputCols, h.rightProps.OutputCols
if !rel.FuncDeps.ComputeEquivClosure(leftCols).Intersects(rightCols) {
// There are no equalities between left and right columns.
return
}
// Map from the table ID to the column ordinals within the table.
getTables := func(cols opt.ColSet) map[opt.TableID]intsets.Fast {
var tables map[opt.TableID]intsets.Fast
cols.ForEach(func(col opt.ColumnID) {
if tab := md.ColumnMeta(col).Table; tab != opt.TableID(0) {
if tables == nil {
tables = make(map[opt.TableID]intsets.Fast)
}
colOrds := tables[tab]
colOrds.Add(tab.ColumnOrdinal(col))
tables[tab] = colOrds
}
})
return tables
}
leftTables := getTables(leftCols)
if leftTables == nil {
return
}
rightTables := getTables(rightCols)
if rightTables == nil {
return
}
for leftTable, leftTableOrds := range leftTables {
for rightTable, rightTableOrds := range rightTables {
if md.TableMeta(leftTable).Table.ID() != md.TableMeta(rightTable).Table.ID() {
continue
}
// This is a self-join. If there are equalities between columns at the
// same ordinal positions in each (meta) table and those columns form a
// key on each input, *every* pair of columns at the same ordinal position
// is equal.
var eqCols opt.ColSet
colOrds := leftTableOrds.Intersection(rightTableOrds)
for colOrd, ok := colOrds.Next(0); ok; colOrd, ok = colOrds.Next(colOrd + 1) {
leftCol, rightCol := leftTable.ColumnID(colOrd), rightTable.ColumnID(colOrd)
if rel.FuncDeps.AreColsEquiv(leftCol, rightCol) {
eqCols.Add(leftCol)
eqCols.Add(rightCol)
}
}
if !eqCols.Empty() && h.leftProps.FuncDeps.ColsAreStrictKey(eqCols) &&
h.rightProps.FuncDeps.ColsAreStrictKey(eqCols) {
// Add equalities between each pair of columns at the same ordinal
// position, ignoring those that aren't part of the output.
for colOrd, ok := colOrds.Next(0); ok; colOrd, ok = colOrds.Next(colOrd + 1) {
rel.FuncDeps.AddEquivalency(leftTable.ColumnID(colOrd), rightTable.ColumnID(colOrd))
}
}
}
}
}

func (h *joinPropsHelper) cardinality() props.Cardinality {
Expand Down
208 changes: 47 additions & 161 deletions pkg/sql/opt/memo/testdata/logprops/join
Original file line number Diff line number Diff line change
Expand Up @@ -2254,7 +2254,7 @@ inner-join (hash)
├── columns: x:1(int!null) y:2(int) s:3(string) d:4(decimal!null) x:7(int!null) y:8(int) s:9(string) d:10(decimal!null)
├── multiplicity: left-rows(exactly-one), right-rows(exactly-one)
├── key: (7)
├── fd: (1)-->(2-4), (3,4)~~>(1,2), (7)-->(8-10), (9,10)~~>(7,8), (1)==(7), (7)==(1), (2)==(8), (8)==(2), (3)==(9), (9)==(3), (4)==(10), (10)==(4)
├── fd: (1)-->(2-4), (3,4)~~>(1,2), (7)-->(8-10), (9,10)~~>(7,8), (1)==(7), (7)==(1)
├── prune: (2-4,8-10)
├── interesting orderings: (+1) (-3,+4,+1) (+7) (-9,+10,+7)
├── scan xysd
Expand Down Expand Up @@ -2287,7 +2287,7 @@ inner-join (hash)
├── cardinality: [0 - 1]
├── multiplicity: left-rows(exactly-one), right-rows(exactly-one)
├── key: ()
├── fd: ()-->(1-4,7-10), (7)==(1), (1)==(7), (2)==(8), (8)==(2), (3)==(9), (9)==(3), (4)==(10), (10)==(4)
├── fd: ()-->(1-4,7-10), (7)==(1), (1)==(7)
├── prune: (2-4,8-10)
├── select
│ ├── columns: xysd.x:1(int!null) xysd.y:2(int) xysd.s:3(string) xysd.d:4(decimal!null)
Expand Down Expand Up @@ -2384,7 +2384,7 @@ inner-join (hash)
├── columns: k:1(int!null) v:2(int) r1:3(int!null) r2:4(int) x:7(int!null) y:8(int) s:9(string) d:10(decimal!null) x:13(int!null) y:14(int) s:15(string) d:16(decimal!null)
├── multiplicity: left-rows(exactly-one), right-rows(zero-or-more)
├── key: (1)
├── fd: (1)-->(2-4), (7)-->(8-10), (9,10)~~>(7,8), (13)-->(14-16), (15,16)~~>(13,14), (7)==(3,13), (13)==(3,7), (8)==(14), (14)==(8), (9)==(15), (15)==(9), (10)==(16), (16)==(10), (3)==(7,13)
├── fd: (1)-->(2-4), (7)-->(8-10), (9,10)~~>(7,8), (13)-->(14-16), (15,16)~~>(13,14), (7)==(3,13), (13)==(3,7), (3)==(7,13)
├── prune: (1,2,4,8-10,14-16)
├── interesting orderings: (+1) (+7) (-9,+10,+7) (+13) (-15,+16,+13)
├── scan fk
Expand All @@ -2398,7 +2398,7 @@ inner-join (hash)
│ ├── columns: xysd.x:7(int!null) xysd.y:8(int) xysd.s:9(string) xysd.d:10(decimal!null) a.x:13(int!null) a.y:14(int) a.s:15(string) a.d:16(decimal!null)
│ ├── multiplicity: left-rows(exactly-one), right-rows(exactly-one)
│ ├── key: (13)
│ ├── fd: (7)-->(8-10), (9,10)~~>(7,8), (13)-->(14-16), (15,16)~~>(13,14), (7)==(13), (13)==(7), (8)==(14), (14)==(8), (9)==(15), (15)==(9), (10)==(16), (16)==(10)
│ ├── fd: (7)-->(8-10), (9,10)~~>(7,8), (13)-->(14-16), (15,16)~~>(13,14), (7)==(13), (13)==(7)
│ ├── prune: (8-10,14-16)
│ ├── interesting orderings: (+7) (-9,+10,+7) (+13) (-15,+16,+13)
│ ├── unfiltered-cols: (7-18)
Expand Down Expand Up @@ -2888,167 +2888,53 @@ left-join (hash)
├── variable: r_b:2 [type=int]
└── variable: b:7 [type=int]

# It is possible to infer equality filters for a self-join on key columns.
norm disable=(EliminateJoinUnderProjectLeft,EliminateJoinUnderProjectRight)
SELECT * FROM xyz INNER JOIN xyz foo ON xyz.x = foo.x AND xyz.y = foo.y
# Regression test for #105608 - don't add an (incorrectly) inferred equality
# between nullable self-join columns.
exec-ddl
CREATE TABLE patient (id INT8 NOT NULL, site_id INT8, PRIMARY KEY (id));
----
inner-join (hash)
├── columns: x:1(int!null) y:2(int!null) z:3(int) x:6(int!null) y:7(int!null) z:8(int)
├── multiplicity: left-rows(exactly-one), right-rows(exactly-one)
├── key: (6,7)
├── fd: (1,2)-->(3), (6,7)-->(8), (1)==(6), (6)==(1), (2)==(7), (7)==(2), (3)==(8), (8)==(3)
├── prune: (3,8)
├── interesting orderings: (+1,+2) (+6,+7)
├── scan xyz
│ ├── columns: xyz.x:1(int!null) xyz.y:2(int!null) xyz.z:3(int)
│ ├── key: (1,2)
│ ├── fd: (1,2)-->(3)
│ ├── prune: (1-3)
│ ├── interesting orderings: (+1,+2)
│ └── unfiltered-cols: (1-5)
├── scan xyz [as=foo]
│ ├── columns: foo.x:6(int!null) foo.y:7(int!null) foo.z:8(int)
│ ├── key: (6,7)
│ ├── fd: (6,7)-->(8)
│ ├── prune: (6-8)
│ ├── interesting orderings: (+6,+7)
│ └── unfiltered-cols: (6-10)
└── filters
├── eq [type=bool, outer=(1,6), constraints=(/1: (/NULL - ]; /6: (/NULL - ]), fd=(1)==(6), (6)==(1)]
│ ├── variable: xyz.x:1 [type=int]
│ └── variable: foo.x:6 [type=int]
└── eq [type=bool, outer=(2,7), constraints=(/2: (/NULL - ]; /7: (/NULL - ]), fd=(2)==(7), (7)==(2)]
├── variable: xyz.y:2 [type=int]
└── variable: foo.y:7 [type=int]

# Self-join filters cannot be inferred if not joining on a key.
norm
SELECT * FROM xyz INNER JOIN xyz foo ON xyz.x = foo.x
exec-ddl
CREATE TABLE site (id INT8 NOT NULL, PRIMARY KEY (id));
----
inner-join (hash)
├── columns: x:1(int!null) y:2(int!null) z:3(int) x:6(int!null) y:7(int!null) z:8(int)
├── multiplicity: left-rows(one-or-more), right-rows(one-or-more)
├── key: (2,6,7)
├── fd: (1,2)-->(3), (6,7)-->(8), (1)==(6), (6)==(1)
├── prune: (2,3,7,8)
├── interesting orderings: (+1,+2) (+6,+7)
├── scan xyz
│ ├── columns: xyz.x:1(int!null) xyz.y:2(int!null) xyz.z:3(int)
│ ├── key: (1,2)
│ ├── fd: (1,2)-->(3)
│ ├── prune: (1-3)
│ ├── interesting orderings: (+1,+2)
│ └── unfiltered-cols: (1-5)
├── scan xyz [as=foo]
│ ├── columns: foo.x:6(int!null) foo.y:7(int!null) foo.z:8(int)
│ ├── key: (6,7)
│ ├── fd: (6,7)-->(8)
│ ├── prune: (6-8)
│ ├── interesting orderings: (+6,+7)
│ └── unfiltered-cols: (6-10)
└── filters
└── eq [type=bool, outer=(1,6), constraints=(/1: (/NULL - ]; /6: (/NULL - ]), fd=(1)==(6), (6)==(1)]
├── variable: xyz.x:1 [type=int]
└── variable: foo.x:6 [type=int]

# Self-join filters can be inferred even if the join key wasn't a key in the
# base table.
norm
SELECT * FROM xyz INNER JOIN xyz foo ON xyz.x = foo.x AND xyz.y = 1 AND foo.y = 1
exec-ddl
CREATE TABLE task (id INT8 NOT NULL, description VARCHAR(255), patient_id INT8, PRIMARY KEY (id));
----
inner-join (hash)
├── columns: x:1(int!null) y:2(int!null) z:3(int) x:6(int!null) y:7(int!null) z:8(int)
├── multiplicity: left-rows(zero-or-one), right-rows(zero-or-one)
├── key: (6)
├── fd: ()-->(2,7), (1)-->(3), (6)-->(8), (1)==(6), (6)==(1), (2)==(7), (7)==(2), (3)==(8), (8)==(3)
├── prune: (3,8)
├── interesting orderings: (+1 opt(2)) (+6 opt(7))
├── select
│ ├── columns: xyz.x:1(int!null) xyz.y:2(int!null) xyz.z:3(int)
│ ├── key: (1)
│ ├── fd: ()-->(2), (1)-->(3)
│ ├── prune: (1,3)
│ ├── interesting orderings: (+1 opt(2))
│ ├── scan xyz
│ │ ├── columns: xyz.x:1(int!null) xyz.y:2(int!null) xyz.z:3(int)
│ │ ├── key: (1,2)
│ │ ├── fd: (1,2)-->(3)
│ │ ├── prune: (1-3)
│ │ ├── interesting orderings: (+1,+2)
│ │ └── unfiltered-cols: (1-5)
│ └── filters
│ └── eq [type=bool, outer=(2), constraints=(/2: [/1 - /1]; tight), fd=()-->(2)]
│ ├── variable: xyz.y:2 [type=int]
│ └── const: 1 [type=int]
├── select
│ ├── columns: foo.x:6(int!null) foo.y:7(int!null) foo.z:8(int)
│ ├── key: (6)
│ ├── fd: ()-->(7), (6)-->(8)
│ ├── prune: (6,8)
│ ├── interesting orderings: (+6 opt(7))
│ ├── scan xyz [as=foo]
│ │ ├── columns: foo.x:6(int!null) foo.y:7(int!null) foo.z:8(int)
│ │ ├── key: (6,7)
│ │ ├── fd: (6,7)-->(8)
│ │ ├── prune: (6-8)
│ │ ├── interesting orderings: (+6,+7)
│ │ └── unfiltered-cols: (6-10)
│ └── filters
│ └── eq [type=bool, outer=(7), constraints=(/7: [/1 - /1]; tight), fd=()-->(7)]
│ ├── variable: foo.y:7 [type=int]
│ └── const: 1 [type=int]
└── filters
└── eq [type=bool, outer=(1,6), constraints=(/1: (/NULL - ]; /6: (/NULL - ]), fd=(1)==(6), (6)==(1)]
├── variable: xyz.x:1 [type=int]
└── variable: foo.x:6 [type=int]

# The optimizer doesn't detect the contradiction here because the "y" filters
# pushed down before the join properties are calculated.
norm
SELECT * FROM xyz INNER JOIN xyz foo ON xyz.x = foo.x AND xyz.y = 1 AND foo.y = 2
exec-ddl
ALTER TABLE IF EXISTS patient ADD CONSTRAINT fkhty4ykfvf29xscswwd63mgdoc FOREIGN KEY (site_id) REFERENCES site;
----
inner-join (hash)
├── columns: x:1(int!null) y:2(int!null) z:3(int) x:6(int!null) y:7(int!null) z:8(int)
├── multiplicity: left-rows(zero-or-one), right-rows(zero-or-one)
├── key: (6)
├── fd: ()-->(2,7), (1)-->(3), (6)-->(8), (1)==(6), (6)==(1), (2)==(7), (7)==(2), (3)==(8), (8)==(3)
├── prune: (3,8)
├── interesting orderings: (+1 opt(2)) (+6 opt(7))
├── select
│ ├── columns: xyz.x:1(int!null) xyz.y:2(int!null) xyz.z:3(int)
│ ├── key: (1)
│ ├── fd: ()-->(2), (1)-->(3)
│ ├── prune: (1,3)
│ ├── interesting orderings: (+1 opt(2))
│ ├── scan xyz
│ │ ├── columns: xyz.x:1(int!null) xyz.y:2(int!null) xyz.z:3(int)
│ │ ├── key: (1,2)
│ │ ├── fd: (1,2)-->(3)
│ │ ├── prune: (1-3)
│ │ ├── interesting orderings: (+1,+2)
│ │ └── unfiltered-cols: (1-5)
│ └── filters
│ └── eq [type=bool, outer=(2), constraints=(/2: [/1 - /1]; tight), fd=()-->(2)]
│ ├── variable: xyz.y:2 [type=int]
│ └── const: 1 [type=int]
├── select
│ ├── columns: foo.x:6(int!null) foo.y:7(int!null) foo.z:8(int)
│ ├── key: (6)
│ ├── fd: ()-->(7), (6)-->(8)
│ ├── prune: (6,8)
│ ├── interesting orderings: (+6 opt(7))
│ ├── scan xyz [as=foo]
│ │ ├── columns: foo.x:6(int!null) foo.y:7(int!null) foo.z:8(int)
│ │ ├── key: (6,7)
│ │ ├── fd: (6,7)-->(8)
│ │ ├── prune: (6-8)
│ │ ├── interesting orderings: (+6,+7)
│ │ └── unfiltered-cols: (6-10)
│ └── filters
│ └── eq [type=bool, outer=(7), constraints=(/7: [/2 - /2]; tight), fd=()-->(7)]
│ ├── variable: foo.y:7 [type=int]
│ └── const: 2 [type=int]
└── filters
└── eq [type=bool, outer=(1,6), constraints=(/1: (/NULL - ]; /6: (/NULL - ]), fd=(1)==(6), (6)==(1)]
├── variable: xyz.x:1 [type=int]
└── variable: foo.x:6 [type=int]

exec-ddl
ALTER TABLE IF EXISTS task ADD CONSTRAINT fkkk2ow88d08vqxqyvvvmys1j2m FOREIGN KEY (patient_id) REFERENCES patient;
----

opt format=hide-all
SELECT task0_.id AS id1_2_, task0_.description AS descript2_2_, task0_.patient_id AS patient_3_2_
FROM task AS task0_ WHERE EXISTS (
SELECT task1_.id FROM task AS task1_
LEFT JOIN patient AS patient2_
ON task1_.patient_id = patient2_.id
LEFT JOIN site AS site3_
ON patient2_.site_id = site3_.id
WHERE (task1_.id = task0_.id) AND ((patient2_.id IS NULL) OR (site3_.id IN (2, 1)))
);
----
project
└── project
└── inner-join (lookup task [as=task0_])
├── lookup columns are key
├── select
│ ├── left-join (hash)
│ │ ├── left-join (hash)
│ │ │ ├── scan task [as=task1_]
│ │ │ ├── scan patient [as=patient2_]
│ │ │ └── filters
│ │ │ └── task1_.patient_id = patient2_.id
│ │ ├── scan site [as=site3_]
│ │ └── filters
│ │ └── site_id = site3_.id
│ └── filters
│ └── (patient2_.id IS NULL) OR (site3_.id IN (1, 2))
└── filters (true)
12 changes: 0 additions & 12 deletions pkg/sql/opt/norm/general_funcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,18 +305,6 @@ func (c *CustomFuncs) MakeBoolCol() opt.ColumnID {
return c.mem.Metadata().AddColumn("", types.Bool)
}

// CanRemapCols returns true if it's possible to remap every column in the
// "from" set to a column in the "to" set using the given FDs.
func (c *CustomFuncs) CanRemapCols(from, to opt.ColSet, fds *props.FuncDepSet) bool {
for col, ok := from.Next(0); ok; col, ok = from.Next(col + 1) {
if !fds.ComputeEquivGroup(col).Intersects(to) {
// It is not possible to remap this column to one from the "to" set.
return false
}
}
return true
}

// ----------------------------------------------------------------------
//
// Outer column functions
Expand Down
Loading

0 comments on commit 845db73

Please sign in to comment.