Skip to content

Commit

Permalink
opt: don't infer join equalities from equivalences on nullable columns
Browse files Browse the repository at this point in the history
The `JoinOrderBuilder` is able to infer equality filters from the transitive
closure of equalities in the reordered join tree for use in building new
joins. However, it isn't valid to infer this equality for nullable columns.
This is because the SQL `=` operator rejects NULL values, while functional
dependency equivalences allow them. It was not previously possible to
encounter this bug because inference of an "interesting" equality required
an existing equality filter on both of the involved columns, which rejects
NULLs anyway. However, cockroachdb#105214 added the ability to infer equivalences
between self-join columns, which *do* allow nulls. This allowed the bug
to manifest.

This patch adds a check to the equality filter inference logic to ensure
that at least one of the candidate columns is non-null. This works because
if one column is non-null, all equivalent columns are also non-null. This
ensures that a null-rejecting filter isn't incorrectly added to a nullable
column.

Fixes cockroachdb#105608

Release note: None
  • Loading branch information
DrewKimball committed Jul 6, 2023
1 parent dbe8511 commit f342485
Show file tree
Hide file tree
Showing 3 changed files with 271 additions and 0 deletions.
53 changes: 53 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/join
Original file line number Diff line number Diff line change
Expand Up @@ -1152,3 +1152,56 @@ SELECT abcd.*, dxby.* FROM abcd INNER JOIN dxby USING (d, b)
----
a b c d d x b y
2 2 2 2 2 2 2 2

# Regression test for #105608 - don't filter out NULL values from the
# task-patient LEFT join.
statement ok
CREATE TABLE patient (id INT8 NOT NULL, site_id INT8, PRIMARY KEY (id));

statement ok
CREATE TABLE site (id INT8 NOT NULL, PRIMARY KEY (id));

statement ok
CREATE TABLE task (id INT8 NOT NULL, description VARCHAR(255), patient_id INT8, PRIMARY KEY (id));

statement ok
ALTER TABLE IF EXISTS patient ADD CONSTRAINT fkhty4ykfvf29xscswwd63mgdoc FOREIGN KEY (site_id) REFERENCES site;

statement ok
ALTER TABLE IF EXISTS task ADD CONSTRAINT fkkk2ow88d08vqxqyvvvmys1j2m FOREIGN KEY (patient_id) REFERENCES patient;

statement ok
INSERT INTO site(id) VALUES (1);
INSERT INTO site(id) VALUES (2);
INSERT INTO site(id) VALUES (3);

statement ok
INSERT INTO patient(site_id, id) VALUES (NULL, 4);
INSERT INTO patient(site_id, id) VALUES (1, 5);
INSERT INTO patient(site_id, id) VALUES (1, 6);
INSERT INTO patient(site_id, id) VALUES (2, 7);
INSERT INTO patient(site_id, id) VALUES (3, 8);

statement ok
INSERT INTO task(description, patient_id, id) VALUES ('taskWithoutPatient', NULL, 9);
INSERT INTO task(description, patient_id, id) VALUES ('taskWithPatientWithoutSite', 4, 10);
INSERT INTO task(description, patient_id, id) VALUES ('taskWithPatient1WithValidSite1', 5, 11);
INSERT INTO task(description, patient_id, id) VALUES ('taskWithPatient2WithValidSite1', 6, 12);
INSERT INTO task(description, patient_id, id) VALUES ('taskWithPatient3WithValidSite2', 7, 13);
INSERT INTO task(description, patient_id, id) VALUES ('taskWithPatientWithInvalidSite', 8, 14);

query ITI rowsort
SELECT task0_.id AS id1_2_, task0_.description AS descript2_2_, task0_.patient_id AS patient_3_2_
FROM task AS task0_ WHERE EXISTS (
SELECT task1_.id FROM task AS task1_
LEFT JOIN patient AS patient2_
ON task1_.patient_id = patient2_.id
LEFT JOIN site AS site3_
ON patient2_.site_id = site3_.id
WHERE (task1_.id = task0_.id) AND ((patient2_.id IS NULL) OR (site3_.id IN (2, 1)))
);
----
9 taskWithoutPatient NULL
11 taskWithPatient1WithValidSite1 5
12 taskWithPatient2WithValidSite1 6
13 taskWithPatient3WithValidSite2 7
5 changes: 5 additions & 0 deletions pkg/sql/opt/xform/join_order_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,11 @@ func (jb *JoinOrderBuilder) ensureClosure(join memo.RelExpr) {
for col, ok := reps.Next(0); ok; col, ok = reps.Next(col + 1) {
// Get all columns which are known to be equal to this column.
equivGroup := equivFDs.ComputeEquivGroup(col)
if !equivGroup.Intersects(join.Relational().NotNullCols) {
// If the equivalent columns are nullable, it would be invalid to add an
// equality edge because SQL equality rejects NULL values.
continue
}

// Ensure that there exists an edge for each distinct pair of equivalent
// columns.
Expand Down
213 changes: 213 additions & 0 deletions pkg/sql/opt/xform/testdata/rules/join_order
Original file line number Diff line number Diff line change
Expand Up @@ -3156,3 +3156,216 @@ project
│ └── t1.c = t2.c
└── projections
└── 1

# Regression test for #105608 - don't infer an equality on nullable columns from
# the top-level join's functional dependencies.
exec-ddl
CREATE TABLE patient (id INT8 NOT NULL, site_id INT8, PRIMARY KEY (id));
----

exec-ddl
CREATE TABLE site (id INT8 NOT NULL, PRIMARY KEY (id));
----

exec-ddl
CREATE TABLE task (id INT8 NOT NULL, description VARCHAR(255), patient_id INT8, PRIMARY KEY (id));
----

exec-ddl
ALTER TABLE IF EXISTS patient ADD CONSTRAINT fkhty4ykfvf29xscswwd63mgdoc FOREIGN KEY (site_id) REFERENCES site;
----

exec-ddl
ALTER TABLE IF EXISTS task ADD CONSTRAINT fkkk2ow88d08vqxqyvvvmys1j2m FOREIGN KEY (patient_id) REFERENCES patient;
----

# The edge "task0_.patient_id = task1_.patient_id" should not be created.
reorderjoins
SELECT task0_.id AS id1_2_, task0_.description AS descript2_2_, task0_.patient_id AS patient_3_2_
FROM task AS task0_ WHERE EXISTS (
SELECT task1_.id FROM task AS task1_
LEFT JOIN patient AS patient2_
ON task1_.patient_id = patient2_.id
LEFT JOIN site AS site3_
ON patient2_.site_id = site3_.id
WHERE (task1_.id = task0_.id) AND ((patient2_.id IS NULL) OR (site3_.id IN (2, 1)))
);
----
--------------------------------------------------------------------------------
Join Tree #1
--------------------------------------------------------------------------------
left-join (hash)
├── scan task [as=task1_]
├── scan patient [as=patient2_]
└── filters
└── task1_.patient_id = patient2_.id
Vertexes
A:
scan task [as=task1_]
B:
scan patient [as=patient2_]
Edges
task1_.patient_id = patient2_.id [left, ses=AB, tes=AB, rules=()]
Joining AB
A B [left, refs=AB]
Joins Considered: 1
--------------------------------------------------------------------------------
Join Tree #2
--------------------------------------------------------------------------------
left-join (hash)
├── left-join (hash)
│ ├── scan task [as=task1_]
│ ├── scan patient [as=patient2_]
│ └── filters
│ └── task1_.patient_id = patient2_.id
├── scan site [as=site3_]
└── filters
└── site_id = site3_.id
Vertexes
A:
scan task [as=task1_]
B:
scan patient [as=patient2_]
C:
scan site [as=site3_]
Edges
task1_.patient_id = patient2_.id [left, ses=AB, tes=AB, rules=()]
site_id = site3_.id [left, ses=BC, tes=BC, rules=()]
Joining AB
A B [left, refs=AB]
Joining BC
B C [left, refs=BC]
Joining ABC
A BC [left, refs=AB]
AB C [left, refs=BC]
Joins Considered: 4
--------------------------------------------------------------------------------
Join Tree #3
--------------------------------------------------------------------------------
semi-join (hash)
├── scan task [as=task0_]
├── select
│ ├── left-join (hash)
│ │ ├── left-join (hash)
│ │ │ ├── scan task [as=task1_]
│ │ │ ├── scan patient [as=patient2_]
│ │ │ └── filters
│ │ │ └── task1_.patient_id = patient2_.id
│ │ ├── scan site [as=site3_]
│ │ └── filters
│ │ └── site_id = site3_.id
│ └── filters
│ └── (patient2_.id IS NULL) OR (site3_.id IN (1, 2))
└── filters
└── task1_.id = task0_.id
Vertexes
D:
scan task [as=task0_]
E:
select
├── left-join (hash)
│ ├── left-join (hash)
│ │ ├── scan task [as=task1_]
│ │ ├── scan patient [as=patient2_]
│ │ └── filters
│ │ └── task1_.patient_id = patient2_.id
│ ├── scan site [as=site3_]
│ └── filters
│ └── site_id = site3_.id
└── filters
└── (patient2_.id IS NULL) OR (site3_.id IN (1, 2))
Edges
task1_.id = task0_.id [semi, ses=DE, tes=DE, rules=()]
Joining DE
D E [semi, refs=DE]
Joins Considered: 1
--------------------------------------------------------------------------------
Join Tree #4
--------------------------------------------------------------------------------
inner-join (hash)
├── scan task [as=task0_]
├── select
│ ├── left-join (hash)
│ │ ├── left-join (hash)
│ │ │ ├── scan task [as=task1_]
│ │ │ ├── scan patient [as=patient2_]
│ │ │ └── filters
│ │ │ └── task1_.patient_id = patient2_.id
│ │ ├── scan site [as=site3_]
│ │ └── filters
│ │ └── site_id = site3_.id
│ └── filters
│ └── (patient2_.id IS NULL) OR (site3_.id IN (1, 2))
└── filters
└── task1_.id = task0_.id
Vertexes
D:
scan task [as=task0_]
E:
select
├── left-join (hash)
│ ├── left-join (hash)
│ │ ├── scan task [as=task1_]
│ │ ├── scan patient [as=patient2_]
│ │ └── filters
│ │ └── task1_.patient_id = patient2_.id
│ ├── scan site [as=site3_]
│ └── filters
│ └── site_id = site3_.id
└── filters
└── (patient2_.id IS NULL) OR (site3_.id IN (1, 2))
Edges
task1_.id = task0_.id [inner, ses=DE, tes=DE, rules=()]
Joining DE
D E [inner, refs=DE]
E D [inner, refs=DE]
Joins Considered: 2
================================================================================
Final Plan
================================================================================
project
├── columns: id1_2_:1!null descript2_2_:2 patient_3_2_:3
├── key: (1)
├── fd: (1)-->(2,3)
└── project
├── columns: task0_.id:1!null task0_.description:2 task0_.patient_id:3 task1_.id:6!null
├── key: (6)
├── fd: (1)-->(2,3), (6)-->(3), (1)==(6), (6)==(1)
└── inner-join (lookup task [as=task0_])
├── columns: task0_.id:1!null task0_.description:2 task0_.patient_id:3 task1_.id:6!null task1_.patient_id:8 patient2_.id:11 site_id:12 site3_.id:15
├── key columns: [6] = [1]
├── lookup columns are key
├── key: (6)
├── fd: (1)-->(2,3), (6)-->(8,11,12,15), (11)-->(12), (1)==(6), (6)==(1), (3)==(8), (8)==(3)
├── select
│ ├── columns: task1_.id:6!null task1_.patient_id:8 patient2_.id:11 site_id:12 site3_.id:15
│ ├── key: (6)
│ ├── fd: (6)-->(8,11,12,15), (11)-->(12)
│ ├── left-join (hash)
│ │ ├── columns: task1_.id:6!null task1_.patient_id:8 patient2_.id:11 site_id:12 site3_.id:15
│ │ ├── multiplicity: left-rows(exactly-one), right-rows(zero-or-more)
│ │ ├── key: (6)
│ │ ├── fd: (6)-->(8,11,12,15), (11)-->(12)
│ │ ├── left-join (hash)
│ │ │ ├── columns: task1_.id:6!null task1_.patient_id:8 patient2_.id:11 site_id:12
│ │ │ ├── multiplicity: left-rows(exactly-one), right-rows(zero-or-more)
│ │ │ ├── key: (6)
│ │ │ ├── fd: (6)-->(8,11,12), (11)-->(12)
│ │ │ ├── scan task [as=task1_]
│ │ │ │ ├── columns: task1_.id:6!null task1_.patient_id:8
│ │ │ │ ├── key: (6)
│ │ │ │ └── fd: (6)-->(8)
│ │ │ ├── scan patient [as=patient2_]
│ │ │ │ ├── columns: patient2_.id:11!null site_id:12
│ │ │ │ ├── key: (11)
│ │ │ │ └── fd: (11)-->(12)
│ │ │ └── filters
│ │ │ └── task1_.patient_id:8 = patient2_.id:11 [outer=(8,11), constraints=(/8: (/NULL - ]; /11: (/NULL - ]), fd=(8)==(11), (11)==(8)]
│ │ ├── scan site [as=site3_]
│ │ │ ├── columns: site3_.id:15!null
│ │ │ └── key: (15)
│ │ └── filters
│ │ └── site_id:12 = site3_.id:15 [outer=(12,15), constraints=(/12: (/NULL - ]; /15: (/NULL - ]), fd=(12)==(15), (15)==(12)]
│ └── filters
│ └── (patient2_.id:11 IS NULL) OR (site3_.id:15 IN (1, 2)) [outer=(11,15)]
└── filters (true)

0 comments on commit f342485

Please sign in to comment.