Skip to content

Commit

Permalink
opt: allow join elimination rules to remap columns
Browse files Browse the repository at this point in the history
When it can be proven that a join neither adds rows to nor removes rows
from one of its inputs, the other input can often be removed, eliminating
the join. However, the join can only be eliminated if no columns from the
removed side are required by the rest of the query.

This patch allows the join elimination rules to remap columns from the
eliminated side to the preserved side of the join, using the join's
functional dependencies. For example:
```
CREATE TABLE xy (x INT PRIMARY KEY, y INT);
CREATE TABLE fk (k INT PRIMARY KEY, v INT NOT NULL, FOREIGN KEY (v) REFERENCES xy (x));

SELECT x, k, v FROM fk INNER JOIN xy ON v = x;
```
In the example above, the join could not previously be eliminated because
the `x` column is required in the output. Now, the `x` column is remapped
to the equivalent `v` column, allowing the join to be removed.

Fixes #102614

Release note (performance improvement): The optimizer can now eliminate
joins in more cases by remapping columns from the eliminated side of a
join to equivalent columns on the preserved side.
  • Loading branch information
DrewKimball committed Jun 26, 2023
1 parent 15d43bb commit a87cfe6
Show file tree
Hide file tree
Showing 22 changed files with 401 additions and 246 deletions.
17 changes: 12 additions & 5 deletions pkg/ccl/logictestccl/testdata/logic_test/as_of
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,17 @@ CREATE TABLE t (
j INT UNIQUE,
k INT,
UNIQUE (k) STORING (j)
)
);
CREATE TABLE t2 (
i INT PRIMARY KEY,
j INT UNIQUE,
k INT,
UNIQUE (k) STORING (j)
);

statement ok
INSERT INTO t VALUES (2)
INSERT INTO t VALUES (2);
INSERT INTO t2 VALUES (2);

statement error pgcode 3D000 pq: database "test" does not exist
SELECT * FROM t AS OF SYSTEM TIME follower_read_timestamp()
Expand Down Expand Up @@ -134,13 +141,13 @@ statement error unimplemented: cannot use bounded staleness for queries that may
SELECT * FROM t AS OF SYSTEM TIME with_min_timestamp(statement_timestamp() - '1ms')

statement error unimplemented: cannot use bounded staleness for MERGE JOIN
SELECT * FROM t AS t1 JOIN t AS t2 ON t1.i = t2.i AS OF SYSTEM TIME with_max_staleness('1ms')
SELECT * FROM t AS t1 JOIN t2 AS t2 ON t1.i = t2.i AS OF SYSTEM TIME with_max_staleness('1ms')

statement error unimplemented: cannot use bounded staleness for INNER JOIN
SELECT * FROM t AS t1 INNER HASH JOIN t AS t2 ON t1.i = t2.i AS OF SYSTEM TIME with_min_timestamp(statement_timestamp() - '1ms')
SELECT * FROM t AS t1 INNER HASH JOIN t2 AS t2 ON t1.i = t2.i AS OF SYSTEM TIME with_min_timestamp(statement_timestamp() - '1ms')

statement error unimplemented: cannot use bounded staleness for LOOKUP JOIN
SELECT * FROM t AS t1 LEFT LOOKUP JOIN t AS t2 ON t1.i = t2.i AS OF SYSTEM TIME with_max_staleness('1ms')
SELECT * FROM t AS t1 LEFT LOOKUP JOIN t2 AS t2 ON t1.i = t2.i AS OF SYSTEM TIME with_max_staleness('1ms')

statement error unimplemented: cannot use bounded staleness for UNION
SELECT * FROM (SELECT * FROM t UNION SELECT * FROM t) AS OF SYSTEM TIME with_max_staleness('1ms')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,24 @@ CREATE TABLE messages_global (
INDEX msg_idx(message)
) LOCALITY GLOBAL

statement ok
CREATE TABLE messages_global_2 (
account_id INT NOT NULL,
message_id UUID DEFAULT gen_random_uuid(),
message STRING NOT NULL,
PRIMARY KEY (account_id),
INDEX msg_idx(message)
) LOCALITY GLOBAL

statement ok
CREATE TABLE messages_global_3 (
account_id INT NOT NULL,
message_id UUID DEFAULT gen_random_uuid(),
message STRING NOT NULL,
PRIMARY KEY (account_id),
INDEX msg_idx(message)
) LOCALITY GLOBAL

statement ok
CREATE TABLE messages_rbt (
account_id INT NOT NULL,
Expand Down Expand Up @@ -722,32 +740,32 @@ project

# A lookup join with a global table as either input should be allowed.
query TTTTTT retry
SELECT * FROM messages_global g1 INNER LOOKUP JOIN messages_global g2 ON g1.account_id = g2.account_id
SELECT * FROM messages_global g1 INNER LOOKUP JOIN messages_global_2 g2 ON g1.account_id = g2.account_id
----

query T retry
EXPLAIN (OPT) SELECT * FROM messages_global g1 INNER LOOKUP JOIN messages_global g2 ON g1.account_id = g2.account_id
EXPLAIN (OPT) SELECT * FROM messages_global g1 INNER LOOKUP JOIN messages_global_2 g2 ON g1.account_id = g2.account_id
----
inner-join (lookup messages_global [as=g2])
inner-join (lookup messages_global_2 [as=g2])
├── flags: force lookup join (into right side)
├── lookup columns are key
├── scan messages_global [as=g1]
└── filters (true)

# A join relation with local home region as the left input of lookup join should be allowed.
query TTTTTTTTT retry
SELECT * FROM messages_global g1 INNER LOOKUP JOIN messages_global g2 ON g1.account_id = g2.account_id
INNER LOOKUP JOIN messages_global g3 ON g2.account_id = g3.account_id
SELECT * FROM messages_global g1 INNER LOOKUP JOIN messages_global_2 g2 ON g1.account_id = g2.account_id
INNER LOOKUP JOIN messages_global_3 g3 ON g2.account_id = g3.account_id
----

query T retry
EXPLAIN (OPT) SELECT * FROM messages_global g1 INNER LOOKUP JOIN messages_global g2 ON g1.account_id = g2.account_id
INNER LOOKUP JOIN messages_global g3 ON g2.account_id = g3.account_id
EXPLAIN (OPT) SELECT * FROM messages_global g1 INNER LOOKUP JOIN messages_global_2 g2 ON g1.account_id = g2.account_id
INNER LOOKUP JOIN messages_global_3 g3 ON g2.account_id = g3.account_id
----
inner-join (lookup messages_global [as=g3])
inner-join (lookup messages_global_3 [as=g3])
├── flags: force lookup join (into right side)
├── lookup columns are key
├── inner-join (lookup messages_global [as=g2])
├── inner-join (lookup messages_global_2 [as=g2])
│ ├── flags: force lookup join (into right side)
│ ├── lookup columns are key
│ ├── scan messages_global [as=g1]
Expand All @@ -759,29 +777,29 @@ inner-join (lookup messages_global [as=g3])
retry
statement error pq: Query has no home region\. Try adding a filter on rbr\.crdb_region and/or on key column \(rbr\.account_id\)\. For more information, see https://www.cockroachlabs.com/docs/stable/cost-based-optimizer.html#control-whether-queries-are-limited-to-a-single-region
SELECT * FROM messages_rbr rbr INNER LOOKUP JOIN messages_global g2 ON rbr.account_id = g2.account_id
INNER LOOKUP JOIN messages_global g3 ON g2.account_id = g3.account_id
INNER LOOKUP JOIN messages_global_2 g3 ON g2.account_id = g3.account_id

# The explicit REGIONAL BY ROW AS column name should be used in the error
# message if it differs from the default crdb_region.
retry
statement error pq: Query has no home region\. Try adding a filter on rbr\.crdb_region_alt and/or on key column \(rbr\.account_id\)\. For more information, see https://www.cockroachlabs.com/docs/stable/cost-based-optimizer.html#control-whether-queries-are-limited-to-a-single-region
SELECT * FROM messages_rbr_alt rbr INNER LOOKUP JOIN messages_global g2 ON rbr.account_id = g2.account_id
INNER LOOKUP JOIN messages_global g3 ON g2.account_id = g3.account_id
INNER LOOKUP JOIN messages_global_2 g3 ON g2.account_id = g3.account_id

# A lookup join relation with a left input join relation which uses locality
# optimized scan in one of the tables of the lookup join should be allowed.
query TTTTTTTTT retry
SELECT * FROM (SELECT * FROM messages_rbr LIMIT 1) rbr INNER LOOKUP JOIN
messages_global g2 ON rbr.account_id = g2.account_id
INNER LOOKUP JOIN messages_global g3 ON g2.account_id = g3.account_id
INNER LOOKUP JOIN messages_global_2 g3 ON g2.account_id = g3.account_id
----

query T retry
EXPLAIN (OPT) SELECT * FROM (SELECT * FROM messages_rbr LIMIT 1) rbr INNER LOOKUP JOIN
messages_global g2 ON rbr.account_id = g2.account_id
INNER LOOKUP JOIN messages_global g3 ON g2.account_id = g3.account_id
INNER LOOKUP JOIN messages_global_2 g3 ON g2.account_id = g3.account_id
----
inner-join (lookup messages_global [as=g3])
inner-join (lookup messages_global_2 [as=g3])
├── flags: force lookup join (into right side)
├── lookup columns are key
├── inner-join (lookup messages_global [as=g2])
Expand Down
13 changes: 12 additions & 1 deletion pkg/ccl/partitionccl/scrub_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,11 +234,18 @@ INSERT INTO db.t VALUES (1, 3), (2, 4);
t.Fatalf("expected 1 index entry, got %d", len(primaryIndexKey))
}

// Add the primary key via the KV API.
// Add the primary key via the KV API. This will overwrite the old primary
// index KV, so no need to perform a Del.
if err := kvDB.Put(context.Background(), primaryIndexKey[0].Key, &primaryIndexKey[0].Value); err != nil {
t.Fatalf("unexpected error: %s", err)
}
oldValues := []tree.Datum{tree.NewDInt(1), tree.NewDInt(3)}
secondaryIndex := tableDesc.PublicNonPrimaryIndexes()[0]
secondaryIndexDelKey, err := rowenc.EncodeSecondaryIndex(
codec, tableDesc, secondaryIndex, colIDtoRowIndex, oldValues, true /* includeEmpty */)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
secondaryIndexKey, err := rowenc.EncodeSecondaryIndex(
codec, tableDesc, secondaryIndex, colIDtoRowIndex, values, true /* includeEmpty */)
if err != nil {
Expand All @@ -247,6 +254,10 @@ INSERT INTO db.t VALUES (1, 3), (2, 4);
if len(secondaryIndexKey) != 1 {
t.Fatalf("expected 1 index entry, got %d. got %#v", len(secondaryIndexKey), secondaryIndexKey)
}
// Delete the old secondary index KV before inserting the new one.
if _, err := kvDB.Del(context.Background(), secondaryIndexDelKey[0].Key); err != nil {
t.Fatalf("unexpected error: %s", err)
}
if err := kvDB.Put(context.Background(), secondaryIndexKey[0].Key, &secondaryIndexKey[0].Value); err != nil {
t.Fatalf("unexpected error: %s", err)
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/sql/colflow/vectorized_flow_planning_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ func TestVectorizedPlanning(t *testing.T) {
// Check that there is no columnarizer-materializer pair on top of the
// root of the execution tree if the root is a wrapped row-execution
// processor.
_, err = conn.ExecContext(ctx, `CREATE TABLE t (id INT PRIMARY KEY)`)
_, err = conn.ExecContext(ctx, `CREATE TABLE t (id INT PRIMARY KEY, val INT)`)
require.NoError(t, err)
rows, err := conn.QueryContext(ctx, `EXPLAIN (VEC, VERBOSE) SELECT * FROM t AS t1 INNER LOOKUP JOIN t AS t2 ON t1.id = t2.id`)
rows, err := conn.QueryContext(ctx, `EXPLAIN (VEC, VERBOSE) SELECT * FROM t AS t1 INNER LOOKUP JOIN t AS t2 ON t1.val = t2.id`)
require.NoError(t, err)
expectedOutput := []string{
"│",
Expand Down
22 changes: 4 additions & 18 deletions pkg/sql/opt/exec/execbuilder/testdata/join
Original file line number Diff line number Diff line change
Expand Up @@ -543,18 +543,11 @@ EXPLAIN SELECT * FROM cards LEFT OUTER JOIN customers ON customers.id = cards.cu
distribution: local
vectorized: true
·
• merge join
│ equality: (cust) = (id)
│ right cols are key
├── • scan
│ missing stats
│ table: cards@cards_cust_idx
│ spans: FULL SCAN
• render
└── • scan
missing stats
table: customers@customers_pkey
table: cards@cards_pkey
spans: FULL SCAN

# Tests for filter propagation through joins.
Expand Down Expand Up @@ -2220,16 +2213,9 @@ EXPLAIN SELECT * FROM cards LEFT OUTER HASH JOIN customers ON customers.id = car
distribution: local
vectorized: true
·
• hash join
│ equality: (cust) = (id)
│ right cols are key
├── • scan
│ missing stats
│ table: cards@cards_pkey
│ spans: FULL SCAN
• render
└── • scan
missing stats
table: customers@customers_pkey
table: cards@cards_pkey
spans: FULL SCAN
17 changes: 9 additions & 8 deletions pkg/sql/opt/exec/execbuilder/testdata/subquery
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
# Uncorrelated subqueries.
# ------------------------------------------------------------------------------
statement ok
CREATE TABLE abc (a INT PRIMARY KEY, b INT, c INT)
CREATE TABLE abc (a INT PRIMARY KEY, b INT, c INT);
CREATE TABLE abc2 (a INT PRIMARY KEY, b INT, c INT)

query T
EXPLAIN ALTER TABLE abc SPLIT AT VALUES ((SELECT 42))
Expand Down Expand Up @@ -179,7 +180,7 @@ vectorized: true

# IN expression transformed into semi-join.
query T
EXPLAIN (VERBOSE) SELECT a FROM abc WHERE a IN (SELECT a FROM abc WHERE b < 0)
EXPLAIN (VERBOSE) SELECT a FROM abc WHERE a IN (SELECT a FROM abc2 WHERE b < 0)
----
distribution: local
vectorized: true
Expand Down Expand Up @@ -209,7 +210,7 @@ vectorized: true
columns: (a, b)
ordering: +a
estimated row count: 1,000 (missing stats)
table: abc@abc_pkey
table: abc2@abc2_pkey
spans: FULL SCAN

query T
Expand Down Expand Up @@ -507,11 +508,11 @@ query T kvtrace
SELECT k, i, CASE WHEN k > 1 THEN (SELECT i FROM corr tmp WHERE k = corr.k-1) ELSE 0 END AS prev_i
FROM corr
----
Scan /Table/110/{1-2}
Scan /Table/110/1/1/0
Scan /Table/110/1/2/0
Scan /Table/110/1/3/0
Scan /Table/110/1/4/0
Scan /Table/111/{1-2}
Scan /Table/111/1/1/0
Scan /Table/111/1/2/0
Scan /Table/111/1/3/0
Scan /Table/111/1/4/0

# Case where the EXISTS subquery in a filter cannot be hoisted into an
# apply-join.
Expand Down
36 changes: 15 additions & 21 deletions pkg/sql/opt/exec/execbuilder/testdata/upsert
Original file line number Diff line number Diff line change
Expand Up @@ -107,30 +107,24 @@ vectorized: true
│ auto commit
│ arbiter indexes: kv_pkey
└── • lookup join (inner)
└── • render
│ columns: (k, v_default, k)
│ estimated row count: 2 (missing stats)
│ table: kv@kv_pkey
│ equality: (k) = (k)
│ equality cols are key
│ render k: k
│ render v_default: CAST(NULL AS INT8)
│ render k: k
└── • render
│ columns: (v_default, k)
│ render v_default: CAST(NULL AS INT8)
│ render k: k
└── • top-k
│ columns: (k, v)
│ ordering: -v
│ estimated row count: 2 (missing stats)
│ order: -v
│ k: 2
└── • top-k
│ columns: (k, v)
│ ordering: -v
│ estimated row count: 2 (missing stats)
│ order: -v
│ k: 2
└── • scan
columns: (k, v)
estimated row count: 1,000 (missing stats)
table: kv@kv_pkey
spans: FULL SCAN
└── • scan
columns: (k, v)
estimated row count: 1,000 (missing stats)
table: kv@kv_pkey
spans: FULL SCAN

# Use Upsert with indexed table, default columns, computed columns, and check
# columns.
Expand Down
Loading

0 comments on commit a87cfe6

Please sign in to comment.