Skip to content

Commit

Permalink
opt: hoist uncorrelated equality subqueries
Browse files Browse the repository at this point in the history
Uncorrelated subqueries that are in equality expressions with a variable
are now hoisted. When these expressions exist in a filter, hoisting the
subquery can allow the main query to plan a lookup join, rather than an
inefficient full-table scan.

For example, consider the table and query:

    CREATE TABLE t (
      a INT,
      INDEX (a)
    );

    SELECT * FROM t WHERE a = (SELECT max(a) FROM t);

Prior to this commit, the plan for this query required a full table
scan:

    select
     ├── columns: a:1
     ├── scan t@t_a_idx
     │    ├── columns: a:1
     │    └── constraint: /1/2: (/NULL - ]
     └── filters
          └── eq
               ├── a:1
               └── subquery
                    └── scalar-group-by
                         ├── columns: max:9
                         ├── scan t@t_a_idx,rev
                         │    ├── columns: a:5
                         │    ├── constraint: /5/6: (/NULL - ]
                         │    └── limit: 1(rev)
                         └── aggregations
                              └── const-agg [as=max:9, outer=(5)]
                                   └── a:5

By hoisting the subquery, the full table scan is replaced with a lookup
join:

    project
     ├── columns: a:1
     └── inner-join (lookup t@t_a_idx)
          ├── columns: a:1 max:9
          ├── key columns: [9] = [1]
          ├── scalar-group-by
          │    ├── columns: max:9
          │    ├── scan t@t_a_idx,rev
          │    │    ├── columns: a:5
          │    │    ├── constraint: /5/6: (/NULL - ]
          │    │    └── limit: 1(rev)
          │    └── aggregations
          │         └── const-agg [as=max:9, outer=(5)]
          │              └── a:5
          └── filters (true)

This hoisting is enabled by default, but can be disabled by setting the
`optimizer_hoist_uncorrelated_equality_subqueries` session setting to
`false`.

Fixes #83392
Informs #51820
Informs #93829
Informs #100855

Release note (performance improvement): Queries that have subqueries in
equality expressions are now more efficiently planned by the optimizer.
  • Loading branch information
mgartner committed Apr 12, 2023
1 parent d045de7 commit 209a9e0
Show file tree
Hide file tree
Showing 19 changed files with 726 additions and 466 deletions.
4 changes: 4 additions & 0 deletions pkg/sql/exec_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -3513,6 +3513,10 @@ func (m *sessionDataMutator) SetOptimizerAlwaysUseHistograms(val bool) {
m.data.OptimizerAlwaysUseHistograms = val
}

// SetOptimizerHoistUncorrelatedEqualitySubqueries stores val in the session
// data's OptimizerHoistUncorrelatedEqualitySubqueries field, which backs the
// optimizer_hoist_uncorrelated_equality_subqueries session setting.
func (m *sessionDataMutator) SetOptimizerHoistUncorrelatedEqualitySubqueries(val bool) {
m.data.OptimizerHoistUncorrelatedEqualitySubqueries = val
}

// SetEnableCreateStatsUsingExtremes stores val in the session data's
// EnableCreateStatsUsingExtremes field.
// NOTE(review): presumably backs a session setting of a similar name; the
// setting itself is not visible here, so confirm before relying on it.
func (m *sessionDataMutator) SetEnableCreateStatsUsingExtremes(val bool) {
m.data.EnableCreateStatsUsingExtremes = val
}
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/logictest/testdata/logic_test/information_schema
Original file line number Diff line number Diff line change
Expand Up @@ -5278,6 +5278,7 @@ on_update_rehome_row_enabled on
opt_split_scan_limit 2048
optimizer on
optimizer_always_use_histograms on
optimizer_hoist_uncorrelated_equality_subqueries on
optimizer_use_forecasts on
optimizer_use_histograms on
optimizer_use_improved_disjunction_stats on
Expand Down
3 changes: 3 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/pg_catalog
Original file line number Diff line number Diff line change
Expand Up @@ -2759,6 +2759,7 @@ null_ordered_last off NULL
on_update_rehome_row_enabled on NULL NULL NULL string
opt_split_scan_limit 2048 NULL NULL NULL string
optimizer_always_use_histograms on NULL NULL NULL string
optimizer_hoist_uncorrelated_equality_subqueries on NULL NULL NULL string
optimizer_use_forecasts on NULL NULL NULL string
optimizer_use_histograms on NULL NULL NULL string
optimizer_use_improved_disjunction_stats on NULL NULL NULL string
Expand Down Expand Up @@ -2913,6 +2914,7 @@ null_ordered_last off NULL
on_update_rehome_row_enabled on NULL user NULL on on
opt_split_scan_limit 2048 NULL user NULL 2048 2048
optimizer_always_use_histograms on NULL user NULL on on
optimizer_hoist_uncorrelated_equality_subqueries on NULL user NULL on on
optimizer_use_forecasts on NULL user NULL on on
optimizer_use_histograms on NULL user NULL on on
optimizer_use_improved_disjunction_stats on NULL user NULL on on
Expand Down Expand Up @@ -3067,6 +3069,7 @@ on_update_rehome_row_enabled NULL NULL NULL
opt_split_scan_limit NULL NULL NULL NULL NULL
optimizer NULL NULL NULL NULL NULL
optimizer_always_use_histograms NULL NULL NULL NULL NULL
optimizer_hoist_uncorrelated_equality_subqueries NULL NULL NULL NULL NULL
optimizer_use_forecasts NULL NULL NULL NULL NULL
optimizer_use_histograms NULL NULL NULL NULL NULL
optimizer_use_improved_disjunction_stats NULL NULL NULL NULL NULL
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/logictest/testdata/logic_test/show_source
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ null_ordered_last off
on_update_rehome_row_enabled on
opt_split_scan_limit 2048
optimizer_always_use_histograms on
optimizer_hoist_uncorrelated_equality_subqueries on
optimizer_use_forecasts on
optimizer_use_histograms on
optimizer_use_improved_disjunction_stats on
Expand Down
80 changes: 37 additions & 43 deletions pkg/sql/opt/exec/execbuilder/testdata/subquery
Original file line number Diff line number Diff line change
Expand Up @@ -121,64 +121,58 @@ vectorized: true
• root
│ columns: (a, b, c)
├── • filter
├── • project
│ │ columns: (a, b, c)
│ │ estimated row count: 333 (missing stats)
│ │ filter: a = @S2
│ │
│ └── • scan
│ columns: (a, b, c)
│ estimated row count: 1,000 (missing stats)
│ table: abc@abc_pkey
│ spans: FULL SCAN
├── • subquery
│ │ id: @S1
│ │ original sql: (SELECT * FROM abc WHERE c = (a + 3))
│ │ exec mode: one row
│ │
│ └── • render
│ │ columns: (column16)
│ │ render column16: true
│ └── • lookup join (inner)
│ │ columns: (any_not_null, a, b, c)
│ │ estimated row count: 1 (missing stats)
│ │ table: abc@abc_pkey
│ │ equality: (any_not_null) = (a)
│ │ equality cols are key
│ │
│ └── • limit
│ │ columns: (a, c)
│ │ count: 1
│ └── • group (scalar)
│ │ columns: (any_not_null)
│ │ estimated row count: 1 (missing stats)
│ │ aggregate 0: any_not_null(a)
│ │
│ └── • filter
│ │ columns: (a, c)
│ │ estimated row count: 330 (missing stats)
│ │ filter: c = (a + 3)
│ └── • limit
│ │ columns: (a)
│ │ count: 1
│ │
│ └── • scan
│ columns: (a, c)
│ estimated row count: 1,000 (missing stats)
│ table: abc@abc_pkey
│ spans: FULL SCAN (SOFT LIMIT)
│ └── • filter
│ │ columns: (a)
│ │ ordering: -a
│ │ estimated row count: 333 (missing stats)
│ │ filter: COALESCE(@S1, false)
│ │
│ └── • revscan
│ columns: (a)
│ ordering: -a
│ estimated row count: 1,000 (missing stats)
│ table: abc@abc_pkey
│ spans: FULL SCAN (SOFT LIMIT)
└── • subquery
│ id: @S2
│ original sql: (SELECT max(a) FROM abc WHERE EXISTS (SELECT * FROM abc WHERE c = (a + 3)))
│ id: @S1
│ original sql: (SELECT * FROM abc WHERE c = (a + 3))
│ exec mode: one row
└── • group (scalar)
│ columns: (any_not_null)
│ estimated row count: 1 (missing stats)
│ aggregate 0: any_not_null(a)
└── • render
│ columns: (column16)
│ render column16: true
└── • limit
│ columns: (a)
│ columns: (a, c)
│ count: 1
└── • filter
│ columns: (a)
│ ordering: -a
│ estimated row count: 333 (missing stats)
│ filter: COALESCE(@S1, false)
│ columns: (a, c)
│ estimated row count: 330 (missing stats)
│ filter: c = (a + 3)
└── • revscan
columns: (a)
ordering: -a
└── • scan
columns: (a, c)
estimated row count: 1,000 (missing stats)
table: abc@abc_pkey
spans: FULL SCAN (SOFT LIMIT)
Expand Down
23 changes: 13 additions & 10 deletions pkg/sql/opt/exec/execbuilder/testdata/tpch_vec
Original file line number Diff line number Diff line change
Expand Up @@ -20848,17 +20848,20 @@ EXPLAIN (VEC) SELECT s_suppkey, s_name, s_address, s_phone, total_revenue FROM s
----
└ Node 1
└ *colexecjoin.mergeJoinInnerOp
├ *colfetcher.ColBatchScan
└ *rowexec.joinReader
└ *colexec.sortOp
└ *colexecsel.selEQFloat64Float64Op
└ *colexecbase.castOpNullAny
└ *colexecbase.constNullOp
└ *colexec.hashAggregator
└ *colexecproj.projMultFloat64Float64Op
└ *colexecprojconst.projMinusFloat64ConstFloat64Op
└ *colfetcher.ColIndexJoin
└ *colfetcher.ColBatchScan
└ *colexecjoin.hashJoiner
├ *colexec.hashAggregator
│ └ *colexecproj.projMultFloat64Float64Op
│ └ *colexecprojconst.projMinusFloat64ConstFloat64Op
│ └ *colfetcher.ColIndexJoin
│ └ *colfetcher.ColBatchScan
└ *colexec.orderedAggregator
└ *colexec.hashAggregator
└ *colexecproj.projMultFloat64Float64Op
└ *colexecprojconst.projMinusFloat64ConstFloat64Op
└ *colfetcher.ColIndexJoin
└ *colfetcher.ColBatchScan

statement ok
DROP VIEW revenue0
Expand Down
60 changes: 30 additions & 30 deletions pkg/sql/opt/exec/execbuilder/testdata/udf
Original file line number Diff line number Diff line change
Expand Up @@ -116,48 +116,48 @@ EXPLAIN (VERBOSE) SELECT * FROM sub3 WHERE sub_fn() = 3 AND (SELECT max(a) FROM
distribution: local
vectorized: true
·
root
project
│ columns: (a)
├── • filter
│ │ columns: (a)
│ │ estimated row count: 111 (missing stats)
│ │ filter: (sub_fn() = 3) AND (a = @S1)
│ │
│ └── • scan
│ columns: (a)
│ estimated row count: 1,000 (missing stats)
│ table: sub3@sub3_pkey
│ spans: FULL SCAN
└── • subquery
│ id: @S1
│ original sql: (SELECT max(a) FROM sub2)
│ exec mode: one row
└── • lookup join (inner)
│ columns: (any_not_null, a)
│ estimated row count: 1 (missing stats)
│ table: sub3@sub3_pkey
│ equality: (any_not_null) = (a)
│ equality cols are key
│ pred: sub_fn() = 3
└── • group (scalar)
└── • filter
│ columns: (any_not_null)
│ estimated row count: 1 (missing stats)
aggregate 0: any_not_null(a)
│ estimated row count: 0 (missing stats)
filter: sub_fn() = 3
└── • revscan
columns: (a)
estimated row count: 1 (missing stats)
table: sub2@sub2_pkey
spans: LIMITED SCAN
limit: 1
└── • group (scalar)
│ columns: (any_not_null)
│ estimated row count: 1 (missing stats)
│ aggregate 0: any_not_null(a)
└── • revscan
columns: (a)
estimated row count: 1 (missing stats)
table: sub2@sub2_pkey
spans: LIMITED SCAN
limit: 1

statement ok
CREATE FUNCTION sub_fn_lt() RETURNS INT LANGUAGE SQL AS 'SELECT a FROM sub1 WHERE a < (SELECT max(a) FROM sub2)'

# The uncorrelated subquery in the UDF body is executed only once.
query T kvtrace
SELECT sub_fn()
SELECT sub_fn_lt()
----
Scan /Table/112/{1-2}
Scan /Table/113/{1-2}

# The uncorrelated subquery in the UDF body is executed only once per row
# produced by generate_series.
query T kvtrace
SELECT sub_fn() FROM generate_series(1, 3)
SELECT sub_fn_lt() FROM generate_series(1, 3)
----
Scan /Table/112/{1-2}
Scan /Table/113/{1-2}
Expand All @@ -174,12 +174,12 @@ CREATE FUNCTION sub_fn2() RETURNS INT LANGUAGE SQL AS 'SELECT a FROM sub1 WHERE
query T kvtrace
SELECT sub_fn2() FROM generate_series(1, 3)
----
Scan /Table/112/{1-2}
Scan /Table/113/1/30/0
Scan /Table/112/{1-2}
Scan /Table/112/1/30/0
Scan /Table/113/1/30/0
Scan /Table/112/{1-2}
Scan /Table/112/1/30/0
Scan /Table/113/1/30/0
Scan /Table/112/1/30/0

statement ok
CREATE FUNCTION sub_fn3() RETURNS INT LANGUAGE SQL AS 'SELECT a FROM sub1 WHERE EXISTS (SELECT a FROM sub2 WHERE a = 30)'
Expand Down
5 changes: 4 additions & 1 deletion pkg/sql/opt/memo/memo.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ type Memo struct {
useLimitOrderingForStreamingGroupBy bool
useImprovedSplitDisjunctionForJoins bool
alwaysUseHistograms bool
hoistUncorrelatedEqualitySubqueries bool

// curRank is the highest currently in-use scalar expression rank.
curRank opt.ScalarRank
Expand Down Expand Up @@ -221,6 +222,7 @@ func (m *Memo) Init(ctx context.Context, evalCtx *eval.Context) {
useLimitOrderingForStreamingGroupBy: evalCtx.SessionData().OptimizerUseLimitOrderingForStreamingGroupBy,
useImprovedSplitDisjunctionForJoins: evalCtx.SessionData().OptimizerUseImprovedSplitDisjunctionForJoins,
alwaysUseHistograms: evalCtx.SessionData().OptimizerAlwaysUseHistograms,
hoistUncorrelatedEqualitySubqueries: evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries,
}
m.metadata.Init()
m.logPropsBuilder.init(ctx, evalCtx, m)
Expand Down Expand Up @@ -362,7 +364,8 @@ func (m *Memo) IsStale(
m.useImprovedDisjunctionStats != evalCtx.SessionData().OptimizerUseImprovedDisjunctionStats ||
m.useLimitOrderingForStreamingGroupBy != evalCtx.SessionData().OptimizerUseLimitOrderingForStreamingGroupBy ||
m.useImprovedSplitDisjunctionForJoins != evalCtx.SessionData().OptimizerUseImprovedSplitDisjunctionForJoins ||
m.alwaysUseHistograms != evalCtx.SessionData().OptimizerAlwaysUseHistograms {
m.alwaysUseHistograms != evalCtx.SessionData().OptimizerAlwaysUseHistograms ||
m.hoistUncorrelatedEqualitySubqueries != evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries {
return true, nil
}

Expand Down
6 changes: 6 additions & 0 deletions pkg/sql/opt/memo/memo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,12 @@ func TestMemoIsStale(t *testing.T) {
evalCtx.SessionData().OptimizerAlwaysUseHistograms = false
notStale()

// Stale optimizer_hoist_uncorrelated_equality_subqueries.
evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries = true
stale()
evalCtx.SessionData().OptimizerHoistUncorrelatedEqualitySubqueries = false
notStale()

// Stale data sources and schema. Create new catalog so that data sources are
// recreated and can be modified independently.
catalog = testcat.New()
Expand Down
Loading

0 comments on commit 209a9e0

Please sign in to comment.