From ceba034018eb80898bc0daf6ee8043ab405be5de Mon Sep 17 00:00:00 2001 From: Radu Berinde Date: Thu, 7 Mar 2019 11:24:09 -0500 Subject: [PATCH] opt: add more cost for lookup joins with more ON conditions This is a very limited fix for #34810. The core problem is that we don't take into account that if we have an ON condition, not only there's a cost to evaluate it on each row, but we are generating more internal rows to get a given number of output rows. I attempted to do a more general fix (for all join types), where I tried to estimate the "internal" number of rows using `unknownFilterSelectivity` for each ON condition. There were two problems: - in some cases (especially with lookup joins) we have an extra ON condition that doesn't actually do anything: `ab JOIN xy ON a=x AND a=10` becomes `ab JOIN xy ON a=x AND a=10 AND x=10` becomes and `a=10` could remain as an ON condition. This results in bad query plans in important cases (e.g. TPCC) where it prefers to do an extra lookup join (due to a non-covering index) just because of that condition. - we don't have the equality columns readily available for hash join (and didn't want to extract them each time we cost). In the future we may split the planning into a logical and physical stage, and we should then separate the logical joins from hash join. For 19.1, we simply simply add a cost for lookup joins that is proportional to the number of remaining ON conditions. This is the least disruptive method that still fixes the case observed in #34810. I will leave the issue open to address this properly in the next release. Note that although hash joins and merge joins have the same issue in principle, in practice we always generate these expressions with equality on all possible columns. Release note: None --- pkg/sql/opt/optgen/exprgen/testdata/join | 2 +- pkg/sql/opt/xform/coster.go | 27 +++ pkg/sql/opt/xform/testdata/coster/join | 155 ++++++++++++++++++ pkg/sql/opt/xform/testdata/coster/zone | 4 +- pkg/sql/opt/xform/testdata/external/tpcc | 4 +- .../opt/xform/testdata/external/tpcc-no-stats | 8 +- pkg/sql/opt/xform/testdata/rules/join | 4 +- 7 files changed, 193 insertions(+), 11 deletions(-) diff --git a/pkg/sql/opt/optgen/exprgen/testdata/join b/pkg/sql/opt/optgen/exprgen/testdata/join index 72a44b63f45c..94d66d22cac6 100644 --- a/pkg/sql/opt/optgen/exprgen/testdata/join +++ b/pkg/sql/opt/optgen/exprgen/testdata/join @@ -66,7 +66,7 @@ left-join (lookup abc@ab) ├── columns: t.public.abc.a:5(int) t.public.abc.b:6(int) ├── key columns: [5] = [5] ├── stats: [rows=333333.333] - ├── cost: 355060.02 + ├── cost: 358393.353 ├── scan t.public.def │ ├── columns: t.public.def.d:1(int) t.public.def.e:2(int) │ ├── stats: [rows=1000] diff --git a/pkg/sql/opt/xform/coster.go b/pkg/sql/opt/xform/coster.go index 95a316922567..ebf5e9722ec1 100644 --- a/pkg/sql/opt/xform/coster.go +++ b/pkg/sql/opt/xform/coster.go @@ -363,6 +363,33 @@ func (c *coster) computeLookupJoinCost(join *memo.LookupJoinExpr) memo.Cost { // cost of emitting the rows. numLookupCols := join.Cols.Difference(join.Input.Relational().OutputCols).Len() perRowCost := seqIOCostFactor + c.rowScanCost(join.Table, join.Index, numLookupCols) + + // Add a cost if we have to evaluate an ON condition on every row. The more + // leftover conditions, the more expensive it should be. We want to + // differentiate between two lookup joins where one uses only a subset of the + // columns. For example: + // abc JOIN xyz ON a=x AND b=y + // We could have a lookup join using an index on y (and left-over condition + // a=x), and another lookup join on an index on x,y. The latter is definitely + // preferable (the former could generate a lot of internal results that are + // then discarded). + // + // TODO(radu): we should take into account that the "internal" row count is + // higher, according to the selectivities of the conditions. Unfortunately + // this is very tricky, in particular because of left-over conditions that are + // not selective. + // For example: + // ab JOIN xy ON a=x AND x=10 + // becomes (during normalization): + // ab JOIN xy ON a=x AND a=10 AND x=10 + // which can become a lookup join with left-over condition x=10 which doesn't + // actually filter anything. + // + // TODO(radu): this should be extended to all join types. It's tricky for hash + // joins where we don't have the equality and leftover filters readily + // available. + perRowCost += cpuCostFactor * memo.Cost(len(join.On)) + cost += memo.Cost(join.Relational().Stats.RowCount) * perRowCost return cost } diff --git a/pkg/sql/opt/xform/testdata/coster/join b/pkg/sql/opt/xform/testdata/coster/join index 09417b25aa97..7e47f37927eb 100644 --- a/pkg/sql/opt/xform/testdata/coster/join +++ b/pkg/sql/opt/xform/testdata/coster/join @@ -166,3 +166,158 @@ index-join abc ├── cost: 10.306 ├── key: (1) └── fd: ()-->(3) + +# Regression test for #34810: make sure we pick the lookup join that uses +# all equality columns. + +exec-ddl +CREATE TABLE abcde ( + a TEXT NOT NULL, + b UUID NOT NULL, + c UUID NOT NULL, + d VARCHAR(255) NOT NULL, + e TEXT NOT NULL, + CONSTRAINT "primary" PRIMARY KEY (a, b, c), + UNIQUE INDEX idx_abd (a, b, d), + UNIQUE INDEX idx_abcd (a, b, c, d) +) +---- +TABLE abcde + ├── a string not null + ├── b uuid not null + ├── c uuid not null + ├── d string not null + ├── e string not null + ├── INDEX primary + │ ├── a string not null + │ ├── b uuid not null + │ └── c uuid not null + ├── INDEX idx_abd + │ ├── a string not null + │ ├── b uuid not null + │ ├── d string not null + │ └── c uuid not null (storing) + └── INDEX idx_abcd + ├── a string not null + ├── b uuid not null + ├── c uuid not null + └── d string not null + +exec-ddl +ALTER TABLE abcde INJECT STATISTICS '[ + { + "columns": ["a"], + "created_at": "2019-02-08 04:10:40.001179+00:00", + "row_count": 250000, + "distinct_count": 1 + }, + { + "columns": ["b"], + "created_at": "2019-02-08 04:10:40.119954+00:00", + "row_count": 250000, + "distinct_count": 2 + }, + { + "columns": ["d"], + "created_at": "2019-02-08 04:10:40.119954+00:00", + "row_count": 250000, + "distinct_count": 125000 + } +]' +---- + +exec-ddl +CREATE TABLE wxyz ( + w TEXT NOT NULL, + x UUID NOT NULL, + y UUID NOT NULL, + z TEXT NOT NULL, + CONSTRAINT "primary" PRIMARY KEY (w, x, y), + CONSTRAINT "foreign" FOREIGN KEY (w, x, y) REFERENCES abcde (a, b, c) +) +---- +TABLE wxyz + ├── w string not null + ├── x uuid not null + ├── y uuid not null + ├── z string not null + ├── INDEX primary + │ ├── w string not null + │ ├── x uuid not null + │ └── y uuid not null + └── FOREIGN KEY (w, x, y) REFERENCES t.public.abcde (a, b, c) + +exec-ddl +ALTER TABLE wxyz INJECT STATISTICS '[ + { + "columns": ["w"], + "created_at": "2019-02-08 04:10:40.001179+00:00", + "row_count": 10000, + "distinct_count": 1 + }, + { + "columns": ["x"], + "created_at": "2019-02-08 04:10:40.119954+00:00", + "row_count": 10000, + "distinct_count": 1 + }, + { + "columns": ["y"], + "created_at": "2019-02-08 04:10:40.119954+00:00", + "row_count": 10000, + "distinct_count": 2500 + } +]' +---- + +opt +SELECT w, x, y, z +FROM wxyz +INNER JOIN abcde +ON w = a AND x = b AND y = c +WHERE w = 'foo' AND x = '2AB23800-06B1-4E19-A3BB-DF3768B808D2' +ORDER BY d +LIMIT 10 +---- +project + ├── columns: w:1(string!null) x:2(uuid!null) y:3(uuid!null) z:4(string!null) [hidden: d:8(string!null)] + ├── cardinality: [0 - 10] + ├── stats: [rows=10] + ├── cost: 122481.301 + ├── key: (8) + ├── fd: ()-->(1,2), (3)-->(4,8), (8)-->(3,4) + ├── ordering: +8 opt(1,2) [actual: +8] + └── limit + ├── columns: w:1(string!null) x:2(uuid!null) y:3(uuid!null) z:4(string!null) a:5(string!null) b:6(uuid!null) c:7(uuid!null) d:8(string!null) + ├── internal-ordering: +8 opt(1,2,5,6) + ├── cardinality: [0 - 10] + ├── stats: [rows=10] + ├── cost: 122481.191 + ├── key: (7) + ├── fd: ()-->(1,2,5,6), (3)-->(4), (7)-->(8), (8)-->(7), (1)==(5), (5)==(1), (2)==(6), (6)==(2), (3)==(7), (7)==(3) + ├── ordering: +8 opt(1,2,5,6) [actual: +8] + ├── sort + │ ├── columns: w:1(string!null) x:2(uuid!null) y:3(uuid!null) z:4(string!null) a:5(string!null) b:6(uuid!null) c:7(uuid!null) d:8(string!null) + │ ├── stats: [rows=50048.8759, distinct(1)=1, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=2500, null(3)=0, distinct(4)=1000, null(4)=0, distinct(5)=1, null(5)=0, distinct(6)=1, null(6)=0, distinct(7)=2500, null(7)=0, distinct(8)=38781.1698, null(8)=0] + │ ├── cost: 122481.081 + │ ├── key: (7) + │ ├── fd: ()-->(1,2,5,6), (3)-->(4), (7)-->(8), (8)-->(7), (1)==(5), (5)==(1), (2)==(6), (6)==(2), (3)==(7), (7)==(3) + │ ├── ordering: +8 opt(1,2,5,6) [actual: +8] + │ └── inner-join (lookup abcde@idx_abcd) + │ ├── columns: w:1(string!null) x:2(uuid!null) y:3(uuid!null) z:4(string!null) a:5(string!null) b:6(uuid!null) c:7(uuid!null) d:8(string!null) + │ ├── key columns: [1 2 3] = [5 6 7] + │ ├── stats: [rows=50048.8759, distinct(1)=1, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=2500, null(3)=0, distinct(4)=1000, null(4)=0, distinct(5)=1, null(5)=0, distinct(6)=1, null(6)=0, distinct(7)=2500, null(7)=0, distinct(8)=38781.1698, null(8)=0] + │ ├── cost: 105853.783 + │ ├── key: (7) + │ ├── fd: ()-->(1,2,5,6), (3)-->(4), (7)-->(8), (8)-->(7), (1)==(5), (5)==(1), (2)==(6), (6)==(2), (3)==(7), (7)==(3) + │ ├── scan wxyz + │ │ ├── columns: w:1(string!null) x:2(uuid!null) y:3(uuid!null) z:4(string!null) + │ │ ├── constraint: /1/2/3: [/'foo'/'2ab23800-06b1-4e19-a3bb-df3768b808d2' - /'foo'/'2ab23800-06b1-4e19-a3bb-df3768b808d2'] + │ │ ├── stats: [rows=10000, distinct(1)=1, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=2500, null(3)=0, distinct(4)=1000, null(4)=0] + │ │ ├── cost: 10800.01 + │ │ ├── key: (3) + │ │ └── fd: ()-->(1,2), (3)-->(4) + │ └── filters + │ ├── a = 'foo' [type=bool, outer=(5), constraints=(/5: [/'foo' - /'foo']; tight), fd=()-->(5)] + │ └── b = '2ab23800-06b1-4e19-a3bb-df3768b808d2' [type=bool, outer=(6), constraints=(/6: [/'2ab23800-06b1-4e19-a3bb-df3768b808d2' - /'2ab23800-06b1-4e19-a3bb-df3768b808d2']; tight), fd=()-->(6)] + └── const: 10 [type=int] diff --git a/pkg/sql/opt/xform/testdata/coster/zone b/pkg/sql/opt/xform/testdata/coster/zone index 26432a72067a..3eb9e44ce7bc 100644 --- a/pkg/sql/opt/xform/testdata/coster/zone +++ b/pkg/sql/opt/xform/testdata/coster/zone @@ -317,7 +317,7 @@ inner-join (lookup xy@y2) ├── flags: no-merge-join;no-hash-join ├── key columns: [2] = [5] ├── stats: [rows=98.01, distinct(1)=9.9, null(1)=0, distinct(2)=1, null(2)=0, distinct(4)=9.9, null(4)=0, distinct(5)=1, null(5)=0] - ├── cost: 152.0444 + ├── cost: 153.0245 ├── key: (1,4) ├── fd: ()-->(2,5), (1)-->(3), (2,3)~~>(1), (2)==(5), (5)==(2) ├── prune: (1,3,4) @@ -359,7 +359,7 @@ inner-join (lookup xy@y1) ├── flags: no-merge-join;no-hash-join ├── key columns: [2] = [5] ├── stats: [rows=98.01, distinct(1)=9.9, null(1)=0, distinct(2)=1, null(2)=0, distinct(4)=9.9, null(4)=0, distinct(5)=1, null(5)=0] - ├── cost: 152.0444 + ├── cost: 153.0245 ├── key: (1,4) ├── fd: ()-->(2,5), (1)-->(3), (2,3)~~>(1), (2)==(5), (5)==(2) ├── prune: (1,3,4) diff --git a/pkg/sql/opt/xform/testdata/external/tpcc b/pkg/sql/opt/xform/testdata/external/tpcc index 00feeb0304c9..a58eb257b0ed 100644 --- a/pkg/sql/opt/xform/testdata/external/tpcc +++ b/pkg/sql/opt/xform/testdata/external/tpcc @@ -904,7 +904,7 @@ scalar-group-by ├── columns: count:28(int) ├── cardinality: [1 - 1] ├── stats: [rows=1] - ├── cost: 1.82111111 + ├── cost: 1.84111111 ├── key: () ├── fd: ()-->(28) ├── prune: (28) @@ -912,7 +912,7 @@ scalar-group-by │ ├── columns: ol_o_id:1(int!null) ol_d_id:2(int!null) ol_w_id:3(int!null) ol_i_id:5(int!null) s_i_id:11(int!null) s_w_id:12(int!null) s_quantity:13(int!null) │ ├── key columns: [3 5] = [12 11] │ ├── stats: [rows=1, distinct(1)=0.11109736, null(1)=0, distinct(2)=0.111097416, null(2)=0, distinct(3)=0.111111111, null(3)=0, distinct(5)=0.111111056, null(5)=0, distinct(11)=0.111111056, null(11)=0, distinct(12)=0.111111111, null(12)=0, distinct(13)=1, null(13)=0] - │ ├── cost: 1.79111111 + │ ├── cost: 1.81111111 │ ├── fd: ()-->(2,3,12), (11)-->(13), (5)==(11), (11)==(5), (3)==(12), (12)==(3) │ ├── interesting orderings: (+3,+2,-1) │ ├── scan order_line diff --git a/pkg/sql/opt/xform/testdata/external/tpcc-no-stats b/pkg/sql/opt/xform/testdata/external/tpcc-no-stats index 636cd49d3b7a..e9a6331c7d78 100644 --- a/pkg/sql/opt/xform/testdata/external/tpcc-no-stats +++ b/pkg/sql/opt/xform/testdata/external/tpcc-no-stats @@ -709,7 +709,7 @@ scalar-group-by ├── columns: count:28(int) ├── cardinality: [1 - 1] ├── stats: [rows=1] - ├── cost: 0.141477778 + ├── cost: 0.142211111 ├── key: () ├── fd: ()-->(28) ├── prune: (28) @@ -717,7 +717,7 @@ scalar-group-by │ ├── columns: ol_o_id:1(int!null) ol_d_id:2(int!null) ol_w_id:3(int!null) ol_i_id:5(int!null) s_i_id:11(int!null) s_w_id:12(int!null) s_quantity:13(int!null) │ ├── key columns: [3 5] = [12 11] │ ├── stats: [rows=0.0366666667, distinct(1)=0.0111105556, null(1)=0, distinct(2)=0.0111111111, null(2)=0, distinct(3)=0.0111111111, null(3)=0, distinct(5)=0.0111105556, null(5)=0, distinct(11)=0.0111105556, null(11)=0, distinct(12)=0.0111111111, null(12)=0, distinct(13)=0.0366666667, null(13)=0] - │ ├── cost: 0.121111111 + │ ├── cost: 0.121844444 │ ├── fd: ()-->(2,3,12), (11)-->(13), (5)==(11), (11)==(5), (3)==(12), (12)==(3) │ ├── interesting orderings: (+3,+2,-1) │ ├── scan order_line @@ -762,7 +762,7 @@ scalar-group-by ├── columns: count:22(int) ├── cardinality: [1 - 1] ├── stats: [rows=1] - ├── cost: 1588.01 + ├── cost: 1588.34 ├── key: () ├── fd: ()-->(22) ├── prune: (22) @@ -770,7 +770,7 @@ scalar-group-by │ ├── columns: w_id:1(int!null) w_ytd:9(decimal!null) d_w_id:11(int!null) sum:21(decimal!null) │ ├── key columns: [11] = [1] │ ├── stats: [rows=33, distinct(1)=33, null(1)=0, distinct(9)=28.3508504, null(9)=0, distinct(11)=33, null(11)=0, distinct(21)=28.3508504, null(21)=0] - │ ├── cost: 1587.66 + │ ├── cost: 1587.99 │ ├── key: (11) │ ├── fd: (1)-->(9), (11)-->(21), (1)==(11), (11)==(1) │ ├── interesting orderings: (+11) diff --git a/pkg/sql/opt/xform/testdata/rules/join b/pkg/sql/opt/xform/testdata/rules/join index dae0ca231323..00d22c265e4e 100644 --- a/pkg/sql/opt/xform/testdata/rules/join +++ b/pkg/sql/opt/xform/testdata/rules/join @@ -1879,11 +1879,11 @@ memo (optimized, ~11KB, required=[presentation: a:1]) ├── G1: (project G2 G3 a) │ └── [presentation: a:1] │ ├── best: (project G2 G3 a) - │ └── cost: 87.93 + │ └── cost: 88.05 ├── G2: (select G4 G5) (lookup-join G6 G5 t5,keyCols=[1],outCols=(1,2)) (select G7 G5) │ └── [] │ ├── best: (lookup-join G6 G5 t5,keyCols=[1],outCols=(1,2)) - │ └── cost: 87.80 + │ └── cost: 87.92 ├── G3: (projections) ├── G4: (scan t5,cols=(1,2)) │ └── []