From 785566346fb2173e27f84c0ca85c28d5950809e6 Mon Sep 17 00:00:00 2001 From: Radu Berinde Date: Mon, 20 Aug 2018 17:19:59 -0400 Subject: [PATCH 1/3] opt: add disable flag to opttester Add a flag to norm/opt that disables specific rules. Release note: None --- pkg/sql/opt/testutils/opt_tester.go | 93 ++++++++++++++++++++++------- pkg/sql/opt/xform/optimizer.go | 7 --- 2 files changed, 71 insertions(+), 29 deletions(-) diff --git a/pkg/sql/opt/testutils/opt_tester.go b/pkg/sql/opt/testutils/opt_tester.go index ae2cfe6a58a9..f52db7a1d393 100644 --- a/pkg/sql/opt/testutils/opt_tester.go +++ b/pkg/sql/opt/testutils/opt_tester.go @@ -87,9 +87,12 @@ type OptTesterFlags struct { // output to stdout when commands run. Only certain commands support this. Verbose bool + // DisableRules is a set of rules that are not allowed to run. + DisableRules map[opt.RuleName]struct{} + // ExploreTraceRule restricts the ExploreTrace output to only show the effects // of a specific rule. - ExploreTraceRule string + ExploreTraceRule opt.RuleName ColStats []opt.ColSet } @@ -165,6 +168,10 @@ func NewOptTester(catalog opt.Catalog, sql string) *OptTester { // // - fully-qualify-names: fully qualify all column names in the test output. // +// - disable: disables optimizer rules by name. Examples: +// opt disable=ConstrainScan +// norm disable=(NegateOr,NegateAnd) +// // - rule: used with exploretrace; the value is the name of a rule. When // specified, the exploretrace output is filtered to only show expression // changes due to that specific rule. @@ -326,11 +333,30 @@ func (f *OptTesterFlags) Set(arg datadriven.CmdArg) error { // Hiding qualifications defeats the purpose. 
f.ExprFormat &= ^memo.ExprFmtHideQualifications + case "disable": + if len(arg.Vals) == 0 { + return fmt.Errorf("disable requires arguments") + } + if f.DisableRules == nil { + f.DisableRules = make(map[opt.RuleName]struct{}) + } + for _, s := range arg.Vals { + r, err := ruleFromString(s) + if err != nil { + return err + } + f.DisableRules[r] = struct{}{} + } + case "rule": if len(arg.Vals) != 1 { return fmt.Errorf("rule requires one argument") } - f.ExploreTraceRule = arg.Vals[0] + var err error + f.ExploreTraceRule, err = ruleFromString(arg.Vals[0]) + if err != nil { + return err + } case "colstat": if len(arg.Vals) == 0 { @@ -356,21 +382,40 @@ func (f *OptTesterFlags) Set(arg datadriven.CmdArg) error { // transformations applied to it. The untouched output of the optbuilder is the // final expression tree. func (ot *OptTester) OptBuild() (memo.ExprView, error) { - return ot.optimizeExpr(false /* allowNormalizations */, false /* allowExplorations */) + o := ot.makeOptimizer() + o.DisableOptimizations() + return ot.optimizeExpr(o) } // OptNorm constructs an opt expression tree for the SQL query, with all // normalization transformations applied to it. The normalized output of the // optbuilder is the final expression tree. func (ot *OptTester) OptNorm() (memo.ExprView, error) { - return ot.optimizeExpr(true /* allowNormalizations */, false /* allowExplorations */) + o := ot.makeOptimizer() + o.NotifyOnMatchedRule(func(ruleName opt.RuleName) bool { + if !ruleName.IsNormalize() { + return false + } + if _, disabled := ot.Flags.DisableRules[ruleName]; disabled { + return false + } + return true + }) + return ot.optimizeExpr(o) } // Optimize constructs an opt expression tree for the SQL query, with all // transformations applied to it. The result is the memo expression tree with // the lowest estimated cost. 
func (ot *OptTester) Optimize() (memo.ExprView, error) { - return ot.optimizeExpr(true /* allowNormalizations */, true /* allowExplorations */) + o := ot.makeOptimizer() + o.NotifyOnMatchedRule(func(ruleName opt.RuleName) bool { + if _, disabled := ot.Flags.DisableRules[ruleName]; disabled { + return false + } + return true + }) + return ot.optimizeExpr(o) } // Memo returns a string that shows the memo data structure that is constructed @@ -399,20 +444,16 @@ func (ot *OptTester) RuleStats() (string, error) { stats[i].rule = opt.RuleName(i) } - var o xform.Optimizer - o.Init(&ot.evalCtx) - + o := ot.makeOptimizer() o.NotifyOnAppliedRule( func(ruleName opt.RuleName, group memo.GroupID, expr memo.ExprOrdinal, added int) { stats[ruleName].numApplied++ stats[ruleName].numAdded += added }, ) - root, required, err := ot.buildExpr(o.Factory()) - if err != nil { + if _, err := ot.optimizeExpr(o); err != nil { return "", err } - o.Optimize(root, required) // Split the rules. var norm, explore []ruleStats @@ -629,7 +670,8 @@ func (ot *OptTester) ExploreTrace() (string, error) { break } - if ot.Flags.ExploreTraceRule != "" && et.LastRuleName().String() != ot.Flags.ExploreTraceRule { + if ot.Flags.ExploreTraceRule != opt.InvalidRuleName && + et.LastRuleName() != ot.Flags.ExploreTraceRule { continue } @@ -669,18 +711,13 @@ func (ot *OptTester) buildExpr( return b.Build() } -func (ot *OptTester) optimizeExpr( - allowNormalizations, allowExplorations bool, -) (memo.ExprView, error) { +func (ot *OptTester) makeOptimizer() *xform.Optimizer { var o xform.Optimizer o.Init(&ot.evalCtx) - if !allowExplorations { - if allowNormalizations { - o.DisableExplorations() - } else { - o.DisableOptimizations() - } - } + return &o +} + +func (ot *OptTester) optimizeExpr(o *xform.Optimizer) (memo.ExprView, error) { root, required, err := ot.buildExpr(o.Factory()) if err != nil { return memo.ExprView{}, err @@ -706,3 +743,15 @@ func (ot *OptTester) indent(str string) { ot.output(" %s\n", line) } } 
+ +// ruleFromString returns the rule that matches the given string, +// or InvalidRuleName and an error if there is no such rule. +func ruleFromString(str string) (opt.RuleName, error) { + for i := opt.RuleName(1); i < opt.NumRuleNames; i++ { + if i.String() == str { + return i, nil + } + } + + return opt.InvalidRuleName, fmt.Errorf("rule '%s' does not exist", str) +} diff --git a/pkg/sql/opt/xform/optimizer.go b/pkg/sql/opt/xform/optimizer.go index 44530ddb2c0c..fcd81b8d1de4 100644 --- a/pkg/sql/opt/xform/optimizer.go +++ b/pkg/sql/opt/xform/optimizer.go @@ -130,13 +130,6 @@ func (o *Optimizer) DisableOptimizations() { o.NotifyOnMatchedRule(func(opt.RuleName) bool { return false }) } -// DisableExplorations disables all explore rules. The normalized expression -// tree becomes the output expression tree (because no explore transforms are -// applied). -func (o *Optimizer) DisableExplorations() { - o.NotifyOnMatchedRule(func(ruleName opt.RuleName) bool { return ruleName.IsNormalize() }) -} - // NotifyOnMatchedRule sets a callback function which is invoked each time an // optimization rule (Normalize or Explore) has been matched by the optimizer. 
// If matchedRule is nil, then no notifications are sent, and all rules are From e1401e97b644fa201a319e8d56e3f727db1f2e66 Mon Sep 17 00:00:00 2001 From: Radu Berinde Date: Mon, 20 Aug 2018 17:20:43 -0400 Subject: [PATCH 2/3] opt: add testcases showing stats problem Release note: None --- pkg/sql/opt/memo/testdata/stats/join | 80 ++++++++++++++++++++++++++ pkg/sql/opt/memo/testdata/stats/select | 67 +++++++++++++++++++++ 2 files changed, 147 insertions(+) diff --git a/pkg/sql/opt/memo/testdata/stats/join b/pkg/sql/opt/memo/testdata/stats/join index 57afcd76743d..259ea2f8edb6 100644 --- a/pkg/sql/opt/memo/testdata/stats/join +++ b/pkg/sql/opt/memo/testdata/stats/join @@ -689,3 +689,83 @@ inner-join (merge) ├── right ordering: +5 └── filters [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ]), fd=(1)==(5), (5)==(1)] └── a = e [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ])] + +exec-ddl +CREATE TABLE uvw (u INT, v INT, w INT) +---- +TABLE uvw + ├── u int + ├── v int + ├── w int + ├── rowid int not null (hidden) + └── INDEX primary + └── rowid int not null (hidden) + +exec-ddl +CREATE TABLE xyz (x INT, y INT, z INT) +---- +TABLE xyz + ├── x int + ├── y int + ├── z int + ├── rowid int not null (hidden) + └── INDEX primary + └── rowid int not null (hidden) + +# Verify that two equivalent formulations of a join lead to similar statistics. +# In the first case, x=10 is pushed down; in the second case it is part of the +# ON condition. The latter formulation happens in practice when we convert to +# lookup join (we incorporate the filter back into the ON condition). +# TODO(radu): the second case has orders of magnitude smaller row count; fix +# this. 
+ +norm disable=(PushFilterIntoJoinLeftAndRight,PushFilterIntoJoinLeft,PushFilterIntoJoinRight,MapFilterIntoJoinLeft,MapFilterIntoJoinRight) +SELECT * FROM (SELECT * FROM uvw WHERE w=1) JOIN (SELECT * FROM xyz WHERE x=10) ON u=x +---- +inner-join + ├── columns: u:1(int!null) v:2(int) w:3(int!null) x:5(int!null) y:6(int) z:7(int) + ├── stats: [rows=1.429009, distinct(1)=1, distinct(5)=1] + ├── fd: ()-->(1,3,5), (1)==(5), (5)==(1) + ├── select + │ ├── columns: u:1(int) v:2(int) w:3(int!null) + │ ├── stats: [rows=1.42857143, distinct(1)=1.42813399, distinct(3)=1] + │ ├── fd: ()-->(3) + │ ├── scan uvw + │ │ ├── columns: u:1(int) v:2(int) w:3(int) + │ │ └── stats: [rows=1000, distinct(1)=700, distinct(3)=700] + │ └── filters [type=bool, outer=(3), constraints=(/3: [/1 - /1]; tight), fd=()-->(3)] + │ └── w = 1 [type=bool, outer=(3), constraints=(/3: [/1 - /1]; tight)] + ├── select + │ ├── columns: x:5(int!null) y:6(int) z:7(int) + │ ├── stats: [rows=1.42857143, distinct(5)=1] + │ ├── fd: ()-->(5) + │ ├── scan xyz + │ │ ├── columns: x:5(int) y:6(int) z:7(int) + │ │ └── stats: [rows=1000, distinct(5)=700] + │ └── filters [type=bool, outer=(5), constraints=(/5: [/10 - /10]; tight), fd=()-->(5)] + │ └── x = 10 [type=bool, outer=(5), constraints=(/5: [/10 - /10]; tight)] + └── filters [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ]), fd=(1)==(5), (5)==(1)] + └── u = x [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ])] + +norm disable=(PushFilterIntoJoinLeftAndRight,PushFilterIntoJoinLeft,PushFilterIntoJoinRight,MapFilterIntoJoinLeft,MapFilterIntoJoinRight) +SELECT * FROM (SELECT * FROM uvw WHERE w=1) JOIN xyz ON u=x AND x=10 +---- +inner-join + ├── columns: u:1(int!null) v:2(int) w:3(int!null) x:5(int!null) y:6(int) z:7(int) + ├── stats: [rows=0.0029154519, distinct(1)=0.0029154519, distinct(5)=0.0029154519] + ├── fd: ()-->(1,3,5), (1)==(5), (5)==(1) + ├── select + │ ├── columns: u:1(int) v:2(int) w:3(int!null) + │ ├── stats: 
[rows=1.42857143, distinct(1)=1.42813399, distinct(3)=1] + │ ├── fd: ()-->(3) + │ ├── scan uvw + │ │ ├── columns: u:1(int) v:2(int) w:3(int) + │ │ └── stats: [rows=1000, distinct(1)=700, distinct(3)=700] + │ └── filters [type=bool, outer=(3), constraints=(/3: [/1 - /1]; tight), fd=()-->(3)] + │ └── w = 1 [type=bool, outer=(3), constraints=(/3: [/1 - /1]; tight)] + ├── scan xyz + │ ├── columns: x:5(int) y:6(int) z:7(int) + │ └── stats: [rows=1000, distinct(5)=700] + └── filters [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: [/10 - /10]), fd=()-->(1,5), (1)==(5), (5)==(1)] + ├── u = x [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ])] + └── x = 10 [type=bool, outer=(5), constraints=(/5: [/10 - /10]; tight)] diff --git a/pkg/sql/opt/memo/testdata/stats/select b/pkg/sql/opt/memo/testdata/stats/select index c6ba89f7fbd5..e3951a8dbc7d 100644 --- a/pkg/sql/opt/memo/testdata/stats/select +++ b/pkg/sql/opt/memo/testdata/stats/select @@ -588,3 +588,70 @@ select └── filters [type=bool, outer=(1), constraints=(/1: [/0 - /99]; tight)] ├── x >= 0 [type=bool, outer=(1), constraints=(/1: [/0 - ]; tight)] └── x < 100 [type=bool, outer=(1), constraints=(/1: (/NULL - /99]; tight)] + +exec-ddl +CREATE TABLE uvw (u INT, v INT, w INT) +---- +TABLE uvw + ├── u int + ├── v int + ├── w int + ├── rowid int not null (hidden) + └── INDEX primary + └── rowid int not null (hidden) + +# Test selectivity calculations by applying the two constraints in different +# orders. +# TODO(radu): applying both constraints at the same time results in much lower +# estimates. 
+norm +SELECT * FROM uvw WHERE u=v AND u=10 +---- +select + ├── columns: u:1(int!null) v:2(int!null) w:3(int) + ├── stats: [rows=0.00204081633, distinct(1)=0.00204081633, distinct(2)=0.00204081633] + ├── fd: ()-->(1,2), (1)==(2), (2)==(1) + ├── scan uvw + │ ├── columns: u:1(int) v:2(int) w:3(int) + │ └── stats: [rows=1000, distinct(1)=700, distinct(2)=700] + └── filters [type=bool, outer=(1,2), constraints=(/1: [/10 - /10]; /2: (/NULL - ]), fd=()-->(1,2), (1)==(2), (2)==(1)] + ├── u = v [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ])] + └── u = 10 [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight)] + +norm disable=MergeSelects +SELECT * FROM (SELECT * FROM uvw WHERE u=10) WHERE u=v +---- +select + ├── columns: u:1(int!null) v:2(int!null) w:3(int) + ├── stats: [rows=1.0003063, distinct(1)=1, distinct(2)=1] + ├── fd: ()-->(1,2), (1)==(2), (2)==(1) + ├── select + │ ├── columns: u:1(int!null) v:2(int) w:3(int) + │ ├── stats: [rows=1.42857143, distinct(1)=1, distinct(2)=1.42813399] + │ ├── fd: ()-->(1) + │ ├── scan uvw + │ │ ├── columns: u:1(int) v:2(int) w:3(int) + │ │ └── stats: [rows=1000, distinct(1)=700, distinct(2)=700] + │ └── filters [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight), fd=()-->(1)] + │ └── u = 10 [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight)] + └── filters [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ]), fd=(1)==(2), (2)==(1)] + └── u = v [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ])] + +norm disable=MergeSelects +SELECT * FROM (SELECT * FROM uvw WHERE u=v) WHERE u=10 +---- +select + ├── columns: u:1(int!null) v:2(int!null) w:3(int) + ├── stats: [rows=1, distinct(1)=1] + ├── fd: ()-->(1,2), (1)==(2), (2)==(1) + ├── select + │ ├── columns: u:1(int!null) v:2(int!null) w:3(int) + │ ├── stats: [rows=1.42857143, distinct(1)=1.42857143, distinct(2)=1.42857143] + │ ├── fd: (1)==(2), (2)==(1) + │ ├── scan uvw + │ │ ├── columns: u:1(int) v:2(int) w:3(int) + 
│ │ └── stats: [rows=1000, distinct(1)=700, distinct(2)=700] + │ └── filters [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ]), fd=(1)==(2), (2)==(1)] + │ └── u = v [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ])] + └── filters [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight), fd=()-->(1)] + └── u = 10 [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight)] From eb7d824bd232477a2df4021377a9245db730a833 Mon Sep 17 00:00:00 2001 From: Radu Berinde Date: Wed, 15 Aug 2018 11:14:44 -0400 Subject: [PATCH 3/3] opt: reorganize selectivity calculations Fix a few issues with selectivity calculations, related to the handling of constraints in conjunction with equivalencies. In principle, applying the filter `a=10 AND a=b` should result in stats similar to applying `a=10` and then applying `a=b`, but this was not the case. Changes: - We were updating the distinct counts to take into account the equivalencies before we estimated selectivity. This can lead to the selectivity of some constraints effectively being applied twice. The call to applyEquivalencies is moved after the selectivity calculations. - When calculating selectivity from distinct counts we were taking into account equivalencies, even though we later apply the selectivity of equivalencies separately. The fix is to only look at the distinct count of each column, rather than taking the minimum across the equivalency group. - When calculating selectivity for equivalency, we were looking at the "input" distinct counts. But if some distinct counts were updated by a constraint, we want to use the updated values; so the code now checks `ColStats` first. An example with some intuition for this: when applying a filter `a=10 AND a=b`, the behavior should be the same as first applying filter `a=10` and then applying `a=b`. So the "input" to the equivalency calculation needs to reflect `a=10`. 
- Finally, another difference when handling a conjunction is that we only calculate RowCount and bound the distinct counts at the end. We now update the RowCount "in step" with Selectivity; in addition, RowCount is used as an upper bound when consulting "input" stats. Release note: None --- pkg/sql/opt/memo/statistics_builder.go | 213 +++++++------------- pkg/sql/opt/memo/statistics_builder_test.go | 11 +- pkg/sql/opt/memo/testdata/stats/join | 4 +- pkg/sql/opt/memo/testdata/stats/select | 6 +- pkg/sql/opt/props/statistics.go | 56 +++++ pkg/sql/opt/xform/testdata/external/tpcc | 6 +- 6 files changed, 136 insertions(+), 160 deletions(-) diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go index 6a7f08679757..41ad4de2ef26 100644 --- a/pkg/sql/opt/memo/statistics_builder.go +++ b/pkg/sql/opt/memo/statistics_builder.go @@ -444,28 +444,27 @@ func (sb *statisticsBuilder) buildScan(ev ExprView, relProps *props.Relational) } def := ev.Private().(*ScanOpDef) + inputStats := sb.makeTableStatistics(def.Table) + s.RowCount = inputStats.RowCount + if def.Constraint != nil { // Calculate distinct counts for constrained columns // ------------------------------------------------- applied := sb.applyConstraint(def.Constraint, ev, relProps) - // Calculate selectivity - // --------------------- + // Calculate row count and selectivity + // ----------------------------------- if applied { var cols opt.ColSet for i := 0; i < def.Constraint.Columns.Count(); i++ { cols.Add(int(def.Constraint.Columns.Get(i).ID())) } - s.Selectivity = sb.selectivityFromDistinctCounts(cols, ev, relProps) + s.ApplySelectivity(sb.selectivityFromDistinctCounts(cols, ev, s)) } else { - s.Selectivity = sb.selectivityFromUnappliedConstraints(1 /* numUnappliedConstraints */) + s.ApplySelectivity(sb.selectivityFromUnappliedConstraints(1 /* numUnappliedConstraints */)) } } - // Calculate row count - // ------------------- - inputStats := sb.makeTableStatistics(def.Table) - 
sb.applySelectivity(inputStats.RowCount, s) sb.finalizeFromCardinality(relProps) } @@ -477,7 +476,7 @@ func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, ev ExprView) *props. colStat := sb.copyColStat(colSet, s, sb.colStatTable(def.Table, colSet)) if s.Selectivity != 1 { tableStats := sb.makeTableStatistics(def.Table) - sb.applySelectivityToColStat(colStat, s.Selectivity, tableStats.RowCount) + colStat.ApplySelectivity(s.Selectivity, tableStats.RowCount) } // Cap distinct count at limit, if it exists. @@ -535,23 +534,25 @@ func (sb *statisticsBuilder) buildSelect(ev ExprView, relProps *props.Relational // Calculate distinct counts for constrained columns // ------------------------------------------------- - numUnappliedConstraints, constrainedCols := sb.applyFilter(filter, equivReps, ev, relProps) + numUnappliedConstraints, constrainedCols := sb.applyFilter(filter, ev, relProps) // Try to reduce the number of columns used for selectivity // calculation based on functional dependencies. 
inputFD := &ev.Child(0).Logical().Relational.FuncDeps constrainedCols = sb.tryReduceCols(constrainedCols, s, inputFD) - // Calculate selectivity - // --------------------- - s.Selectivity = sb.selectivityFromDistinctCounts(constrainedCols, ev, relProps) - s.Selectivity *= sb.selectivityFromEquivalencies(equivReps, filterFD, ev, relProps) - s.Selectivity *= sb.selectivityFromUnappliedConstraints(numUnappliedConstraints) - - // Calculate row count - // ------------------- + // Calculate selectivity and row count + // ----------------------------------- inputStats := &ev.childGroup(0).logical.Relational.Stats - sb.applySelectivity(inputStats.RowCount, s) + s.RowCount = inputStats.RowCount + s.ApplySelectivity(sb.selectivityFromDistinctCounts(constrainedCols, ev, s)) + s.ApplySelectivity(sb.selectivityFromEquivalencies(equivReps, filterFD, ev, s)) + s.ApplySelectivity(sb.selectivityFromUnappliedConstraints(numUnappliedConstraints)) + + // Update distinct counts based on equivalencies; this should happen after + // selectivityFromDistinctCounts and selectivityFromEquivalencies. 
+ sb.applyEquivalencies(equivReps, filterFD, ev, relProps) + sb.finalizeFromCardinality(relProps) } @@ -560,7 +561,7 @@ func (sb *statisticsBuilder) colStatSelect(colSet opt.ColSet, ev ExprView) *prop s := &relProps.Stats inputStats := &ev.childGroup(0).logical.Relational.Stats colStat := sb.copyColStatFromChild(colSet, ev, 0 /* childIdx */, s) - sb.applySelectivityToColStat(colStat, s.Selectivity, inputStats.RowCount) + colStat.ApplySelectivity(s.Selectivity, inputStats.RowCount) return colStat } @@ -688,7 +689,7 @@ func (sb *statisticsBuilder) buildJoin(ev ExprView, relProps *props.Relational) // Calculate distinct counts for constrained columns in the ON conditions // ---------------------------------------------------------------------- - numUnappliedConstraints, constrainedCols := sb.applyFilter(on, equivReps, ev, relProps) + numUnappliedConstraints, constrainedCols := sb.applyFilter(on, ev, relProps) // Try to reduce the number of columns used for selectivity // calculation based on functional dependencies. 
@@ -698,16 +699,16 @@ func (sb *statisticsBuilder) buildJoin(ev ExprView, relProps *props.Relational) rightCols = sb.tryReduceCols(rightCols, s, &rightProps.FuncDeps) constrainedCols = leftCols.Union(rightCols) - // Calculate selectivity - // --------------------- - s.Selectivity = sb.selectivityFromDistinctCounts(constrainedCols, ev, relProps) - s.Selectivity *= sb.selectivityFromEquivalencies(equivReps, filterFD, ev, relProps) - s.Selectivity *= sb.selectivityFromUnappliedConstraints(numUnappliedConstraints) + // Calculate selectivity and row count + // ----------------------------------- + s.RowCount = leftStats.RowCount * rightStats.RowCount + s.ApplySelectivity(sb.selectivityFromDistinctCounts(constrainedCols, ev, s)) + s.ApplySelectivity(sb.selectivityFromEquivalencies(equivReps, filterFD, ev, s)) + s.ApplySelectivity(sb.selectivityFromUnappliedConstraints(numUnappliedConstraints)) - // Calculate row count - // ------------------- - inputRowCount := leftStats.RowCount * rightStats.RowCount - sb.applySelectivity(inputRowCount, s) + // Update distinct counts based on equivalencies; this should happen after + // selectivityFromDistinctCounts and selectivityFromEquivalencies. + sb.applyEquivalencies(equivReps, filterFD, ev, relProps) // The above calculation is for inner joins. Tweak the row count and // distinct counts for other types of joins. @@ -772,7 +773,7 @@ func (sb *statisticsBuilder) colStatJoin(colSet opt.ColSet, ev ExprView) *props. case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp: // Column stats come from left side of join. colStat := sb.copyColStatFromChild(colSet, ev, 0 /* childIdx */, s) - sb.applySelectivityToColStat(colStat, s.Selectivity, leftStats.RowCount) + colStat.ApplySelectivity(s.Selectivity, leftStats.RowCount) return colStat default: @@ -801,13 +802,13 @@ func (sb *statisticsBuilder) colStatJoin(colSet opt.ColSet, ev ExprView) *props. 
colStat = sb.copyColStatFromChild(leftCols, ev, 0 /* childIdx */, s) switch ev.Operator() { case opt.InnerJoinOp, opt.InnerJoinApplyOp, opt.RightJoinOp, opt.RightJoinApplyOp: - sb.applySelectivityToColStat(colStat, s.Selectivity, inputRowCount) + colStat.ApplySelectivity(s.Selectivity, inputRowCount) } } else if leftCols.Empty() { colStat = sb.copyColStatFromChild(rightCols, ev, 1 /* childIdx */, s) switch ev.Operator() { case opt.InnerJoinOp, opt.InnerJoinApplyOp, opt.LeftJoinOp, opt.LeftJoinApplyOp: - sb.applySelectivityToColStat(colStat, s.Selectivity, inputRowCount) + colStat.ApplySelectivity(s.Selectivity, inputRowCount) } } else { // Make a copy of the input column stats so we don't modify the originals. @@ -815,12 +816,14 @@ func (sb *statisticsBuilder) colStatJoin(colSet opt.ColSet, ev ExprView) *props. rightColStat := *sb.colStatFromChild(rightCols, ev, 1 /* childIdx */) switch ev.Operator() { case opt.InnerJoinOp, opt.InnerJoinApplyOp: - sb.applySelectivityToColStat(&leftColStat, s.Selectivity, inputRowCount) - sb.applySelectivityToColStat(&rightColStat, s.Selectivity, inputRowCount) + leftColStat.ApplySelectivity(s.Selectivity, inputRowCount) + rightColStat.ApplySelectivity(s.Selectivity, inputRowCount) + case opt.LeftJoinOp, opt.LeftJoinApplyOp: - sb.applySelectivityToColStat(&rightColStat, s.Selectivity, inputRowCount) + rightColStat.ApplySelectivity(s.Selectivity, inputRowCount) + case opt.RightJoinOp, opt.RightJoinApplyOp: - sb.applySelectivityToColStat(&leftColStat, s.Selectivity, inputRowCount) + leftColStat.ApplySelectivity(s.Selectivity, inputRowCount) } colStat = sb.makeColStat(colSet, s) colStat.DistinctCount = leftColStat.DistinctCount * rightColStat.DistinctCount @@ -1056,7 +1059,7 @@ func (sb *statisticsBuilder) colStatLimit(colSet opt.ColSet, ev ExprView) *props colStat := sb.copyColStatFromChild(colSet, ev, 0 /* childIdx */, s) // Scale distinct count based on the selectivity of the limit operation. 
- sb.applySelectivityToColStat(colStat, s.Selectivity, inputStats.RowCount) + colStat.ApplySelectivity(s.Selectivity, inputStats.RowCount) return colStat } @@ -1098,7 +1101,7 @@ func (sb *statisticsBuilder) colStatOffset(colSet opt.ColSet, ev ExprView) *prop colStat := sb.copyColStatFromChild(colSet, ev, 0 /* childIdx */, s) // Scale distinct count based on the selectivity of the offset operation. - sb.applySelectivityToColStat(colStat, s.Selectivity, inputStats.RowCount) + colStat.ApplySelectivity(s.Selectivity, inputStats.RowCount) return colStat } @@ -1366,9 +1369,9 @@ const ( unknownDistinctCountRatio = 0.7 ) -// applyFilter uses constraints and FD equivalencies to update the distinct -// counts for the constrained columns in the filter. These distinct counts -// will be used later to determine the selectivity of the filter. +// applyFilter uses constraints to update the distinct counts for the +// constrained columns in the filter. The changes in the distinct counts will be +// used later to determine the selectivity of the filter. // // Some filters can be translated directly to distinct counts using the // constraint set. For example, the tight constraint `/a: [/1 - /1]` indicates @@ -1384,7 +1387,7 @@ const ( // See applyEquivalencies and selectivityFromEquivalencies for details. 
// func (sb *statisticsBuilder) applyFilter( - filter ExprView, equivReps opt.ColSet, ev ExprView, relProps *props.Relational, + filter ExprView, ev ExprView, relProps *props.Relational, ) (numUnappliedConstraints int, constrainedCols opt.ColSet) { constraintSet := filter.Logical().Scalar.Constraints tight := filter.Logical().Scalar.TightConstraints @@ -1425,8 +1428,6 @@ func (sb *statisticsBuilder) applyFilter( } } - filterFD := &filter.Logical().Scalar.FuncDeps - sb.applyEquivalencies(equivReps, filterFD, ev, relProps) return numUnappliedConstraints, constrainedCols } @@ -1595,7 +1596,7 @@ func (sb *statisticsBuilder) updateDistinctCountsFromEquivalency( s := &relProps.Stats // Find the minimum distinct count for all columns in this equivalency group. - minDistinctCount := math.MaxFloat64 + minDistinctCount := s.RowCount equivGroup.ForEach(func(i int) { col := opt.ColumnID(i) colStat, ok := s.ColStats[col] @@ -1628,83 +1629,62 @@ func (sb *statisticsBuilder) updateDistinctCountsFromEquivalency( // columns} // // This selectivity will be used later to update the row count and the -// distinct count for the unconstrained columns in applySelectivityToColStat. +// distinct count for the unconstrained columns. // -// If some of the columns are equivalent, this algorithm only uses one column -// from each equivalency group, and chooses the most selective column -// (i.e., the one with lowest selectivity). Otherwise, this algorithm assumes -// the columns are completely independent. +// This algorithm assumes the columns are completely independent. 
// func (sb *statisticsBuilder) selectivityFromDistinctCounts( - cols opt.ColSet, ev ExprView, relProps *props.Relational, + cols opt.ColSet, ev ExprView, s *props.Statistics, ) (selectivity float64) { - s := &relProps.Stats - fd := &relProps.FuncDeps - var seen opt.ColSet - selectivity = 1.0 for col, ok := cols.Next(0); ok; col, ok = cols.Next(col + 1) { - if seen.Contains(col) { - // If an equivalent column was already included in the selectivity - // calculation, don't include this one. - continue - } colStat, ok := s.ColStats[opt.ColumnID(col)] if !ok { continue } - localSelectivity := 1.0 - eqCols := fd.ComputeEquivClosure(colStat.Cols) - eqCols.ForEach(func(i int) { - col := opt.ColumnID(i) - if eqStat, ok := s.ColStats[col]; ok { - localSelectivity = min( - localSelectivity, sb.selectivityFromDistinctCount(eqStat, ev, relProps), - ) - } - }) - - seen.UnionWith(eqCols) - selectivity *= localSelectivity + inputStat := sb.colStatFromInput(colStat.Cols, ev) + if inputStat.DistinctCount != 0 && colStat.DistinctCount < inputStat.DistinctCount { + selectivity *= colStat.DistinctCount / inputStat.DistinctCount + } } return selectivity } -func (sb *statisticsBuilder) selectivityFromDistinctCount( - colStat *props.ColumnStatistic, ev ExprView, relProps *props.Relational, -) float64 { - inputStat := sb.colStatFromInput(colStat.Cols, ev) - if inputStat.DistinctCount != 0 && colStat.DistinctCount < inputStat.DistinctCount { - return colStat.DistinctCount / inputStat.DistinctCount - } - return 1.0 -} - +// selectivityFromEquivalencies determines the selectivity of equality +// constraints. It must be called before applyEquivalencies. 
func (sb *statisticsBuilder) selectivityFromEquivalencies( - equivReps opt.ColSet, filterFD *props.FuncDepSet, ev ExprView, relProps *props.Relational, + equivReps opt.ColSet, filterFD *props.FuncDepSet, ev ExprView, s *props.Statistics, ) (selectivity float64) { selectivity = 1.0 equivReps.ForEach(func(i int) { equivGroup := filterFD.ComputeEquivGroup(opt.ColumnID(i)) - selectivity *= sb.selectivityFromEquivalency(equivGroup, ev, relProps) + selectivity *= sb.selectivityFromEquivalency(equivGroup, ev, s) }) return selectivity } func (sb *statisticsBuilder) selectivityFromEquivalency( - equivGroup opt.ColSet, ev ExprView, relProps *props.Relational, + equivGroup opt.ColSet, ev ExprView, s *props.Statistics, ) (selectivity float64) { // Find the maximum input distinct count for all columns in this equivalency // group. maxDistinctCount := float64(0) equivGroup.ForEach(func(i int) { - inputColStat := sb.colStatFromInput(util.MakeFastIntSet(i), ev) - if maxDistinctCount < inputColStat.DistinctCount { - maxDistinctCount = inputColStat.DistinctCount + // If any of the distinct counts were updated by the filter, we want to use + // the updated value. + colStat, ok := s.ColStats[opt.ColumnID(i)] + if !ok { + colStat = sb.colStatFromInput(util.MakeFastIntSet(i), ev) + } + if maxDistinctCount < colStat.DistinctCount { + maxDistinctCount = colStat.DistinctCount } }) + if maxDistinctCount > s.RowCount { + maxDistinctCount = s.RowCount + } // The selectivity of an equality condition var1=var2 is // 1/max(distinct(var1), distinct(var2)). @@ -1715,59 +1695,6 @@ func (sb *statisticsBuilder) selectivityFromEquivalency( return selectivity } -// applySelectivityToColStat updates the given column statistics according to -// the filter selectivity. 
-func (sb *statisticsBuilder) applySelectivityToColStat( - colStat *props.ColumnStatistic, selectivity, inputRows float64, -) { - if selectivity == 0 || colStat.DistinctCount == 0 { - colStat.DistinctCount = 0 - return - } - - n := inputRows - d := colStat.DistinctCount - - // If each distinct value appears n/d times, and the probability of a - // row being filtered out is (1 - selectivity), the probability that all - // n/d rows are filtered out is (1 - selectivity)^(n/d). So the expected - // number of values that are filtered out is d*(1 - selectivity)^(n/d). - // - // This formula returns d * selectivity when d=n but is closer to d - // when d << n. - colStat.DistinctCount = d - d*math.Pow(1-selectivity, n/d) -} - -// applySelectivity updates the row count according to the filter selectivity, -// and ensures that no distinct counts are larger than the row count. -func (sb *statisticsBuilder) applySelectivity(inputRows float64, s *props.Statistics) { - if s.Selectivity == 0 { - sb.updateStatsFromContradiction(s) - return - } - - s.RowCount = inputRows * s.Selectivity - - // At this point we only have single-column stats on columns that were - // constrained by the filter. Make sure none of the distinct counts are - // larger than the row count. - for _, colStat := range s.ColStats { - colStat.DistinctCount = min(colStat.DistinctCount, s.RowCount) - } -} - -// updateStatsFromContradiction sets the row count and distinct count to zero, -// since a contradiction results in 0 rows. 
-func (sb *statisticsBuilder) updateStatsFromContradiction(s *props.Statistics) { - s.RowCount = 0 - for i := range s.ColStats { - s.ColStats[i].DistinctCount = 0 - } - for i := range s.MultiColStats { - s.MultiColStats[i].DistinctCount = 0 - } -} - func (sb *statisticsBuilder) selectivityFromUnappliedConstraints( numUnappliedConstraints int, ) (selectivity float64) { diff --git a/pkg/sql/opt/memo/statistics_builder_test.go b/pkg/sql/opt/memo/statistics_builder_test.go index 257ac1224bdd..832a31ba4365 100644 --- a/pkg/sql/opt/memo/statistics_builder_test.go +++ b/pkg/sql/opt/memo/statistics_builder_test.go @@ -128,13 +128,10 @@ func TestGetStatsFromConstraint(t *testing.T) { // Calculate distinct counts. numUnappliedConstraints := sb.applyConstraintSet(cs, ev, relProps) - // Calculate selectivity. - s.Selectivity *= sb.selectivityFromDistinctCounts(cols, ev, relProps) - s.Selectivity *= sb.selectivityFromUnappliedConstraints(numUnappliedConstraints) - - // Calculate row count. - inputRows := mem.GroupProperties(scanGroup).Relational.Stats.RowCount - sb.applySelectivity(inputRows, s) + // Calculate row count and selectivity. + s.RowCount = mem.GroupProperties(scanGroup).Relational.Stats.RowCount + s.ApplySelectivity(sb.selectivityFromDistinctCounts(cols, ev, s)) + s.ApplySelectivity(sb.selectivityFromUnappliedConstraints(numUnappliedConstraints)) // Check if the statistics match the expected value. testStats(t, s, expectedStats, expectedSelectivity) diff --git a/pkg/sql/opt/memo/testdata/stats/join b/pkg/sql/opt/memo/testdata/stats/join index 259ea2f8edb6..6ff13b99e4a5 100644 --- a/pkg/sql/opt/memo/testdata/stats/join +++ b/pkg/sql/opt/memo/testdata/stats/join @@ -716,8 +716,6 @@ TABLE xyz # In the first case, x=10 is pushed down; in the second case it is part of the # ON condition. The latter formulation happens in practice when we convert to # lookup join (we incorporate the filter back into the ON condition). 
-# TODO(radu): the second case has orders of magnitude smaller row count; fix -# this. norm disable=(PushFilterIntoJoinLeftAndRight,PushFilterIntoJoinLeft,PushFilterIntoJoinRight,MapFilterIntoJoinLeft,MapFilterIntoJoinRight) SELECT * FROM (SELECT * FROM uvw WHERE w=1) JOIN (SELECT * FROM xyz WHERE x=10) ON u=x @@ -752,7 +750,7 @@ SELECT * FROM (SELECT * FROM uvw WHERE w=1) JOIN xyz ON u=x AND x=10 ---- inner-join ├── columns: u:1(int!null) v:2(int) w:3(int!null) x:5(int!null) y:6(int) z:7(int) - ├── stats: [rows=0.0029154519, distinct(1)=0.0029154519, distinct(5)=0.0029154519] + ├── stats: [rows=1.429009, distinct(1)=1, distinct(5)=1] ├── fd: ()-->(1,3,5), (1)==(5), (5)==(1) ├── select │ ├── columns: u:1(int) v:2(int) w:3(int!null) diff --git a/pkg/sql/opt/memo/testdata/stats/select b/pkg/sql/opt/memo/testdata/stats/select index e3951a8dbc7d..4adbc6ddf966 100644 --- a/pkg/sql/opt/memo/testdata/stats/select +++ b/pkg/sql/opt/memo/testdata/stats/select @@ -502,7 +502,7 @@ SELECT * FROM order_history WHERE item_id = order_id AND item_id = customer_id A ---- select ├── columns: order_id:1(int!null) item_id:2(int!null) customer_id:3(int!null) year:4(int) - ├── stats: [rows=0.00204081633, distinct(1)=0.00204081633, distinct(2)=0.00204081633, distinct(3)=0.00204081633] + ├── stats: [rows=1, distinct(1)=1, distinct(2)=1, distinct(3)=1] ├── fd: ()-->(1-3), (1)==(2,3), (2)==(1,3), (3)==(1,2) ├── scan order_history │ ├── columns: order_id:1(int) item_id:2(int) customer_id:3(int) year:4(int) @@ -602,14 +602,12 @@ TABLE uvw # Test selectivity calculations by applying the two constraints in different # orders. -# TODO(radu): applying both constraints at the same time results in much lower -# estimates. 
norm SELECT * FROM uvw WHERE u=v AND u=10 ---- select ├── columns: u:1(int!null) v:2(int!null) w:3(int) - ├── stats: [rows=0.00204081633, distinct(1)=0.00204081633, distinct(2)=0.00204081633] + ├── stats: [rows=1, distinct(1)=1, distinct(2)=1] ├── fd: ()-->(1,2), (1)==(2), (2)==(1) ├── scan uvw │ ├── columns: u:1(int) v:2(int) w:3(int) diff --git a/pkg/sql/opt/props/statistics.go b/pkg/sql/opt/props/statistics.go index b9ad8133969d..bcdd2656a838 100644 --- a/pkg/sql/opt/props/statistics.go +++ b/pkg/sql/opt/props/statistics.go @@ -17,6 +17,7 @@ package props import ( "bytes" "fmt" + "math" "sort" "github.com/cockroachdb/cockroach/pkg/sql/opt" @@ -100,6 +101,38 @@ func (s *Statistics) Init(relProps *Relational) (zeroCardinality bool) { return false } +// ApplySelectivity applies a given selectivity to the statistics. RowCount and +// Selectivity are updated. Note that DistinctCounts are not updated, other than +// limiting them to the new RowCount. See ColumnStatistic.ApplySelectivity for +// updating distinct counts. +func (s *Statistics) ApplySelectivity(selectivity float64) { + if selectivity == 0 { + s.RowCount = 0 + for i := range s.ColStats { + s.ColStats[i].DistinctCount = 0 + } + for i := range s.MultiColStats { + s.MultiColStats[i].DistinctCount = 0 + } + return + } + + s.RowCount *= selectivity + s.Selectivity *= selectivity + + // Make sure none of the distinct counts are larger than the row count. + for _, colStat := range s.ColStats { + if colStat.DistinctCount > s.RowCount { + colStat.DistinctCount = s.RowCount + } + } + for _, colStat := range s.MultiColStats { + if colStat.DistinctCount > s.RowCount { + colStat.DistinctCount = s.RowCount + } + } +} + // ColumnStatistic is a collection of statistics that applies to a particular // set of columns. In theory, a table could have a ColumnStatistic object // for every possible subset of columns. 
In practice, it is only worth @@ -115,6 +148,29 @@ type ColumnStatistic struct { DistinctCount float64 } +// ApplySelectivity updates the distinct count according to a given selectivity. +func (c *ColumnStatistic) ApplySelectivity(selectivity, inputRows float64) { + if selectivity == 1 || c.DistinctCount == 0 { + return + } + if selectivity == 0 { + c.DistinctCount = 0 + return + } + + n := inputRows + d := c.DistinctCount + + // If each distinct value appears n/d times, and the probability of a + // row being filtered out is (1 - selectivity), the probability that all + // n/d rows are filtered out is (1 - selectivity)^(n/d). So the expected + // number of values that are filtered out is d*(1 - selectivity)^(n/d). + // + // This formula returns d * selectivity when d=n but is closer to d + // when d << n. + c.DistinctCount = d - d*math.Pow(1-selectivity, n/d) +} + // ColumnStatistics is a slice of ColumnStatistic values. type ColumnStatistics []ColumnStatistic diff --git a/pkg/sql/opt/xform/testdata/external/tpcc b/pkg/sql/opt/xform/testdata/external/tpcc index aa3ce88a246f..7279a79fb975 100644 --- a/pkg/sql/opt/xform/testdata/external/tpcc +++ b/pkg/sql/opt/xform/testdata/external/tpcc @@ -895,15 +895,15 @@ scalar-group-by ├── columns: count:28(int) ├── cardinality: [1 - 1] ├── stats: [rows=1] - ├── cost: 0.0100267051 + ├── cost: 0.17464683 ├── key: () ├── fd: ()-->(28) ├── prune: (28) ├── inner-join (lookup stock) │ ├── columns: ol_o_id:1(int!null) ol_d_id:2(int!null) ol_w_id:3(int!null) ol_i_id:5(int!null) s_i_id:11(int!null) s_w_id:12(int!null) s_quantity:13(int!null) │ ├── key columns: [3 5] = [12 11] - │ ├── stats: [rows=4.73185689e-06, distinct(3)=4.08163265e-06, distinct(5)=4.0816376e-06, distinct(11)=4.0816376e-06, distinct(12)=4.08163265e-06] - │ ├── cost: 2.66578201e-05 + │ ├── stats: [rows=0.136054422, distinct(3)=4.08163265e-06, distinct(5)=4.0816376e-06, distinct(11)=4.0816376e-06, distinct(12)=4.08163265e-06] + │ ├── cost: 0.163286286 │ ├── fd: 
()-->(2,3,12), (11)-->(13), (5)==(11), (11)==(5), (3)==(12), (12)==(3) │ ├── interesting orderings: (+3,+2,-1) │ ├── scan order_line