Skip to content

Commit

Permalink
Merge #28892
Browse files Browse the repository at this point in the history
28892: opt: reorganize selectivity calculations r=RaduBerinde a=RaduBerinde

#### opt: add disable flag to opttester

Add a flag to norm/opt that disables specific rules.

Release note: None

#### opt: add testcases showing stats problem

Release note: None

#### opt: reorganize selectivity calculations

Fixing a few issues with selectivity calculations, related to the
handling of constraints in conjunction with equivalencies. In
principle, applying the filter `a=10 AND a=b` should result in stats
similar to applying `a=10` and then applying `a=b`, but this was not
the case.

Changes:
 - We were updating the distinct counts to take into account the
   equivalencies before we estimated selectivity. This can lead to the
   selectivity of some constraints effectively being applied twice.
   The call to applyEquivalencies is moved after the selectivity
   calculations.

 - When calculating selectivity from distinct counts we were taking
   into account equivalencies, even though we later apply the
   selectivity of equivalencies separately. The fix is to only look at
   the distinct count of each column, rather than taking the minimum
   across the equivalency group.

 - When calculating selectivity for equivalency, we were looking
   at the "input" distinct counts. But if some distinct counts were
   updated by a constraint, we want to use the updated values; so the
   code now checks `ColStats` first. An example with some intuition
   for this: when applying a filter `a=10 AND a=b`, the behavior
   should be the same with first applying filter `a=10` and then
   applying `a=b`. So the "input" to the equivalency calculation needs
   to reflect `a=10`.

 - Finally, another difference when handling a conjunction is that we
   only calculate RowCount and bound the distinct counts at the end.
   We now update the RowCount "in step" with Selectivity; in addition,
   RowCount is used as an upper bound when consulting "input" stats.

Release note: None


Co-authored-by: Radu Berinde <[email protected]>
  • Loading branch information
craig[bot] and RaduBerinde committed Aug 22, 2018
2 parents e62da97 + eb7d824 commit f1ae4cb
Show file tree
Hide file tree
Showing 8 changed files with 348 additions and 183 deletions.
213 changes: 70 additions & 143 deletions pkg/sql/opt/memo/statistics_builder.go

Large diffs are not rendered by default.

11 changes: 4 additions & 7 deletions pkg/sql/opt/memo/statistics_builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,13 +128,10 @@ func TestGetStatsFromConstraint(t *testing.T) {
// Calculate distinct counts.
numUnappliedConstraints := sb.applyConstraintSet(cs, ev, relProps)

// Calculate selectivity.
s.Selectivity *= sb.selectivityFromDistinctCounts(cols, ev, relProps)
s.Selectivity *= sb.selectivityFromUnappliedConstraints(numUnappliedConstraints)

// Calculate row count.
inputRows := mem.GroupProperties(scanGroup).Relational.Stats.RowCount
sb.applySelectivity(inputRows, s)
// Calculate row count and selectivity.
s.RowCount = mem.GroupProperties(scanGroup).Relational.Stats.RowCount
s.ApplySelectivity(sb.selectivityFromDistinctCounts(cols, ev, s))
s.ApplySelectivity(sb.selectivityFromUnappliedConstraints(numUnappliedConstraints))

// Check if the statistics match the expected value.
testStats(t, s, expectedStats, expectedSelectivity)
Expand Down
78 changes: 78 additions & 0 deletions pkg/sql/opt/memo/testdata/stats/join
Original file line number Diff line number Diff line change
Expand Up @@ -689,3 +689,81 @@ inner-join (merge)
├── right ordering: +5
└── filters [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ]), fd=(1)==(5), (5)==(1)]
└── a = e [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ])]

exec-ddl
CREATE TABLE uvw (u INT, v INT, w INT)
----
TABLE uvw
├── u int
├── v int
├── w int
├── rowid int not null (hidden)
└── INDEX primary
└── rowid int not null (hidden)

exec-ddl
CREATE TABLE xyz (x INT, y INT, z INT)
----
TABLE xyz
├── x int
├── y int
├── z int
├── rowid int not null (hidden)
└── INDEX primary
└── rowid int not null (hidden)

# Verify that two equivalent formulations of a join lead to similar statistics.
# In the first case, x=10 is pushed down; in the second case it is part of the
# ON condition. The latter formulation happens in practice when we convert to
# lookup join (we incorporate the filter back into the ON condition).

norm disable=(PushFilterIntoJoinLeftAndRight,PushFilterIntoJoinLeft,PushFilterIntoJoinRight,MapFilterIntoJoinLeft,MapFilterIntoJoinRight)
SELECT * FROM (SELECT * FROM uvw WHERE w=1) JOIN (SELECT * FROM xyz WHERE x=10) ON u=x
----
inner-join
├── columns: u:1(int!null) v:2(int) w:3(int!null) x:5(int!null) y:6(int) z:7(int)
├── stats: [rows=1.429009, distinct(1)=1, distinct(5)=1]
├── fd: ()-->(1,3,5), (1)==(5), (5)==(1)
├── select
│ ├── columns: u:1(int) v:2(int) w:3(int!null)
│ ├── stats: [rows=1.42857143, distinct(1)=1.42813399, distinct(3)=1]
│ ├── fd: ()-->(3)
│ ├── scan uvw
│ │ ├── columns: u:1(int) v:2(int) w:3(int)
│ │ └── stats: [rows=1000, distinct(1)=700, distinct(3)=700]
│ └── filters [type=bool, outer=(3), constraints=(/3: [/1 - /1]; tight), fd=()-->(3)]
│ └── w = 1 [type=bool, outer=(3), constraints=(/3: [/1 - /1]; tight)]
├── select
│ ├── columns: x:5(int!null) y:6(int) z:7(int)
│ ├── stats: [rows=1.42857143, distinct(5)=1]
│ ├── fd: ()-->(5)
│ ├── scan xyz
│ │ ├── columns: x:5(int) y:6(int) z:7(int)
│ │ └── stats: [rows=1000, distinct(5)=700]
│ └── filters [type=bool, outer=(5), constraints=(/5: [/10 - /10]; tight), fd=()-->(5)]
│ └── x = 10 [type=bool, outer=(5), constraints=(/5: [/10 - /10]; tight)]
└── filters [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ]), fd=(1)==(5), (5)==(1)]
└── u = x [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ])]

norm disable=(PushFilterIntoJoinLeftAndRight,PushFilterIntoJoinLeft,PushFilterIntoJoinRight,MapFilterIntoJoinLeft,MapFilterIntoJoinRight)
SELECT * FROM (SELECT * FROM uvw WHERE w=1) JOIN xyz ON u=x AND x=10
----
inner-join
├── columns: u:1(int!null) v:2(int) w:3(int!null) x:5(int!null) y:6(int) z:7(int)
├── stats: [rows=1.429009, distinct(1)=1, distinct(5)=1]
├── fd: ()-->(1,3,5), (1)==(5), (5)==(1)
├── select
│ ├── columns: u:1(int) v:2(int) w:3(int!null)
│ ├── stats: [rows=1.42857143, distinct(1)=1.42813399, distinct(3)=1]
│ ├── fd: ()-->(3)
│ ├── scan uvw
│ │ ├── columns: u:1(int) v:2(int) w:3(int)
│ │ └── stats: [rows=1000, distinct(1)=700, distinct(3)=700]
│ └── filters [type=bool, outer=(3), constraints=(/3: [/1 - /1]; tight), fd=()-->(3)]
│ └── w = 1 [type=bool, outer=(3), constraints=(/3: [/1 - /1]; tight)]
├── scan xyz
│ ├── columns: x:5(int) y:6(int) z:7(int)
│ └── stats: [rows=1000, distinct(5)=700]
└── filters [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: [/10 - /10]), fd=()-->(1,5), (1)==(5), (5)==(1)]
├── u = x [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ])]
└── x = 10 [type=bool, outer=(5), constraints=(/5: [/10 - /10]; tight)]
67 changes: 66 additions & 1 deletion pkg/sql/opt/memo/testdata/stats/select
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,7 @@ SELECT * FROM order_history WHERE item_id = order_id AND item_id = customer_id A
----
select
├── columns: order_id:1(int!null) item_id:2(int!null) customer_id:3(int!null) year:4(int)
├── stats: [rows=0.00204081633, distinct(1)=0.00204081633, distinct(2)=0.00204081633, distinct(3)=0.00204081633]
├── stats: [rows=1, distinct(1)=1, distinct(2)=1, distinct(3)=1]
├── fd: ()-->(1-3), (1)==(2,3), (2)==(1,3), (3)==(1,2)
├── scan order_history
│ ├── columns: order_id:1(int) item_id:2(int) customer_id:3(int) year:4(int)
Expand Down Expand Up @@ -588,3 +588,68 @@ select
└── filters [type=bool, outer=(1), constraints=(/1: [/0 - /99]; tight)]
├── x >= 0 [type=bool, outer=(1), constraints=(/1: [/0 - ]; tight)]
└── x < 100 [type=bool, outer=(1), constraints=(/1: (/NULL - /99]; tight)]

exec-ddl
CREATE TABLE uvw (u INT, v INT, w INT)
----
TABLE uvw
├── u int
├── v int
├── w int
├── rowid int not null (hidden)
└── INDEX primary
└── rowid int not null (hidden)

# Test selectivity calculations by applying the two constraints in different
# orders.
norm
SELECT * FROM uvw WHERE u=v AND u=10
----
select
├── columns: u:1(int!null) v:2(int!null) w:3(int)
├── stats: [rows=1, distinct(1)=1, distinct(2)=1]
├── fd: ()-->(1,2), (1)==(2), (2)==(1)
├── scan uvw
│ ├── columns: u:1(int) v:2(int) w:3(int)
│ └── stats: [rows=1000, distinct(1)=700, distinct(2)=700]
└── filters [type=bool, outer=(1,2), constraints=(/1: [/10 - /10]; /2: (/NULL - ]), fd=()-->(1,2), (1)==(2), (2)==(1)]
├── u = v [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ])]
└── u = 10 [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight)]

norm disable=MergeSelects
SELECT * FROM (SELECT * FROM uvw WHERE u=10) WHERE u=v
----
select
├── columns: u:1(int!null) v:2(int!null) w:3(int)
├── stats: [rows=1.0003063, distinct(1)=1, distinct(2)=1]
├── fd: ()-->(1,2), (1)==(2), (2)==(1)
├── select
│ ├── columns: u:1(int!null) v:2(int) w:3(int)
│ ├── stats: [rows=1.42857143, distinct(1)=1, distinct(2)=1.42813399]
│ ├── fd: ()-->(1)
│ ├── scan uvw
│ │ ├── columns: u:1(int) v:2(int) w:3(int)
│ │ └── stats: [rows=1000, distinct(1)=700, distinct(2)=700]
│ └── filters [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight), fd=()-->(1)]
│ └── u = 10 [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight)]
└── filters [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ]), fd=(1)==(2), (2)==(1)]
└── u = v [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ])]

norm disable=MergeSelects
SELECT * FROM (SELECT * FROM uvw WHERE u=v) WHERE u=10
----
select
├── columns: u:1(int!null) v:2(int!null) w:3(int)
├── stats: [rows=1, distinct(1)=1]
├── fd: ()-->(1,2), (1)==(2), (2)==(1)
├── select
│ ├── columns: u:1(int!null) v:2(int!null) w:3(int)
│ ├── stats: [rows=1.42857143, distinct(1)=1.42857143, distinct(2)=1.42857143]
│ ├── fd: (1)==(2), (2)==(1)
│ ├── scan uvw
│ │ ├── columns: u:1(int) v:2(int) w:3(int)
│ │ └── stats: [rows=1000, distinct(1)=700, distinct(2)=700]
│ └── filters [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ]), fd=(1)==(2), (2)==(1)]
│ └── u = v [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ])]
└── filters [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight), fd=()-->(1)]
└── u = 10 [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight)]
56 changes: 56 additions & 0 deletions pkg/sql/opt/props/statistics.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package props
import (
"bytes"
"fmt"
"math"
"sort"

"github.com/cockroachdb/cockroach/pkg/sql/opt"
Expand Down Expand Up @@ -100,6 +101,38 @@ func (s *Statistics) Init(relProps *Relational) (zeroCardinality bool) {
return false
}

// ApplySelectivity applies a given selectivity to the statistics. RowCount and
// Selectivity are updated. Note that DistinctCounts are not updated, other than
// limiting them to the new RowCount. See ColumnStatistic.ApplySelectivity for
// updating distinct counts.
func (s *Statistics) ApplySelectivity(selectivity float64) {
	if selectivity == 0 {
		// A zero selectivity filters out every row, so no distinct values
		// can remain in any column statistic.
		s.RowCount = 0
		for i := range s.ColStats {
			s.ColStats[i].DistinctCount = 0
		}
		for i := range s.MultiColStats {
			s.MultiColStats[i].DistinctCount = 0
		}
		return
	}

	s.RowCount *= selectivity
	s.Selectivity *= selectivity

	// Make sure none of the distinct counts are larger than the row count.
	// Iterate by index: ranging by value would mutate a copy of each
	// ColumnStatistic and leave the stored statistics unchanged.
	for i := range s.ColStats {
		if s.ColStats[i].DistinctCount > s.RowCount {
			s.ColStats[i].DistinctCount = s.RowCount
		}
	}
	for i := range s.MultiColStats {
		if s.MultiColStats[i].DistinctCount > s.RowCount {
			s.MultiColStats[i].DistinctCount = s.RowCount
		}
	}
}

// ColumnStatistic is a collection of statistics that applies to a particular
// set of columns. In theory, a table could have a ColumnStatistic object
// for every possible subset of columns. In practice, it is only worth
Expand All @@ -115,6 +148,29 @@ type ColumnStatistic struct {
DistinctCount float64
}

// ApplySelectivity updates the distinct count according to a given selectivity.
func (c *ColumnStatistic) ApplySelectivity(selectivity, inputRows float64) {
	switch {
	case selectivity == 0:
		// Everything is filtered out; no distinct values survive.
		c.DistinctCount = 0
		return
	case selectivity == 1 || c.DistinctCount == 0:
		// Nothing to do: either no rows are removed, or there are no
		// distinct values to reduce.
		return
	}

	// Model each of the d distinct values as appearing in inputRows/d rows.
	// A row survives the filter with probability `selectivity`, so a given
	// distinct value disappears entirely with probability
	// (1 - selectivity)^(inputRows/d). The expected number of surviving
	// distinct values is therefore:
	//
	//   d - d*(1 - selectivity)^(inputRows/d)
	//
	// When d equals inputRows this reduces to d * selectivity, and it
	// approaches d as d becomes much smaller than inputRows.
	survivalExp := inputRows / c.DistinctCount
	removed := c.DistinctCount * math.Pow(1-selectivity, survivalExp)
	c.DistinctCount -= removed
}

// ColumnStatistics is a slice of ColumnStatistic values.
type ColumnStatistics []ColumnStatistic

Expand Down
Loading

0 comments on commit f1ae4cb

Please sign in to comment.