Skip to content

Commit

Permalink
Merge #28892
Browse files Browse the repository at this point in the history
28892: opt: reorganize selectivity calculations r=RaduBerinde a=RaduBerinde

#### opt: add disable flag to opttester

Add a flag to norm/opt that disables specific rules.

Release note: None

#### opt: add testcases showing stats problem

Release note: None

#### opt: reorganize selectivity calculations

Fixing a few issues with selectivity calculations, related to the
handling of constraints in conjunction with equivalencies. In
principle, applying the filter `a=10 AND a=b` should result in stats
similar to applying `a=10` and then applying `a=b`, but this was not
the case.

Changes:
 - We were updating the distinct counts to take into account the
   equivalencies before we estimated selectivity. This can lead to the
   selectivity of some constraints effectively being applied twice.
   The call to applyEquivalencies is moved after the selectivity
   calculations.

 - When calculating selectivity from distinct counts we were taking
   into account equivalencies, even though we later apply the
   selectivity of equivalencies separately. The fix is to only look at
   the distinct count of each column, rather than taking the minimum
   across the equivalency group.

 - When calculating selectivity for equivalency, we were looking
   at the "input" distinct counts. But if some distinct counts were
   updated by a constraint, we want to use the updated values; so the
   code now checks `ColStats` first. An example with some intuition
   for this: when applying a filter `a=10 AND a=b`, the behavior
   should be the same with first applying filter `a=10` and then
   applying `a=b`. So the "input" to the equivalency calculation needs
   to reflect `a=10`.

 - Finally, another difference when handling a conjunction is that we
   only calculate RowCount and bound the distinct counts at the end.
   We now update the RowCount "in step" with Selectivity; in addition,
   RowCount is used as an upper bound when consulting "input" stats.

Release note: None


Co-authored-by: Radu Berinde <[email protected]>
  • Loading branch information
craig[bot] and RaduBerinde committed Aug 22, 2018
2 parents e62da97 + eb7d824 commit f1ae4cb
Show file tree
Hide file tree
Showing 8 changed files with 348 additions and 183 deletions.
213 changes: 70 additions & 143 deletions pkg/sql/opt/memo/statistics_builder.go

Large diffs are not rendered by default.

11 changes: 4 additions & 7 deletions pkg/sql/opt/memo/statistics_builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,13 +128,10 @@ func TestGetStatsFromConstraint(t *testing.T) {
// Calculate distinct counts.
numUnappliedConstraints := sb.applyConstraintSet(cs, ev, relProps)

// Calculate selectivity.
s.Selectivity *= sb.selectivityFromDistinctCounts(cols, ev, relProps)
s.Selectivity *= sb.selectivityFromUnappliedConstraints(numUnappliedConstraints)

// Calculate row count.
inputRows := mem.GroupProperties(scanGroup).Relational.Stats.RowCount
sb.applySelectivity(inputRows, s)
// Calculate row count and selectivity.
s.RowCount = mem.GroupProperties(scanGroup).Relational.Stats.RowCount
s.ApplySelectivity(sb.selectivityFromDistinctCounts(cols, ev, s))
s.ApplySelectivity(sb.selectivityFromUnappliedConstraints(numUnappliedConstraints))

// Check if the statistics match the expected value.
testStats(t, s, expectedStats, expectedSelectivity)
Expand Down
78 changes: 78 additions & 0 deletions pkg/sql/opt/memo/testdata/stats/join
Original file line number Diff line number Diff line change
Expand Up @@ -689,3 +689,81 @@ inner-join (merge)
├── right ordering: +5
└── filters [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ]), fd=(1)==(5), (5)==(1)]
└── a = e [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ])]

exec-ddl
CREATE TABLE uvw (u INT, v INT, w INT)
----
TABLE uvw
├── u int
├── v int
├── w int
├── rowid int not null (hidden)
└── INDEX primary
└── rowid int not null (hidden)

exec-ddl
CREATE TABLE xyz (x INT, y INT, z INT)
----
TABLE xyz
├── x int
├── y int
├── z int
├── rowid int not null (hidden)
└── INDEX primary
└── rowid int not null (hidden)

# Verify that two equivalent formulations of a join lead to similar statistics.
# In the first case, x=10 is pushed down; in the second case it is part of the
# ON condition. The latter formulation happens in practice when we convert to
# lookup join (we incorporate the filter back into the ON condition).

norm disable=(PushFilterIntoJoinLeftAndRight,PushFilterIntoJoinLeft,PushFilterIntoJoinRight,MapFilterIntoJoinLeft,MapFilterIntoJoinRight)
SELECT * FROM (SELECT * FROM uvw WHERE w=1) JOIN (SELECT * FROM xyz WHERE x=10) ON u=x
----
inner-join
├── columns: u:1(int!null) v:2(int) w:3(int!null) x:5(int!null) y:6(int) z:7(int)
├── stats: [rows=1.429009, distinct(1)=1, distinct(5)=1]
├── fd: ()-->(1,3,5), (1)==(5), (5)==(1)
├── select
│ ├── columns: u:1(int) v:2(int) w:3(int!null)
│ ├── stats: [rows=1.42857143, distinct(1)=1.42813399, distinct(3)=1]
│ ├── fd: ()-->(3)
│ ├── scan uvw
│ │ ├── columns: u:1(int) v:2(int) w:3(int)
│ │ └── stats: [rows=1000, distinct(1)=700, distinct(3)=700]
│ └── filters [type=bool, outer=(3), constraints=(/3: [/1 - /1]; tight), fd=()-->(3)]
│ └── w = 1 [type=bool, outer=(3), constraints=(/3: [/1 - /1]; tight)]
├── select
│ ├── columns: x:5(int!null) y:6(int) z:7(int)
│ ├── stats: [rows=1.42857143, distinct(5)=1]
│ ├── fd: ()-->(5)
│ ├── scan xyz
│ │ ├── columns: x:5(int) y:6(int) z:7(int)
│ │ └── stats: [rows=1000, distinct(5)=700]
│ └── filters [type=bool, outer=(5), constraints=(/5: [/10 - /10]; tight), fd=()-->(5)]
│ └── x = 10 [type=bool, outer=(5), constraints=(/5: [/10 - /10]; tight)]
└── filters [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ]), fd=(1)==(5), (5)==(1)]
└── u = x [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ])]

norm disable=(PushFilterIntoJoinLeftAndRight,PushFilterIntoJoinLeft,PushFilterIntoJoinRight,MapFilterIntoJoinLeft,MapFilterIntoJoinRight)
SELECT * FROM (SELECT * FROM uvw WHERE w=1) JOIN xyz ON u=x AND x=10
----
inner-join
├── columns: u:1(int!null) v:2(int) w:3(int!null) x:5(int!null) y:6(int) z:7(int)
├── stats: [rows=1.429009, distinct(1)=1, distinct(5)=1]
├── fd: ()-->(1,3,5), (1)==(5), (5)==(1)
├── select
│ ├── columns: u:1(int) v:2(int) w:3(int!null)
│ ├── stats: [rows=1.42857143, distinct(1)=1.42813399, distinct(3)=1]
│ ├── fd: ()-->(3)
│ ├── scan uvw
│ │ ├── columns: u:1(int) v:2(int) w:3(int)
│ │ └── stats: [rows=1000, distinct(1)=700, distinct(3)=700]
│ └── filters [type=bool, outer=(3), constraints=(/3: [/1 - /1]; tight), fd=()-->(3)]
│ └── w = 1 [type=bool, outer=(3), constraints=(/3: [/1 - /1]; tight)]
├── scan xyz
│ ├── columns: x:5(int) y:6(int) z:7(int)
│ └── stats: [rows=1000, distinct(5)=700]
└── filters [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: [/10 - /10]), fd=()-->(1,5), (1)==(5), (5)==(1)]
├── u = x [type=bool, outer=(1,5), constraints=(/1: (/NULL - ]; /5: (/NULL - ])]
└── x = 10 [type=bool, outer=(5), constraints=(/5: [/10 - /10]; tight)]
67 changes: 66 additions & 1 deletion pkg/sql/opt/memo/testdata/stats/select
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,7 @@ SELECT * FROM order_history WHERE item_id = order_id AND item_id = customer_id A
----
select
├── columns: order_id:1(int!null) item_id:2(int!null) customer_id:3(int!null) year:4(int)
├── stats: [rows=0.00204081633, distinct(1)=0.00204081633, distinct(2)=0.00204081633, distinct(3)=0.00204081633]
├── stats: [rows=1, distinct(1)=1, distinct(2)=1, distinct(3)=1]
├── fd: ()-->(1-3), (1)==(2,3), (2)==(1,3), (3)==(1,2)
├── scan order_history
│ ├── columns: order_id:1(int) item_id:2(int) customer_id:3(int) year:4(int)
Expand Down Expand Up @@ -588,3 +588,68 @@ select
└── filters [type=bool, outer=(1), constraints=(/1: [/0 - /99]; tight)]
├── x >= 0 [type=bool, outer=(1), constraints=(/1: [/0 - ]; tight)]
└── x < 100 [type=bool, outer=(1), constraints=(/1: (/NULL - /99]; tight)]

exec-ddl
CREATE TABLE uvw (u INT, v INT, w INT)
----
TABLE uvw
├── u int
├── v int
├── w int
├── rowid int not null (hidden)
└── INDEX primary
└── rowid int not null (hidden)

# Test selectivity calculations by applying the two constraints in different
# orders.
norm
SELECT * FROM uvw WHERE u=v AND u=10
----
select
├── columns: u:1(int!null) v:2(int!null) w:3(int)
├── stats: [rows=1, distinct(1)=1, distinct(2)=1]
├── fd: ()-->(1,2), (1)==(2), (2)==(1)
├── scan uvw
│ ├── columns: u:1(int) v:2(int) w:3(int)
│ └── stats: [rows=1000, distinct(1)=700, distinct(2)=700]
└── filters [type=bool, outer=(1,2), constraints=(/1: [/10 - /10]; /2: (/NULL - ]), fd=()-->(1,2), (1)==(2), (2)==(1)]
├── u = v [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ])]
└── u = 10 [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight)]

norm disable=MergeSelects
SELECT * FROM (SELECT * FROM uvw WHERE u=10) WHERE u=v
----
select
├── columns: u:1(int!null) v:2(int!null) w:3(int)
├── stats: [rows=1.0003063, distinct(1)=1, distinct(2)=1]
├── fd: ()-->(1,2), (1)==(2), (2)==(1)
├── select
│ ├── columns: u:1(int!null) v:2(int) w:3(int)
│ ├── stats: [rows=1.42857143, distinct(1)=1, distinct(2)=1.42813399]
│ ├── fd: ()-->(1)
│ ├── scan uvw
│ │ ├── columns: u:1(int) v:2(int) w:3(int)
│ │ └── stats: [rows=1000, distinct(1)=700, distinct(2)=700]
│ └── filters [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight), fd=()-->(1)]
│ └── u = 10 [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight)]
└── filters [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ]), fd=(1)==(2), (2)==(1)]
└── u = v [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ])]

norm disable=MergeSelects
SELECT * FROM (SELECT * FROM uvw WHERE u=v) WHERE u=10
----
select
├── columns: u:1(int!null) v:2(int!null) w:3(int)
├── stats: [rows=1, distinct(1)=1]
├── fd: ()-->(1,2), (1)==(2), (2)==(1)
├── select
│ ├── columns: u:1(int!null) v:2(int!null) w:3(int)
│ ├── stats: [rows=1.42857143, distinct(1)=1.42857143, distinct(2)=1.42857143]
│ ├── fd: (1)==(2), (2)==(1)
│ ├── scan uvw
│ │ ├── columns: u:1(int) v:2(int) w:3(int)
│ │ └── stats: [rows=1000, distinct(1)=700, distinct(2)=700]
│ └── filters [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ]), fd=(1)==(2), (2)==(1)]
│ └── u = v [type=bool, outer=(1,2), constraints=(/1: (/NULL - ]; /2: (/NULL - ])]
└── filters [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight), fd=()-->(1)]
└── u = 10 [type=bool, outer=(1), constraints=(/1: [/10 - /10]; tight)]
56 changes: 56 additions & 0 deletions pkg/sql/opt/props/statistics.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package props
import (
"bytes"
"fmt"
"math"
"sort"

"github.com/cockroachdb/cockroach/pkg/sql/opt"
Expand Down Expand Up @@ -100,6 +101,38 @@ func (s *Statistics) Init(relProps *Relational) (zeroCardinality bool) {
return false
}

// ApplySelectivity applies a given selectivity to the statistics. RowCount and
// Selectivity are updated. Note that DistinctCounts are not updated, other than
// limiting them to the new RowCount. See ColumnStatistic.ApplySelectivity for
// updating distinct counts.
func (s *Statistics) ApplySelectivity(selectivity float64) {
	if selectivity == 0 {
		// A zero selectivity filters out every row, so no distinct values
		// can remain in any column statistic.
		s.RowCount = 0
		for i := range s.ColStats {
			s.ColStats[i].DistinctCount = 0
		}
		for i := range s.MultiColStats {
			s.MultiColStats[i].DistinctCount = 0
		}
		return
	}

	s.RowCount *= selectivity
	s.Selectivity *= selectivity

	// Make sure none of the distinct counts are larger than the row count.
	// Iterate by index: ranging by value would mutate a copy of each
	// ColumnStatistic and leave the stored statistics unchanged.
	for i := range s.ColStats {
		if s.ColStats[i].DistinctCount > s.RowCount {
			s.ColStats[i].DistinctCount = s.RowCount
		}
	}
	for i := range s.MultiColStats {
		if s.MultiColStats[i].DistinctCount > s.RowCount {
			s.MultiColStats[i].DistinctCount = s.RowCount
		}
	}
}

// ColumnStatistic is a collection of statistics that applies to a particular
// set of columns. In theory, a table could have a ColumnStatistic object
// for every possible subset of columns. In practice, it is only worth
Expand All @@ -115,6 +148,29 @@ type ColumnStatistic struct {
DistinctCount float64
}

// ApplySelectivity updates the distinct count according to a given selectivity.
func (c *ColumnStatistic) ApplySelectivity(selectivity, inputRows float64) {
	switch {
	case selectivity == 0:
		// Everything is filtered out; no distinct values survive.
		c.DistinctCount = 0
		return
	case selectivity == 1 || c.DistinctCount == 0:
		// Nothing to do: either no rows are removed, or there are no
		// distinct values to reduce.
		return
	}

	// Model each of the d distinct values as appearing in inputRows/d rows.
	// A row survives the filter with probability `selectivity`, so a given
	// distinct value disappears entirely with probability
	// (1 - selectivity)^(inputRows/d). The expected number of surviving
	// distinct values is therefore:
	//
	//   d - d*(1 - selectivity)^(inputRows/d)
	//
	// When d equals inputRows this reduces to d * selectivity, and it
	// approaches d as d becomes much smaller than inputRows.
	survivalExp := inputRows / c.DistinctCount
	removed := c.DistinctCount * math.Pow(1-selectivity, survivalExp)
	c.DistinctCount -= removed
}

// ColumnStatistics is a slice of ColumnStatistic values.
type ColumnStatistics []ColumnStatistic

Expand Down
Loading

0 comments on commit f1ae4cb

Please sign in to comment.