opt: create a special type for selectivity to clean up stats code
Previously, the selectivity of a filter was represented as a plain float64.
Several places in the statistics code performed range checks to ensure that
selectivity stayed within the range (0, 1].

This change cleans up the statistics code by introducing a Selectivity type
whose methods replace the direct numerical operations and incorporate the
range check, so that every operation on a selectivity returns a valid value.

As a result, some values in the test files change slightly; the query plans
and performance mostly stay the same.

Resolves: cockroachdb#53860

Release note: None
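
For orientation before the diff, here is a minimal sketch of what a
float64-backed props.Selectivity type along these lines could look like. It is
inferred from how the diff uses the type (props.MakeSelectivity, Multiply,
SelectivityInRange, and direct conversions to and from float64); the exact
method set, clamping rules, and the epsilon constant are illustrative
assumptions, not the code from this commit.

package props

// epsilon is a small lower bound used instead of zero so that stale or
// missing statistics never produce an estimate of exactly zero rows.
// (Illustrative constant; the real packages define their own.)
const epsilon = 1e-10

// Selectivity is the fraction of rows estimated to be preserved by a
// filter, kept in the range (0, 1].
type Selectivity float64

// MakeSelectivity constructs a Selectivity from a raw float64, clamping
// it into the valid range.
func MakeSelectivity(sel float64) Selectivity {
	return Selectivity(sel).SelectivityInRange()
}

// SelectivityInRange clamps the value into (0, 1]: non-positive values
// become epsilon and values above one become one.
func (s Selectivity) SelectivityInRange() Selectivity {
	switch {
	case s <= 0:
		return epsilon
	case s > 1:
		return 1
	default:
		return s
	}
}

// Multiply returns the product of two selectivities, clamped into range.
func (s Selectivity) Multiply(other Selectivity) Selectivity {
	return (s * other).SelectivityInRange()
}

With a type like this, expressions in the diff such as
selectivity = selectivity.Multiply(localSel) and
s.Selectivity = minSelectivity(s.Selectivity, spanStatsUnion.Selectivity)
keep every intermediate result inside (0, 1], which is why the explicit
max(selectivity, epsilon) calls could be dropped.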
angelazxu committed Jan 26, 2021
1 parent 7b0ccdd commit d5d8d51
Showing 43 changed files with 1,593 additions and 1,560 deletions.
4 changes: 2 additions & 2 deletions pkg/sql/opt/exec/execbuilder/testdata/inverted_index
@@ -955,7 +955,7 @@ inner-join (lookup geo_table)
│ ├── columns: geo_table2.k:1 geo_table2.geom:2 geo_table.k:5
│ ├── inverted-expr
│ │ └── st_intersects(geo_table2.geom:2, geo_table.geom:6)
│ ├── stats: [rows=10000, distinct(1)=999.956829, null(1)=0, distinct(5)=999.956829, null(5)=0]
│ ├── stats: [rows=10000, distinct(1)=1000, null(1)=0, distinct(5)=1000, null(5)=0]
│ ├── cost: 41784.03
│ ├── key: (1,5)
│ ├── fd: (1)-->(2)
@@ -1075,7 +1075,7 @@ semi-join (lookup geo_table)
│ ├── columns: geo_table2.k:1 geo_table2.geom:2 geo_table.k:5 continuation:11
│ ├── inverted-expr
│ │ └── st_intersects(geo_table2.geom:2, geo_table.geom:6)
│ ├── stats: [rows=10000, distinct(1)=999.956829, null(1)=0]
│ ├── stats: [rows=10000, distinct(1)=1000, null(1)=0]
│ ├── cost: 41984.03
│ ├── key: (1,5)
│ ├── fd: (1)-->(2), (5)-->(11)
210 changes: 105 additions & 105 deletions pkg/sql/opt/exec/execbuilder/testdata/join

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pkg/sql/opt/exec/execbuilder/testdata/lookup_join
@@ -315,7 +315,7 @@ vectorized: true
·
• distinct
│ columns: (name)
│ estimated row count: 96
│ estimated row count: 100
│ distinct on: name
└── • project
107 changes: 57 additions & 50 deletions pkg/sql/opt/memo/statistics_builder.go
@@ -702,7 +702,7 @@ func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relationa
// calculate selectivity = 1/9 + 1/9 + 1/9 = 1/3 in spanStatsUnion, which
// is too high. Instead, we should use the value calculated from the
// combined spans, which in this case is simply 1/9.
s.Selectivity = min(s.Selectivity, spanStatsUnion.Selectivity)
s.Selectivity = minSelectivity(s.Selectivity, spanStatsUnion.Selectivity)
s.RowCount = min(s.RowCount, spanStatsUnion.RowCount)

sb.finalizeFromCardinality(relProps)
@@ -885,7 +885,7 @@ func (sb *statisticsBuilder) colStatSelect(
// filter conditions were pushed down into the input after s.Selectivity
// was calculated. For example, an index scan or index join created during
// exploration could absorb some of the filter conditions.
selectivity := s.RowCount / inputStats.RowCount
selectivity := props.Selectivity(s.RowCount / inputStats.RowCount)
colStat.ApplySelectivity(selectivity, inputStats.RowCount)
if colSet.Intersects(relProps.NotNullCols) {
colStat.NullCount = 0
@@ -1160,7 +1160,7 @@ func (sb *statisticsBuilder) buildJoin(
// This is like an index join, so apply a selectivity that will result
// in leftStats.RowCount rows.
if rightStats.RowCount != 0 {
s.ApplySelectivity(1 / rightStats.RowCount)
s.ApplySelectivity(props.Selectivity(1 / rightStats.RowCount))
}
} else {
// Add the self join columns to equivReps so they are included in the
@@ -1238,7 +1238,7 @@ func (sb *statisticsBuilder) buildJoin(
switch h.joinType {
case opt.AntiJoinOp, opt.AntiJoinApplyOp:
s.RowCount = max(leftStats.RowCount-s.RowCount, epsilon)
s.Selectivity = max(1-s.Selectivity, epsilon)
s.Selectivity = (1-s.Selectivity).SelectivityInRange()

// Converting column stats is error-prone. If any column stats are needed,
// colStatJoin will use the selectivity calculated above to estimate the
@@ -1421,7 +1421,7 @@ func (sb *statisticsBuilder) colStatJoin(colSet opt.ColSet, join RelExpr) *props
rightNullCount,
rightProps.Stats.RowCount,
s.RowCount,
s.Selectivity*inputRowCount,
float64(s.Selectivity)*inputRowCount,
)

// Ensure distinct count is non-zero.
@@ -1594,7 +1594,7 @@ func (sb *statisticsBuilder) colStatIndexJoin(
// of any filters on the input.
inputStats := &inputProps.Stats
tableStats := sb.makeTableStatistics(join.Table)
selectivity := inputStats.RowCount / tableStats.RowCount
selectivity := props.Selectivity(inputStats.RowCount / tableStats.RowCount)
lookupColStat.ApplySelectivity(selectivity, tableStats.RowCount)

// Multiply the distinct counts in case colStat.DistinctCount is
@@ -1996,7 +1996,7 @@ func (sb *statisticsBuilder) buildLimit(limit *LimitExpr, relProps *props.Relati
hardLimit := *cnst.Value.(*tree.DInt)
if hardLimit > 0 {
s.RowCount = min(float64(hardLimit), inputStats.RowCount)
s.Selectivity = s.RowCount / inputStats.RowCount
s.Selectivity = props.Selectivity(s.RowCount / inputStats.RowCount)
}
}

@@ -2049,7 +2049,7 @@ func (sb *statisticsBuilder) buildOffset(offset *OffsetExpr, relProps *props.Rel
} else if hardOffset > 0 {
s.RowCount = inputStats.RowCount - float64(hardOffset)
}
s.Selectivity = s.RowCount / inputStats.RowCount
s.Selectivity = props.Selectivity(s.RowCount / inputStats.RowCount)
}

sb.finalizeFromCardinality(relProps)
@@ -2577,7 +2577,7 @@ func (sb *statisticsBuilder) finalizeFromRowCountAndDistinctCounts(
if colStat.Histogram != nil {
valuesCount := colStat.Histogram.ValuesCount()
if valuesCount > rowCount {
colStat.Histogram = colStat.Histogram.ApplySelectivity(rowCount / valuesCount)
colStat.Histogram = colStat.Histogram.ApplySelectivity(props.Selectivity(rowCount / valuesCount))
}
}
}
@@ -2821,6 +2821,13 @@ const (
multiColWeight = 9.0 / 10.0
)

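// minSelectivity returns the smaller of the two given selectivities.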
func minSelectivity(a, b props.Selectivity) props.Selectivity {
if a < b {
return a
}
return b
}

// countPaths returns the number of JSON or Array paths in the specified
// FiltersItem. Used in the calculation of unapplied conjuncts in a
// Contains operator. Returns 0 if paths could not be counted for any
@@ -3491,7 +3498,7 @@ func (sb *statisticsBuilder) updateDistinctNullCountsFromEquivalency(
//
func (sb *statisticsBuilder) selectivityFromMultiColDistinctCounts(
cols opt.ColSet, e RelExpr, s *props.Statistics,
) (selectivity float64) {
) (selectivity props.Selectivity) {
// Respect the session setting OptimizerUseMultiColStats.
if !sb.evalCtx.SessionData.OptimizerUseMultiColStats {
return sb.selectivityFromSingleColDistinctCounts(cols, e, s)
@@ -3502,11 +3509,11 @@ func (sb *statisticsBuilder) selectivityFromMultiColDistinctCounts(

// First calculate the selectivity from equation (1) (see function comment),
// and collect the inputs to equation (2).
singleColSelectivity := 1.0
singleColSelectivity := props.Selectivity(1.0)
newDistinctProduct, oldDistinctProduct := 1.0, 1.0
maxNewDistinct, maxOldDistinct := float64(0), float64(0)
multiColNullCount := -1.0
minLocalSel := math.MaxFloat64
minLocalSel := props.Selectivity(math.MaxFloat64)
for col, ok := cols.Next(0); ok; col, ok = cols.Next(col + 1) {
colStat, ok := s.ColStats.Lookup(opt.MakeColSet(col))
if !ok {
@@ -3516,7 +3523,7 @@

inputColStat, inputStats := sb.colStatFromInput(colStat.Cols, e)
localSel := sb.selectivityFromDistinctCount(colStat, inputColStat, inputStats.RowCount)
singleColSelectivity *= localSel
singleColSelectivity = singleColSelectivity.Multiply(localSel)

// Don't bother including columns in the multi-column calculation that
// don't contribute to the selectivity.
@@ -3606,7 +3613,7 @@ func (sb *statisticsBuilder) selectivityFromMultiColDistinctCounts(
}
}
}
multiColSelectivity = min(multiColSelectivity, minLocalSel)
multiColSelectivity = minSelectivity(multiColSelectivity, minLocalSel)

// As described in the function comment, we actually return a weighted sum
// of multi-column and single-column selectivity estimates.
@@ -3620,7 +3627,7 @@
// comment above that function for details.
func (sb *statisticsBuilder) selectivityFromSingleColDistinctCounts(
cols opt.ColSet, e RelExpr, s *props.Statistics,
) (selectivity float64) {
) (selectivity props.Selectivity) {
selectivity = 1.0
for col, ok := cols.Next(0); ok; col, ok = cols.Next(col + 1) {
colStat, ok := s.ColStats.Lookup(opt.MakeColSet(col))
@@ -3629,20 +3636,20 @@
}

inputColStat, inputStats := sb.colStatFromInput(colStat.Cols, e)
selectivity *= sb.selectivityFromDistinctCount(colStat, inputColStat, inputStats.RowCount)
selectivity = selectivity.Multiply(sb.selectivityFromDistinctCount(colStat, inputColStat, inputStats.RowCount))
}

// Avoid setting selectivity to 0. The stats may be stale, and we
// can end up with weird and inefficient plans if we estimate 0 rows.
return max(selectivity, epsilon)
return selectivity
}

// selectivityFromDistinctCount calculates the selectivity of a filter by using
// the estimated distinct count of a single constrained column or set of
// columns before and after the filter was applied.
func (sb *statisticsBuilder) selectivityFromDistinctCount(
colStat, inputColStat *props.ColumnStatistic, inputRowCount float64,
) float64 {
) props.Selectivity {
newDistinct := colStat.DistinctCount
oldDistinct := inputColStat.DistinctCount

@@ -3656,8 +3663,8 @@ func (sb *statisticsBuilder) selectivityFromDistinctCount(
}

// Calculate the selectivity of the predicate.
nonNullSelectivity := fraction(newDistinct, oldDistinct)
nullSelectivity := fraction(colStat.NullCount, inputColStat.NullCount)
nonNullSelectivity := props.Selectivity(fraction(newDistinct, oldDistinct))
nullSelectivity := props.Selectivity(fraction(colStat.NullCount, inputColStat.NullCount))
return sb.predicateSelectivity(
nonNullSelectivity, nullSelectivity, inputColStat.NullCount, inputRowCount,
)
@@ -3671,7 +3678,7 @@ func (sb *statisticsBuilder) selectivityFromDistinctCount(
// (# values in histogram after filter) / (# values in histogram before filter).
func (sb *statisticsBuilder) selectivityFromHistograms(
cols opt.ColSet, e RelExpr, s *props.Statistics,
) (selectivity float64) {
) (selectivity props.Selectivity) {
selectivity = 1.0
for col, ok := cols.Next(0); ok; col, ok = cols.Next(col + 1) {
colStat, ok := s.ColStats.Lookup(opt.MakeColSet(col))
@@ -3690,16 +3697,16 @@
oldCount := oldHist.ValuesCount()

// Calculate the selectivity of the predicate.
nonNullSelectivity := fraction(newCount, oldCount)
nullSelectivity := fraction(colStat.NullCount, inputColStat.NullCount)
selectivity *= sb.predicateSelectivity(
nonNullSelectivity := props.Selectivity(fraction(newCount, oldCount))
nullSelectivity := props.Selectivity(fraction(colStat.NullCount, inputColStat.NullCount))
selectivity = selectivity.Multiply(sb.predicateSelectivity(
nonNullSelectivity, nullSelectivity, inputColStat.NullCount, inputStats.RowCount,
)
))
}

// Avoid setting selectivity to 0. The stats may be stale, and we
// can end up with weird and inefficient plans if we estimate 0 rows.
return max(selectivity, epsilon)
return selectivity
}

// selectivityFromNullsRemoved calculates the selectivity from null-rejecting
@@ -3708,23 +3715,23 @@ func (sb *statisticsBuilder) selectivityFromHistograms(
// should be designated by ignoreCols.
func (sb *statisticsBuilder) selectivityFromNullsRemoved(
e RelExpr, notNullCols opt.ColSet, ignoreCols opt.ColSet,
) (selectivity float64) {
) (selectivity props.Selectivity) {
selectivity = 1.0
notNullCols.ForEach(func(col opt.ColumnID) {
if !ignoreCols.Contains(col) {
inputColStat, inputStats := sb.colStatFromInput(opt.MakeColSet(col), e)
selectivity *= sb.predicateSelectivity(
selectivity = selectivity.Multiply(sb.predicateSelectivity(
1, /* nonNullSelectivity */
0, /* nullSelectivity */
inputColStat.NullCount,
inputStats.RowCount,
)
))
}
})

// Avoid setting selectivity to 0. The stats may be stale, and we
// can end up with weird and inefficient plans if we estimate 0 rows.
return max(selectivity, epsilon)
return selectivity
}

// predicateSelectivity calculates the selectivity of a predicate, using the
@@ -3739,35 +3746,35 @@ func (sb *statisticsBuilder) selectivityFromNullsRemoved(
// (fraction of null values preserved) * (number of null input rows)
//
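// Worked example (illustrative numbers, not from this commit): with
// nonNullSelectivity=0.5, nullSelectivity=1, inputNullCount=10, and
// inputRowCount=100, the output row count is 0.5*90 + 1*10 = 55, giving a
// selectivity of 55/100 = 0.55.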
func (sb *statisticsBuilder) predicateSelectivity(
nonNullSelectivity, nullSelectivity, inputNullCount, inputRowCount float64,
) float64 {
outRowCount := nonNullSelectivity*(inputRowCount-inputNullCount) + nullSelectivity*inputNullCount
sel := outRowCount / inputRowCount
nonNullSelectivity, nullSelectivity props.Selectivity, inputNullCount, inputRowCount float64,
) props.Selectivity {
outRowCount := float64(nonNullSelectivity)*(inputRowCount-inputNullCount) + float64(nullSelectivity)*inputNullCount
sel := props.MakeSelectivity(outRowCount / inputRowCount)

// Avoid setting selectivity to 0. The stats may be stale, and we
// can end up with weird and inefficient plans if we estimate 0 rows.
return max(sel, epsilon)
return sel
}

// selectivityFromEquivalencies determines the selectivity of equality
// constraints. It must be called before applyEquivalencies.
func (sb *statisticsBuilder) selectivityFromEquivalencies(
equivReps opt.ColSet, filterFD *props.FuncDepSet, e RelExpr, s *props.Statistics,
) (selectivity float64) {
) (selectivity props.Selectivity) {
selectivity = 1.0
equivReps.ForEach(func(i opt.ColumnID) {
equivGroup := filterFD.ComputeEquivGroup(i)
selectivity *= sb.selectivityFromEquivalency(equivGroup, e, s)
selectivity = selectivity.Multiply(sb.selectivityFromEquivalency(equivGroup, e, s))
})

// Avoid setting selectivity to 0. The stats may be stale, and we
// can end up with weird and inefficient plans if we estimate 0 rows.
return max(selectivity, epsilon)
return selectivity
}

func (sb *statisticsBuilder) selectivityFromEquivalency(
equivGroup opt.ColSet, e RelExpr, s *props.Statistics,
) (selectivity float64) {
) (selectivity props.Selectivity) {
// Find the maximum input distinct count for all columns in this equivalency
// group.
maxDistinctCount := float64(0)
@@ -3789,7 +3796,7 @@ func (sb *statisticsBuilder) selectivityFromEquivalency(

// The selectivity of an equality condition var1=var2 is
// 1/max(distinct(var1), distinct(var2)).
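// For example (illustrative numbers): if distinct(var1)=10 and
// distinct(var2)=1000 on the input, the estimated selectivity of
// var1=var2 is 1/max(10, 1000) = 0.001.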
return fraction(1, maxDistinctCount)
return props.Selectivity(fraction(1, maxDistinctCount))
}

// selectivityFromEquivalenciesSemiJoin determines the selectivity of equality
Expand All @@ -3799,23 +3806,23 @@ func (sb *statisticsBuilder) selectivityFromEquivalenciesSemiJoin(
filterFD *props.FuncDepSet,
e RelExpr,
s *props.Statistics,
) (selectivity float64) {
) (selectivity props.Selectivity) {
selectivity = 1.0
equivReps.ForEach(func(i opt.ColumnID) {
equivGroup := filterFD.ComputeEquivGroup(i)
selectivity *= sb.selectivityFromEquivalencySemiJoin(
selectivity = selectivity.Multiply(sb.selectivityFromEquivalencySemiJoin(
equivGroup, leftOutputCols, rightOutputCols, e, s,
)
))
})

// Avoid setting selectivity to 0. The stats may be stale, and we
// can end up with weird and inefficient plans if we estimate 0 rows.
return max(selectivity, epsilon)
return selectivity
}

func (sb *statisticsBuilder) selectivityFromEquivalencySemiJoin(
equivGroup, leftOutputCols, rightOutputCols opt.ColSet, e RelExpr, s *props.Statistics,
) (selectivity float64) {
) (selectivity props.Selectivity) {
// Find the minimum (maximum) input distinct count for all columns in this
// equivalency group from the right (left).
minDistinctCountRight := math.MaxFloat64
@@ -3842,23 +3849,23 @@ func (sb *statisticsBuilder) selectivityFromEquivalencySemiJoin(
maxDistinctCountLeft = s.RowCount
}

return fraction(minDistinctCountRight, maxDistinctCountLeft)
return props.Selectivity(fraction(minDistinctCountRight, maxDistinctCountLeft))
}

func (sb *statisticsBuilder) selectivityFromInvertedJoinCondition(
e RelExpr, s *props.Statistics,
) (selectivity float64) {
) (selectivity props.Selectivity) {
return unknownInvertedJoinSelectivity
}

func (sb *statisticsBuilder) selectivityFromUnappliedConjuncts(
numUnappliedConjuncts float64,
) (selectivity float64) {
selectivity = math.Pow(unknownFilterSelectivity, numUnappliedConjuncts)
) (selectivity props.Selectivity) {
selectivity = props.MakeSelectivity(math.Pow(unknownFilterSelectivity, numUnappliedConjuncts))

// Avoid setting selectivity to 0. The stats may be stale, and we
// can end up with weird and inefficient plans if we estimate 0 rows.
return max(selectivity, epsilon)
return selectivity
}

// tryReduceCols is used to determine which columns to use for selectivity
4 changes: 2 additions & 2 deletions pkg/sql/opt/memo/testdata/stats/inverted-geo
@@ -183,7 +183,7 @@ project
│ │ ├── columns: rowid:3(int!null) g_inverted_key:5(geometry!null)
│ │ ├── inverted constraint: /5/3
│ │ │ └── spans: ["B\xfd\xff\xff\xff\xff\xff\xff\xff\xff", "B\xfd\xff\xff\xff\xff\xff\xff\xff\xff"]
│ │ ├── stats: [rows=7e-07, distinct(3)=1.99999931e-07, null(3)=0, distinct(5)=7e-07, null(5)=0]
│ │ ├── stats: [rows=7e-07, distinct(3)=7e-07, null(3)=0, distinct(5)=7e-07, null(5)=0]
│ │ │ histogram(5)=
│ │ ├── key: (3)
│ │ └── fd: (3)-->(5)
@@ -402,7 +402,7 @@ project
│ │ ├── columns: rowid:3(int!null) g_inverted_key:5(geometry!null)
│ │ ├── inverted constraint: /5/3
│ │ │ └── spans: ["B\xfd\xff\xff\xff\xff\xff\xff\xff\xff", "B\xfd\xff\xff\xff\xff\xff\xff\xff\xff"]
│ │ ├── stats: [rows=7e-07, distinct(3)=1.99999931e-07, null(3)=0, distinct(5)=7e-07, null(5)=0]
│ │ ├── stats: [rows=7e-07, distinct(3)=7e-07, null(3)=0, distinct(5)=7e-07, null(5)=0]
│ │ │ histogram(5)=
│ │ ├── key: (3)
│ │ └── fd: (3)-->(5)