Skip to content

Commit

Permalink
opt: improve variable names in selectivityFromMultiColDistinctCounts
Browse files Browse the repository at this point in the history
This commit improves the variable names in
selectivityFromMultiColDistinctCounts in statisticsBuilder to be more
self-documenting.

Release note: None
  • Loading branch information
rytaft committed Apr 26, 2024
1 parent 7e1397e commit 30e7aab
Showing 1 changed file with 55 additions and 48 deletions.
103 changes: 55 additions & 48 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -3748,13 +3748,13 @@ func (sb *statisticsBuilder) updateDistinctNullCountsFromEquivalency(
// 3 or 4. We estimate the new distinct count as follows, using the concept
// of "soft functional dependency (FD) strength" as defined in [1]:
//
// new_distinct({x,y}) = min_value + range * (1 - FD_strength_scaled)
// new_distinct({x,y}) = min_distinct + distinct_range * (1 - FD_strength_scaled)
//
// where
//
// min_value = max(new_distinct(x), new_distinct(y))
// max_value = new_distinct(x) * new_distinct(y)
// range = max_value - min_value
// min_distinct = max(new_distinct(x), new_distinct(y))
// max_distinct = new_distinct(x) * new_distinct(y)
// distinct_range = max_distinct - min_distinct
//
// ⎛ max(old_distinct(x),old_distinct(y)) ⎞
// FD_strength = ⎜ ------------------------------------ ⎟
Expand Down Expand Up @@ -3802,7 +3802,7 @@ func (sb *statisticsBuilder) updateDistinctNullCountsFromEquivalency(
// completely correlated. It is equal to the minimum selectivity of any single
// column. selectivityLowerBound would be the value of equation (2) if the
// columns were completely independent. It is the minimum new distinct count
// (min_value) divided by the product of the old single column distinct counts
// (min_distinct) divided by the product of the old single column distinct counts
// (or the old row count, whichever is smaller). These values will be used later
// to measure the level of correlation.
//
Expand All @@ -3823,81 +3823,87 @@ func (sb *statisticsBuilder) selectivityFromMultiColDistinctCounts(

// First calculate the selectivity from equation (1) (see function comment),
// and collect the inputs to equation (2).
singleColSelectivity := props.OneSelectivity
newDistinctProduct, oldDistinctProduct := 1.0, 1.0
maxNewDistinct, maxOldDistinct := float64(0), float64(0)
multiColSelWithIndepAssumption := props.OneSelectivity
newSingleColDistinctProduct, oldSingleColDistinctProduct := 1.0, 1.0
maxNewSingleColDistinct, maxOldSingleColDistinct := float64(0), float64(0)
multiColNullCount := -1.0
minLocalSel := props.OneSelectivity
minSingleColSel := props.OneSelectivity
for col, ok := cols.Next(0); ok; col, ok = cols.Next(col + 1) {
colStat, ok := s.ColStats.Lookup(opt.MakeColSet(col))
singleColStat, ok := s.ColStats.Lookup(opt.MakeColSet(col))
if !ok {
multiColSet.Remove(col)
continue
}

inputColStat, inputStats := sb.colStatFromInput(colStat.Cols, e)
localSel := sb.selectivityFromDistinctCount(colStat, inputColStat, inputStats.RowCount)
singleColSelectivity.Multiply(localSel)
inputSingleColStat, inputStats := sb.colStatFromInput(singleColStat.Cols, e)
singleColSel := sb.selectivityFromDistinctCount(singleColStat, inputSingleColStat, inputStats.RowCount)
multiColSelWithIndepAssumption.Multiply(singleColSel)

// Don't bother including columns in the multi-column calculation that
// don't contribute to the selectivity.
if localSel == props.OneSelectivity {
if singleColSel == props.OneSelectivity {
multiColSet.Remove(col)
continue
}

// Calculate values needed for the multi-column stats calculation below.
newDistinctProduct *= colStat.DistinctCount
oldDistinctProduct *= inputColStat.DistinctCount
if colStat.DistinctCount > maxNewDistinct {
maxNewDistinct = colStat.DistinctCount
newSingleColDistinctProduct *= singleColStat.DistinctCount
oldSingleColDistinctProduct *= inputSingleColStat.DistinctCount
if singleColStat.DistinctCount > maxNewSingleColDistinct {
maxNewSingleColDistinct = singleColStat.DistinctCount
}
if inputColStat.DistinctCount > maxOldDistinct {
maxOldDistinct = inputColStat.DistinctCount
if inputSingleColStat.DistinctCount > maxOldSingleColDistinct {
maxOldSingleColDistinct = inputSingleColStat.DistinctCount
}
minLocalSel = props.MinSelectivity(localSel, minLocalSel)
minSingleColSel = props.MinSelectivity(singleColSel, minSingleColSel)
if multiColNullCount < 0 {
multiColNullCount = inputStats.RowCount
}
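// Scale by the fraction of input rows in which this column is null, i.e.,
// the expected chance that this column is also null in a row where all
// previously processed columns are null.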
multiColNullCount *= colStat.NullCount / inputStats.RowCount
multiColNullCount *= singleColStat.NullCount / inputStats.RowCount
}

// If we don't need to use a multi-column statistic, we're done.
if multiColSet.Len() <= 1 {
return singleColSelectivity, minLocalSel, singleColSelectivity
return multiColSelWithIndepAssumption, minSingleColSel, multiColSelWithIndepAssumption
}

// Otherwise, calculate the selectivity using multi-column stats from
// equation (2). See the comment above the function definition for details
// about the formula.
inputColStat, inputStats := sb.colStatFromInput(multiColSet, e)
fdStrength := min(maxOldDistinct/inputColStat.DistinctCount, 1.0)
minFdStrength := min(maxOldDistinct/oldDistinctProduct, fdStrength)
inputMultiColStat, inputStats := sb.colStatFromInput(multiColSet, e)
fdStrength := min(maxOldSingleColDistinct/inputMultiColStat.DistinctCount, 1.0)
minFdStrength := min(maxOldSingleColDistinct/oldSingleColDistinctProduct, fdStrength)
if minFdStrength < 1 {
// Scale the fdStrength so it ranges between 0 and 1.
fdStrength = (fdStrength - minFdStrength) / (1 - minFdStrength)
}
distinctCountRange := max(newDistinctProduct-maxNewDistinct, 0)

colStat, _ := s.ColStats.Add(multiColSet)
colStat.DistinctCount = maxNewDistinct + distinctCountRange*(1-fdStrength)
colStat.NullCount = multiColNullCount
multiColSelectivity := sb.selectivityFromDistinctCount(colStat, inputColStat, inputStats.RowCount)
// These variables correspond to min_distinct, max_distinct, and distinct_range
// in the comment above the function definition.
minNewMultiColDistinct := maxNewSingleColDistinct
maxNewMultiColDistinct := newSingleColDistinctProduct
multiColDistinctRange := max(maxNewMultiColDistinct-minNewMultiColDistinct, 0)

multiColStat, _ := s.ColStats.Add(multiColSet)
multiColStat.DistinctCount = minNewMultiColDistinct + multiColDistinctRange*(1-fdStrength)
multiColStat.NullCount = multiColNullCount
multiColSelectivity := sb.selectivityFromDistinctCount(multiColStat, inputMultiColStat, inputStats.RowCount)

// multiColSelectivity must be at least as large as singleColSelectivity,
// since singleColSelectivity corresponds to equation (1).
multiColSelectivity = props.MaxSelectivity(multiColSelectivity, singleColSelectivity)
// multiColSelectivity must be at least as large as
// multiColSelWithIndepAssumption, since multiColSelWithIndepAssumption
// corresponds to equation (1).
multiColSelectivity = props.MaxSelectivity(multiColSelectivity, multiColSelWithIndepAssumption)

// Now, we must adjust multiColSelectivity so that it is not greater than
// the selectivity of any subset of the columns in multiColSet. This would
// be internally inconsistent and could lead to bad plans. For example,
// x=1 AND y=1 should always be considered more selective (i.e., with lower
// selectivity) than x=1 alone.
//
// We have already found the minimum selectivity of all the individual
// columns (subsets of size 1) above and stored it in minLocalSel. It's not
// We have already found the minimum selectivity among the individual columns
// (subsets of size 1) above and stored it in minSingleColSel. It's not
// practical, however, to calculate the minimum selectivity for all subsets
// of size greater than 1.
//
Expand All @@ -3909,7 +3915,7 @@ func (sb *statisticsBuilder) selectivityFromMultiColDistinctCounts(
//
// In this case, adjust multiColSelectivity as needed.
//
if maxNewDistinct > 1 && multiColSet.Len() > 2 {
if maxNewSingleColDistinct > 1 && multiColSet.Len() > 2 {
var lowDistinctCountCols opt.ColSet
multiColSet.ForEach(func(col opt.ColumnID) {
// We already know the column stat exists if it's in multiColSet.
Expand All @@ -3935,34 +3941,35 @@ func (sb *statisticsBuilder) selectivityFromMultiColDistinctCounts(
multiColSelectivity = props.MinSelectivity(multiColSelectivity, selLowDistinctCountCols)
}
}
multiColSelectivity = props.MinSelectivity(multiColSelectivity, minLocalSel)
multiColSelectivity = props.MinSelectivity(multiColSelectivity, minSingleColSel)

// multiColSelectivityLowerBound is the minimum multi-column selectivity, which
// is the minimum possible new multi-column distinct count divided by the
// maximum old multi-column distinct count (either the product of the old
// single-column distinct counts or the old row count, whichever is smaller).
maxOldMultiColDistinct := min(inputStats.RowCount, oldSingleColDistinctProduct)
multiColSelectivityLowerBound := props.MakeSelectivityFromFraction(
maxNewDistinct, min(inputStats.RowCount, oldDistinctProduct),
minNewMultiColDistinct, maxOldMultiColDistinct,
)
// Ensure that multiColSelectivityLowerBound is not larger than
// multiColSelectivity.
multiColSelectivityLowerBound = props.MinSelectivity(
multiColSelectivityLowerBound, multiColSelectivity,
)

// As described in the function comment, we actually return a weighted sum
// of multi-column and single-column selectivity estimates. To ensure
// selectivityLowerBound is not larger than this selectivity, use the same
// weighting scheme.
// As described in the function comment, we actually return a weighted sum of
// multi-column selectivity estimates with and without an independence
// assumption. To ensure selectivityLowerBound is not larger than this
// selectivity, use the same weighting scheme.
//
// Use MaxSelectivity to handle floating point rounding errors.
w := multiColWeight
return props.MaxSelectivity(singleColSelectivity, props.MakeSelectivity(
(1-w)*singleColSelectivity.AsFloat()+w*multiColSelectivity.AsFloat(),
return props.MaxSelectivity(multiColSelWithIndepAssumption, props.MakeSelectivity(
(1-w)*multiColSelWithIndepAssumption.AsFloat()+w*multiColSelectivity.AsFloat(),
)),
minLocalSel,
props.MaxSelectivity(singleColSelectivity, props.MakeSelectivity(
(1-w)*singleColSelectivity.AsFloat()+w*multiColSelectivityLowerBound.AsFloat(),
minSingleColSel,
props.MaxSelectivity(multiColSelWithIndepAssumption, props.MakeSelectivity(
(1-w)*multiColSelWithIndepAssumption.AsFloat()+w*multiColSelectivityLowerBound.AsFloat(),
))
}

Expand Down

0 comments on commit 30e7aab

Please sign in to comment.