Skip to content

Commit

Permalink
opt: reduce statistics allocations for avg size
Browse files Browse the repository at this point in the history
Prior to this commit, a column's average size in bytes was included in
column statistics. To fetch this average size, the coster requested an
individual column statistic for each scanned column. For scans and joins
involving many columns, this caused many allocations of column
statistics and column sets.

Because we only use a column's average size when costing scans and
lookup joins, there was no need to include it in column statistics.
Average size doesn't propagate up an expression tree like other
statistics do.

This commit removes average size from column statistics and instead
builds a map in `props.Statistics` that maps column IDs to average size.
This significantly reduces allocations in some cases.

The only downside to this change is that we no longer set a column's
average size to zero if it has all NULL values, according to statistics.
I believe this is a pretty rare edge case that is unlikely to
significantly affect query plans, so I think the trade-off is worth it.

Fixes #80186

Release justification: This is a minor change that improves optimizer
performance.

Release note: None
  • Loading branch information
mgartner committed Aug 19, 2022
1 parent f6086c1 commit 16b38bb
Show file tree
Hide file tree
Showing 85 changed files with 1,790 additions and 1,834 deletions.
11 changes: 11 additions & 0 deletions pkg/sql/opt/memo/memo.go
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,17 @@ func (m *Memo) RequestColStatTable(
return nil, false
}

// RequestColAvgSize returns the average size statistic of column col in the
// table with ID tabID, as computed by the memo's statistics builder. The
// column must exist in that table.
//
// If the statistics builder no longer has metadata attached (it may have been
// cleared when SetRoot was called), the request cannot be served and
// defaultColSize is returned instead.
func (m *Memo) RequestColAvgSize(tabID opt.TableID, col opt.ColumnID) uint64 {
	if m.logPropsBuilder.sb.md == nil {
		// The statistics builder has been cleared; fall back to the default.
		return defaultColSize
	}
	return m.logPropsBuilder.sb.colAvgSize(tabID, col)
}

// RowsProcessed calculates and returns the number of rows processed by the
// relational expression. It is currently only supported for joins.
func (m *Memo) RowsProcessed(expr RelExpr) (_ float64, ok bool) {
Expand Down
99 changes: 25 additions & 74 deletions pkg/sql/opt/memo/statistics_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,8 @@ const (
// details.
multiColWeight = 9.0 / 10.0

// defaultColSize is the default size of a column in bytes. This is used when
// the table statistics have an avgSize of 0 for a given column and not all
// columns are NULL.
// defaultColSize is the default size of a column in bytes. This is used
// when the table statistics have an avgSize of 0 for a given column.
defaultColSize = 4.0

// maxValuesForFullHistogramFromCheckConstraint is the maximum number of
Expand Down Expand Up @@ -536,23 +535,6 @@ func (sb *statisticsBuilder) colStatLeaf(
}
// Only one of the null values counts towards the distinct count.
colStat.DistinctCount = s.RowCount - max(colStat.NullCount-1, 0)

if colSet.Len() == 1 {
// If there was only one key in the column set, and it wasn't found in the
// cache above, then we don't have statistics on this column so we use the
// default size.
// TODO(harding): Base the AvgSize on the type of the column.
colStat.AvgSize = defaultColSize
} else {
// Compute the average column size by adding the size of each member of
// the lax key together.
avgSize := 0.0
colSet.ForEach(func(i opt.ColumnID) {
colStatLeaf := sb.colStatLeaf(opt.MakeColSet(i), s, fd, notNullCols)
avgSize += colStatLeaf.AvgSize
})
colStat.AvgSize = avgSize
}
return colStat
}

Expand All @@ -562,7 +544,6 @@ func (sb *statisticsBuilder) colStatLeaf(
col, _ := colSet.Next(0)
colStat.DistinctCount = UnknownDistinctCountRatio * s.RowCount
colStat.NullCount = UnknownNullCountRatio * s.RowCount
colStat.AvgSize = defaultColSize
if notNullCols.Contains(col) {
colStat.NullCount = 0
}
Expand All @@ -579,21 +560,17 @@ func (sb *statisticsBuilder) colStatLeaf(
} else {
distinctCount := 1.0
nullCount := s.RowCount
avgSize := 0.0
colSet.ForEach(func(i opt.ColumnID) {
colStatLeaf := sb.colStatLeaf(opt.MakeColSet(i), s, fd, notNullCols)
distinctCount *= colStatLeaf.DistinctCount
// Multiply by the expected chance of collisions with nulls already
// collected.
nullCount *= colStatLeaf.NullCount / s.RowCount
// Add the average size of the columns together.
avgSize += colStatLeaf.AvgSize
})
// Fetch the colStat again since it may now have a different address.
colStat, _ = s.ColStats.Lookup(colSet)
colStat.DistinctCount = min(distinctCount, s.RowCount)
colStat.NullCount = min(nullCount, s.RowCount)
colStat.AvgSize = avgSize
}

return colStat
Expand Down Expand Up @@ -662,6 +639,15 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
cols.Add(tabID.ColumnID(stat.ColumnOrdinal(i)))
}

// We currently only use average column sizes of single column
// statistics, so we can ignore multi-column average sizes.
if stat.ColumnCount() == 1 && stat.AvgSize() != 0 {
if stats.AvgColSizes == nil {
stats.AvgColSizes = make(map[opt.ColumnID]uint64)
}
stats.AvgColSizes[cols.SingleColumn()] = stat.AvgSize()
}

needHistogram := cols.Len() == 1 && stat.Histogram() != nil &&
sb.evalCtx.SessionData().OptimizerUseHistograms
seenInvertedStat := false
Expand Down Expand Up @@ -691,7 +677,6 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
// non-inverted histogram that we should be using instead.
colStat.DistinctCount = float64(stat.DistinctCount())
colStat.NullCount = float64(stat.NullCount())
colStat.AvgSize = float64(stat.AvgSize())
if needHistogram && !invertedStatistic {
// A statistic is inverted if the column is invertible and its
// histogram contains buckets of types BYTES.
Expand Down Expand Up @@ -732,10 +717,11 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
invColStat.DistinctCount = max(invColStat.Histogram.DistinctValuesCount(), 1)
// Inverted indexes don't have nulls.
invColStat.NullCount = 0
if stat.AvgSize() == 0 {
invColStat.AvgSize = defaultColSize
} else {
invColStat.AvgSize = float64(stat.AvgSize())
if stats.AvgColSizes == nil {
stats.AvgColSizes = make(map[opt.ColumnID]uint64)
}
if stat.AvgSize() != 0 {
stats.AvgColSizes[invCol] = stat.AvgSize()
}
}
}
Expand Down Expand Up @@ -765,6 +751,14 @@ func (sb *statisticsBuilder) colStatTable(
return sb.colStatLeaf(colSet, tableStats, tableFD, tableNotNullCols)
}

// colAvgSize returns the average size recorded for column col in the
// statistics of the table with ID tabID. If the table statistics have no
// entry for col in AvgColSizes, defaultColSize is returned.
func (sb *statisticsBuilder) colAvgSize(tabID opt.TableID, col opt.ColumnID) uint64 {
	size, found := sb.makeTableStatistics(tabID).AvgColSizes[col]
	if !found {
		// No average size was collected for this column.
		return defaultColSize
	}
	return size
}

// +------+
// | Scan |
// +------+
Expand Down Expand Up @@ -1092,7 +1086,6 @@ func (sb *statisticsBuilder) colStatProject(
// above.
inputColStat := sb.colStatFromChild(reqInputCols, prj, 0 /* childIdx */)
colStat.DistinctCount = inputColStat.DistinctCount
colStat.AvgSize = inputColStat.AvgSize
if nonNullFound {
colStat.NullCount = 0
} else {
Expand All @@ -1101,7 +1094,6 @@ func (sb *statisticsBuilder) colStatProject(
} else {
// There are no columns in this expression, so it must be a constant.
colStat.DistinctCount = 1
colStat.AvgSize = float64(defaultColSize * colSet.Len())
if nonNullFound {
colStat.NullCount = 0
} else {
Expand Down Expand Up @@ -1530,7 +1522,6 @@ func (sb *statisticsBuilder) colStatJoin(colSet opt.ColSet, join RelExpr) *props
}
colStat, _ = s.ColStats.Add(colSet)
colStat.DistinctCount = leftColStat.DistinctCount * rightColStat.DistinctCount
colStat.AvgSize = leftColStat.AvgSize + rightColStat.AvgSize
}

// Null count estimation - assume an inner join and then bump the null count later
Expand Down Expand Up @@ -1709,15 +1700,13 @@ func (sb *statisticsBuilder) colStatIndexJoin(
colStat, _ := s.ColStats.Add(colSet)
colStat.DistinctCount = 1
colStat.NullCount = s.RowCount
colStat.AvgSize = 0

// Some of the requested columns may be from the input index.
reqInputCols := colSet.Intersection(inputCols)
if !reqInputCols.Empty() {
inputColStat := sb.colStatFromChild(reqInputCols, join, 0 /* childIdx */)
colStat.DistinctCount = inputColStat.DistinctCount
colStat.NullCount = inputColStat.NullCount
colStat.AvgSize += inputColStat.AvgSize
}

// Other requested columns may be from the primary index.
Expand All @@ -1744,8 +1733,6 @@ func (sb *statisticsBuilder) colStatIndexJoin(
f1 := lookupColStat.NullCount / inputStats.RowCount
f2 := colStat.NullCount / inputStats.RowCount
colStat.NullCount = inputStats.RowCount * f1 * f2

colStat.AvgSize += lookupColStat.AvgSize
}

if colSet.Intersects(relProps.NotNullCols) {
Expand Down Expand Up @@ -1913,7 +1900,6 @@ func (sb *statisticsBuilder) colStatGroupBy(
colStat.DistinctCount = 1
// TODO(itsbilal): Handle case where the scalar resolves to NULL.
colStat.NullCount = 0
colStat.AvgSize = float64(defaultColSize * colSet.Len())
return colStat
}

Expand All @@ -1927,7 +1913,6 @@ func (sb *statisticsBuilder) colStatGroupBy(
colStat, _ = s.ColStats.Add(colSet)
inputColStat = sb.colStatFromChild(groupingColSet, groupNode, 0 /* childIdx */)
colStat.DistinctCount = inputColStat.DistinctCount
colStat.AvgSize = inputColStat.AvgSize
} else {
// Make a copy so we don't modify the original
colStat = sb.copyColStatFromChild(colSet, groupNode, s)
Expand Down Expand Up @@ -2026,19 +2011,14 @@ func (sb *statisticsBuilder) colStatSetNodeImpl(
case opt.UnionOp, opt.UnionAllOp:
colStat.DistinctCount = leftColStat.DistinctCount + rightColStat.DistinctCount
colStat.NullCount = leftNullCount + rightNullCount
leftRowCount := sb.statsFromChild(setNode, 0 /* childIdx */).RowCount
rightRowCount := sb.statsFromChild(setNode, 1 /* childIdx */).RowCount
colStat.AvgSize = (leftColStat.AvgSize*leftRowCount + rightColStat.AvgSize*rightRowCount) / (leftRowCount + rightRowCount)

case opt.IntersectOp, opt.IntersectAllOp:
colStat.DistinctCount = min(leftColStat.DistinctCount, rightColStat.DistinctCount)
colStat.NullCount = min(leftNullCount, rightNullCount)
colStat.AvgSize = leftColStat.AvgSize

case opt.ExceptOp, opt.ExceptAllOp:
colStat.DistinctCount = leftColStat.DistinctCount
colStat.NullCount = max(leftNullCount-rightNullCount, 0)
colStat.AvgSize = leftColStat.AvgSize
}

// Use the actual null counts for bag operations, and normalize them for set
Expand Down Expand Up @@ -2119,9 +2099,6 @@ func (sb *statisticsBuilder) colStatValues(
colStat, _ := s.ColStats.Add(colSet)
colStat.DistinctCount = float64(len(distinct))
colStat.NullCount = float64(nullCount)
// TODO(harding): The AvgSize would be more accurate if we took the width and/
// or type of the values.
colStat.AvgSize = float64(defaultColSize * colSet.Len())
sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
return colStat
}
Expand Down Expand Up @@ -2165,9 +2142,6 @@ func (sb *statisticsBuilder) colStatLiteralValues(
colStat, _ := s.ColStats.Add(colSet)
colStat.DistinctCount = float64(len(distinct))
colStat.NullCount = float64(nullCount)
// TODO(harding): The AvgSize would be more accurate if we took the width and/
// or type of the values.
colStat.AvgSize = float64(defaultColSize * colSet.Len())
sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
return colStat
}
Expand Down Expand Up @@ -2318,8 +2292,6 @@ func (sb *statisticsBuilder) colStatMax1Row(
if colSet.Intersects(max1Row.Relational().NotNullCols) {
colStat.NullCount = 0
}
inputColStat := sb.colStatFromChild(colSet, max1Row, 0 /* childIdx */)
colStat.AvgSize = inputColStat.AvgSize
sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
return colStat
}
Expand Down Expand Up @@ -2351,7 +2323,6 @@ func (sb *statisticsBuilder) colStatOrdinality(
colStat, _ := s.ColStats.Add(colSet)

inputColStat := sb.colStatFromChild(colSet, ord, 0 /* childIdx */)
colStat.AvgSize = inputColStat.AvgSize

if colSet.Contains(ord.ColID) {
// The ordinality column is a key, so every row is distinct.
Expand Down Expand Up @@ -2416,20 +2387,16 @@ func (sb *statisticsBuilder) colStatWindow(
if colSet.SubsetOf(windowCols) {
// The generated columns are the only columns being requested.
colStat.NullCount = 0
// TODO(harding): make AvgSize more accurate.
colStat.AvgSize = float64(defaultColSize * colSet.Len())
} else {
// Copy NullCount and AvgSize from child.
// Copy NullCount from child.
colSetChild := colSet.Difference(windowCols)
inputColStat := sb.colStatFromChild(colSetChild, window, 0 /* childIdx */)
colStat.NullCount = inputColStat.NullCount
colStat.AvgSize = inputColStat.AvgSize
}
} else {
inputColStat := sb.colStatFromChild(colSet, window, 0 /* childIdx */)
colStat.DistinctCount = inputColStat.DistinctCount
colStat.NullCount = inputColStat.NullCount
colStat.AvgSize = inputColStat.AvgSize
}

if colSet.Intersects(relProps.NotNullCols) {
Expand Down Expand Up @@ -2494,21 +2461,18 @@ func (sb *statisticsBuilder) colStatProjectSet(
colStat, _ := s.ColStats.Add(colSet)
colStat.DistinctCount = 1
colStat.NullCount = s.RowCount
colStat.AvgSize = 0

// Some of the requested columns may be from the input.
reqInputCols := colSet.Intersection(inputCols)
if !reqInputCols.Empty() {
inputColStat := sb.colStatFromChild(reqInputCols, projectSet, 0 /* childIdx */)
colStat.DistinctCount = inputColStat.DistinctCount
colStat.NullCount = inputColStat.NullCount * (s.RowCount / inputStats.RowCount)
colStat.AvgSize += inputColStat.AvgSize
}

// Other requested columns may be from the output columns of the zip.
zipCols := projectSet.Zip.OutputCols()
reqZipCols := colSet.Difference(inputCols).Intersection(zipCols)
colStat.AvgSize += float64(defaultColSize * reqZipCols.Len())
if !reqZipCols.Empty() {
// Calculate the distinct count and null count for the zip columns
// after the cross join has been applied.
Expand Down Expand Up @@ -2608,7 +2572,6 @@ func (sb *statisticsBuilder) colStatWithScan(
colStat, _ := s.ColStats.Add(colSet)
colStat.DistinctCount = inColStat.DistinctCount
colStat.NullCount = inColStat.NullCount
colStat.AvgSize = inColStat.AvgSize
sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
return colStat
}
Expand Down Expand Up @@ -2646,7 +2609,6 @@ func (sb *statisticsBuilder) colStatMutation(
colStat, _ := s.ColStats.Add(colSet)
colStat.DistinctCount = inColStat.DistinctCount
colStat.NullCount = inColStat.NullCount
colStat.AvgSize = inColStat.AvgSize
sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
return colStat
}
Expand All @@ -2670,7 +2632,6 @@ func (sb *statisticsBuilder) colStatSequenceSelect(
colStat, _ := s.ColStats.Add(colSet)
colStat.DistinctCount = 1
colStat.NullCount = 0
colStat.AvgSize = defaultColSize
sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
return colStat
}
Expand All @@ -2694,7 +2655,6 @@ func (sb *statisticsBuilder) colStatUnknown(
colStat, _ := s.ColStats.Add(colSet)
colStat.DistinctCount = s.RowCount
colStat.NullCount = 0
colStat.AvgSize = float64(defaultColSize * colSet.Len())
sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
return colStat
}
Expand Down Expand Up @@ -2743,7 +2703,6 @@ func (sb *statisticsBuilder) copyColStat(
colStat, _ := s.ColStats.Add(colSet)
colStat.DistinctCount = inputColStat.DistinctCount
colStat.NullCount = inputColStat.NullCount
colStat.AvgSize = inputColStat.AvgSize
return colStat
}

Expand Down Expand Up @@ -2808,12 +2767,6 @@ func (sb *statisticsBuilder) finalizeFromRowCountAndDistinctCounts(
colStat.DistinctCount = min(colStat.DistinctCount, rowCount)
colStat.NullCount = min(colStat.NullCount, rowCount)

// If there are non-nulls in the column but the avgSize is 0, use the default
// column size.
if rowCount > 0 && colStat.AvgSize == 0 && colStat.NullCount < rowCount {
colStat.AvgSize = float64(defaultColSize * colStat.Cols.Len())
}

// Uniformly reduce the size of each histogram bucket so the number of values
// is no larger than the row count.
if colStat.Histogram != nil {
Expand Down Expand Up @@ -3853,7 +3806,6 @@ func (sb *statisticsBuilder) selectivityFromMultiColDistinctCounts(
colStat, _ := s.ColStats.Add(multiColSet)
colStat.DistinctCount = maxNewDistinct + distinctCountRange*(1-fdStrength)
colStat.NullCount = multiColNullCount
colStat.AvgSize = inputColStat.AvgSize
multiColSelectivity := sb.selectivityFromDistinctCount(colStat, inputColStat, inputStats.RowCount)

// multiColSelectivity must be at least as large as singleColSelectivity,
Expand Down Expand Up @@ -4745,7 +4697,6 @@ func (sb *statisticsBuilder) buildStatsFromCheckConstraints(
// up via a ColSet.
colStat.DistinctCount = distinctCount
colStat.NullCount = nullCount
colStat.AvgSize = avgSize
if useHistogram {
colStat.Histogram = &props.Histogram{}
colStat.Histogram.Init(sb.evalCtx, firstColID, histogram)
Expand Down
Loading

0 comments on commit 16b38bb

Please sign in to comment.