opt: reduce statistics allocations for avg size

Prior to this commit, a column's average size in bytes was included in column statistics. To fetch this average size, the coster requested an individual column statistic for each scanned column. For scans and joins involving many columns, this caused many allocations of column statistics and column sets. Because we only use a column's average size when costing scans and lookup joins, there was no need to include it in column statistics. Average size doesn't propagate up an expression tree like other statistics do. This commit removes average size from column statistics and instead builds a map in `props.Statistics` that maps column IDs to average size. This significantly reduces allocations in some cases. The only downside to this change is that we no longer set a column's average size to zero if it has all NULL values, according to statistics. I believe this is a pretty rare edge case that is unlikely to significantly affect query plans, so I think the trade-off is worth it. Fixes #80186 Release justification: This is a minor change that improves optimizer performance. Release note: None
cockroachdb · Aug 19, 2022 · 16b38bb · 16b38bb
1 parent f6086c1
commit 16b38bb
Show file tree

Hide file tree

Showing 85 changed files with 1,790 additions and 1,834 deletions.
diff --git a/pkg/sql/opt/memo/memo.go b/pkg/sql/opt/memo/memo.go
@@ -445,6 +445,17 @@ func (m *Memo) RequestColStatTable(
 	return nil, false
 }
 
+// RequestColAvgSize returns the column's average size in bytes, or the default
+// column size if none is available. The column must exist in the table with ID tabID.
+func (m *Memo) RequestColAvgSize(tabID opt.TableID, col opt.ColumnID) uint64 {
+	// When SetRoot is called, the statistics builder may have been cleared.
+	// If this happens, we can't serve the request, so return the default size.
+	if m.logPropsBuilder.sb.md != nil {
+		return m.logPropsBuilder.sb.colAvgSize(tabID, col)
+	}
+	return defaultColSize
+}
+
 // RowsProcessed calculates and returns the number of rows processed by the
 // relational expression. It is currently only supported for joins.
 func (m *Memo) RowsProcessed(expr RelExpr) (_ float64, ok bool) {

diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go
@@ -87,9 +87,8 @@ const (
 	// details.
 	multiColWeight = 9.0 / 10.0
 
-	// defaultColSize is the default size of a column in bytes. This is used when
-	// the table statistics have an avgSize of 0 for a given column and not all
-	// columns are NULL.
+	// defaultColSize is the default size of a column in bytes. This is used
+	// when the table statistics have an avgSize of 0 for a given column.
 	defaultColSize = 4.0
 
 	// maxValuesForFullHistogramFromCheckConstraint is the maximum number of
@@ -536,23 +535,6 @@ func (sb *statisticsBuilder) colStatLeaf(
 		}
 		// Only one of the null values counts towards the distinct count.
 		colStat.DistinctCount = s.RowCount - max(colStat.NullCount-1, 0)
-
-		if colSet.Len() == 1 {
-			// If there was only one key in the column set, and it wasn't found in the
-			// cache above, then we don't have statistics on this column so we use the
-			// default size.
-			// TODO(harding): Base the AvgSize on the type of the column.
-			colStat.AvgSize = defaultColSize
-		} else {
-			// Compute the average column size by adding the size of each member of
-			// the lax key together.
-			avgSize := 0.0
-			colSet.ForEach(func(i opt.ColumnID) {
-				colStatLeaf := sb.colStatLeaf(opt.MakeColSet(i), s, fd, notNullCols)
-				avgSize += colStatLeaf.AvgSize
-			})
-			colStat.AvgSize = avgSize
-		}
 		return colStat
 	}
 
@@ -562,7 +544,6 @@ func (sb *statisticsBuilder) colStatLeaf(
 		col, _ := colSet.Next(0)
 		colStat.DistinctCount = UnknownDistinctCountRatio * s.RowCount
 		colStat.NullCount = UnknownNullCountRatio * s.RowCount
-		colStat.AvgSize = defaultColSize
 		if notNullCols.Contains(col) {
 			colStat.NullCount = 0
 		}
@@ -579,21 +560,17 @@ func (sb *statisticsBuilder) colStatLeaf(
 	} else {
 		distinctCount := 1.0
 		nullCount := s.RowCount
-		avgSize := 0.0
 		colSet.ForEach(func(i opt.ColumnID) {
 			colStatLeaf := sb.colStatLeaf(opt.MakeColSet(i), s, fd, notNullCols)
 			distinctCount *= colStatLeaf.DistinctCount
 			// Multiply by the expected chance of collisions with nulls already
 			// collected.
 			nullCount *= colStatLeaf.NullCount / s.RowCount
-			// Add the average size of the columns together.
-			avgSize += colStatLeaf.AvgSize
 		})
 		// Fetch the colStat again since it may now have a different address.
 		colStat, _ = s.ColStats.Lookup(colSet)
 		colStat.DistinctCount = min(distinctCount, s.RowCount)
 		colStat.NullCount = min(nullCount, s.RowCount)
-		colStat.AvgSize = avgSize
 	}
 
 	return colStat
@@ -662,6 +639,15 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
 				cols.Add(tabID.ColumnID(stat.ColumnOrdinal(i)))
 			}
 
+			// We currently only use average column sizes of single column
+			// statistics, so we can ignore multi-column average sizes.
+			if stat.ColumnCount() == 1 && stat.AvgSize() != 0 {
+				if stats.AvgColSizes == nil {
+					stats.AvgColSizes = make(map[opt.ColumnID]uint64)
+				}
+				stats.AvgColSizes[cols.SingleColumn()] = stat.AvgSize()
+			}
+
 			needHistogram := cols.Len() == 1 && stat.Histogram() != nil &&
 				sb.evalCtx.SessionData().OptimizerUseHistograms
 			seenInvertedStat := false
@@ -691,7 +677,6 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
 				//    non-inverted histogram that we should be using instead.
 				colStat.DistinctCount = float64(stat.DistinctCount())
 				colStat.NullCount = float64(stat.NullCount())
-				colStat.AvgSize = float64(stat.AvgSize())
 				if needHistogram && !invertedStatistic {
 					// A statistic is inverted if the column is invertible and its
 					// histogram contains buckets of types BYTES.
@@ -732,10 +717,11 @@ func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Stati
 						invColStat.DistinctCount = max(invColStat.Histogram.DistinctValuesCount(), 1)
 						// Inverted indexes don't have nulls.
 						invColStat.NullCount = 0
-						if stat.AvgSize() == 0 {
-							invColStat.AvgSize = defaultColSize
-						} else {
-							invColStat.AvgSize = float64(stat.AvgSize())
+						if stats.AvgColSizes == nil {
+							stats.AvgColSizes = make(map[opt.ColumnID]uint64)
+						}
+						if stat.AvgSize() != 0 {
+							stats.AvgColSizes[invCol] = stat.AvgSize()
 						}
 					}
 				}
@@ -765,6 +751,14 @@ func (sb *statisticsBuilder) colStatTable(
 	return sb.colStatLeaf(colSet, tableStats, tableFD, tableNotNullCols)
 }
 
+func (sb *statisticsBuilder) colAvgSize(tabID opt.TableID, col opt.ColumnID) uint64 {
+	tableStats := sb.makeTableStatistics(tabID)
+	if avgSize, ok := tableStats.AvgColSizes[col]; ok {
+		return avgSize
+	}
+	return defaultColSize
+}
+
 // +------+
 // | Scan |
 // +------+
@@ -1092,7 +1086,6 @@ func (sb *statisticsBuilder) colStatProject(
 		// above.
 		inputColStat := sb.colStatFromChild(reqInputCols, prj, 0 /* childIdx */)
 		colStat.DistinctCount = inputColStat.DistinctCount
-		colStat.AvgSize = inputColStat.AvgSize
 		if nonNullFound {
 			colStat.NullCount = 0
 		} else {
@@ -1101,7 +1094,6 @@ func (sb *statisticsBuilder) colStatProject(
 	} else {
 		// There are no columns in this expression, so it must be a constant.
 		colStat.DistinctCount = 1
-		colStat.AvgSize = float64(defaultColSize * colSet.Len())
 		if nonNullFound {
 			colStat.NullCount = 0
 		} else {
@@ -1530,7 +1522,6 @@ func (sb *statisticsBuilder) colStatJoin(colSet opt.ColSet, join RelExpr) *props
 			}
 			colStat, _ = s.ColStats.Add(colSet)
 			colStat.DistinctCount = leftColStat.DistinctCount * rightColStat.DistinctCount
-			colStat.AvgSize = leftColStat.AvgSize + rightColStat.AvgSize
 		}
 
 		// Null count estimation - assume an inner join and then bump the null count later
@@ -1709,15 +1700,13 @@ func (sb *statisticsBuilder) colStatIndexJoin(
 	colStat, _ := s.ColStats.Add(colSet)
 	colStat.DistinctCount = 1
 	colStat.NullCount = s.RowCount
-	colStat.AvgSize = 0
 
 	// Some of the requested columns may be from the input index.
 	reqInputCols := colSet.Intersection(inputCols)
 	if !reqInputCols.Empty() {
 		inputColStat := sb.colStatFromChild(reqInputCols, join, 0 /* childIdx */)
 		colStat.DistinctCount = inputColStat.DistinctCount
 		colStat.NullCount = inputColStat.NullCount
-		colStat.AvgSize += inputColStat.AvgSize
 	}
 
 	// Other requested columns may be from the primary index.
@@ -1744,8 +1733,6 @@ func (sb *statisticsBuilder) colStatIndexJoin(
 		f1 := lookupColStat.NullCount / inputStats.RowCount
 		f2 := colStat.NullCount / inputStats.RowCount
 		colStat.NullCount = inputStats.RowCount * f1 * f2
-
-		colStat.AvgSize += lookupColStat.AvgSize
 	}
 
 	if colSet.Intersects(relProps.NotNullCols) {
@@ -1913,7 +1900,6 @@ func (sb *statisticsBuilder) colStatGroupBy(
 		colStat.DistinctCount = 1
 		// TODO(itsbilal): Handle case where the scalar resolves to NULL.
 		colStat.NullCount = 0
-		colStat.AvgSize = float64(defaultColSize * colSet.Len())
 		return colStat
 	}
 
@@ -1927,7 +1913,6 @@ func (sb *statisticsBuilder) colStatGroupBy(
 		colStat, _ = s.ColStats.Add(colSet)
 		inputColStat = sb.colStatFromChild(groupingColSet, groupNode, 0 /* childIdx */)
 		colStat.DistinctCount = inputColStat.DistinctCount
-		colStat.AvgSize = inputColStat.AvgSize
 	} else {
 		// Make a copy so we don't modify the original
 		colStat = sb.copyColStatFromChild(colSet, groupNode, s)
@@ -2026,19 +2011,14 @@ func (sb *statisticsBuilder) colStatSetNodeImpl(
 	case opt.UnionOp, opt.UnionAllOp:
 		colStat.DistinctCount = leftColStat.DistinctCount + rightColStat.DistinctCount
 		colStat.NullCount = leftNullCount + rightNullCount
-		leftRowCount := sb.statsFromChild(setNode, 0 /* childIdx */).RowCount
-		rightRowCount := sb.statsFromChild(setNode, 1 /* childIdx */).RowCount
-		colStat.AvgSize = (leftColStat.AvgSize*leftRowCount + rightColStat.AvgSize*rightRowCount) / (leftRowCount + rightRowCount)
 
 	case opt.IntersectOp, opt.IntersectAllOp:
 		colStat.DistinctCount = min(leftColStat.DistinctCount, rightColStat.DistinctCount)
 		colStat.NullCount = min(leftNullCount, rightNullCount)
-		colStat.AvgSize = leftColStat.AvgSize
 
 	case opt.ExceptOp, opt.ExceptAllOp:
 		colStat.DistinctCount = leftColStat.DistinctCount
 		colStat.NullCount = max(leftNullCount-rightNullCount, 0)
-		colStat.AvgSize = leftColStat.AvgSize
 	}
 
 	// Use the actual null counts for bag operations, and normalize them for set
@@ -2119,9 +2099,6 @@ func (sb *statisticsBuilder) colStatValues(
 	colStat, _ := s.ColStats.Add(colSet)
 	colStat.DistinctCount = float64(len(distinct))
 	colStat.NullCount = float64(nullCount)
-	// TODO(harding): The AvgSize would be more accurate if we took the width and/
-	// or type of the values.
-	colStat.AvgSize = float64(defaultColSize * colSet.Len())
 	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
 	return colStat
 }
@@ -2165,9 +2142,6 @@ func (sb *statisticsBuilder) colStatLiteralValues(
 	colStat, _ := s.ColStats.Add(colSet)
 	colStat.DistinctCount = float64(len(distinct))
 	colStat.NullCount = float64(nullCount)
-	// TODO(harding): The AvgSize would be more accurate if we took the width and/
-	// or type of the values.
-	colStat.AvgSize = float64(defaultColSize * colSet.Len())
 	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
 	return colStat
 }
@@ -2318,8 +2292,6 @@ func (sb *statisticsBuilder) colStatMax1Row(
 	if colSet.Intersects(max1Row.Relational().NotNullCols) {
 		colStat.NullCount = 0
 	}
-	inputColStat := sb.colStatFromChild(colSet, max1Row, 0 /* childIdx */)
-	colStat.AvgSize = inputColStat.AvgSize
 	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
 	return colStat
 }
@@ -2351,7 +2323,6 @@ func (sb *statisticsBuilder) colStatOrdinality(
 	colStat, _ := s.ColStats.Add(colSet)
 
 	inputColStat := sb.colStatFromChild(colSet, ord, 0 /* childIdx */)
-	colStat.AvgSize = inputColStat.AvgSize
 
 	if colSet.Contains(ord.ColID) {
 		// The ordinality column is a key, so every row is distinct.
@@ -2416,20 +2387,16 @@ func (sb *statisticsBuilder) colStatWindow(
 		if colSet.SubsetOf(windowCols) {
 			// The generated columns are the only columns being requested.
 			colStat.NullCount = 0
-			// TODO(harding): make AvgSize more accurate.
-			colStat.AvgSize = float64(defaultColSize * colSet.Len())
 		} else {
-			// Copy NullCount and AvgSize from child.
+			// Copy NullCount from child.
 			colSetChild := colSet.Difference(windowCols)
 			inputColStat := sb.colStatFromChild(colSetChild, window, 0 /* childIdx */)
 			colStat.NullCount = inputColStat.NullCount
-			colStat.AvgSize = inputColStat.AvgSize
 		}
 	} else {
 		inputColStat := sb.colStatFromChild(colSet, window, 0 /* childIdx */)
 		colStat.DistinctCount = inputColStat.DistinctCount
 		colStat.NullCount = inputColStat.NullCount
-		colStat.AvgSize = inputColStat.AvgSize
 	}
 
 	if colSet.Intersects(relProps.NotNullCols) {
@@ -2494,21 +2461,18 @@ func (sb *statisticsBuilder) colStatProjectSet(
 	colStat, _ := s.ColStats.Add(colSet)
 	colStat.DistinctCount = 1
 	colStat.NullCount = s.RowCount
-	colStat.AvgSize = 0
 
 	// Some of the requested columns may be from the input.
 	reqInputCols := colSet.Intersection(inputCols)
 	if !reqInputCols.Empty() {
 		inputColStat := sb.colStatFromChild(reqInputCols, projectSet, 0 /* childIdx */)
 		colStat.DistinctCount = inputColStat.DistinctCount
 		colStat.NullCount = inputColStat.NullCount * (s.RowCount / inputStats.RowCount)
-		colStat.AvgSize += inputColStat.AvgSize
 	}
 
 	// Other requested columns may be from the output columns of the zip.
 	zipCols := projectSet.Zip.OutputCols()
 	reqZipCols := colSet.Difference(inputCols).Intersection(zipCols)
-	colStat.AvgSize += float64(defaultColSize * reqZipCols.Len())
 	if !reqZipCols.Empty() {
 		// Calculate the distinct count and null count for the zip columns
 		// after the cross join has been applied.
@@ -2608,7 +2572,6 @@ func (sb *statisticsBuilder) colStatWithScan(
 	colStat, _ := s.ColStats.Add(colSet)
 	colStat.DistinctCount = inColStat.DistinctCount
 	colStat.NullCount = inColStat.NullCount
-	colStat.AvgSize = inColStat.AvgSize
 	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
 	return colStat
 }
@@ -2646,7 +2609,6 @@ func (sb *statisticsBuilder) colStatMutation(
 	colStat, _ := s.ColStats.Add(colSet)
 	colStat.DistinctCount = inColStat.DistinctCount
 	colStat.NullCount = inColStat.NullCount
-	colStat.AvgSize = inColStat.AvgSize
 	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
 	return colStat
 }
@@ -2670,7 +2632,6 @@ func (sb *statisticsBuilder) colStatSequenceSelect(
 	colStat, _ := s.ColStats.Add(colSet)
 	colStat.DistinctCount = 1
 	colStat.NullCount = 0
-	colStat.AvgSize = defaultColSize
 	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
 	return colStat
 }
@@ -2694,7 +2655,6 @@ func (sb *statisticsBuilder) colStatUnknown(
 	colStat, _ := s.ColStats.Add(colSet)
 	colStat.DistinctCount = s.RowCount
 	colStat.NullCount = 0
-	colStat.AvgSize = float64(defaultColSize * colSet.Len())
 	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
 	return colStat
 }
@@ -2743,7 +2703,6 @@ func (sb *statisticsBuilder) copyColStat(
 	colStat, _ := s.ColStats.Add(colSet)
 	colStat.DistinctCount = inputColStat.DistinctCount
 	colStat.NullCount = inputColStat.NullCount
-	colStat.AvgSize = inputColStat.AvgSize
 	return colStat
 }
 
@@ -2808,12 +2767,6 @@ func (sb *statisticsBuilder) finalizeFromRowCountAndDistinctCounts(
 	colStat.DistinctCount = min(colStat.DistinctCount, rowCount)
 	colStat.NullCount = min(colStat.NullCount, rowCount)
 
-	// If there are non-nulls in the column but the avgSize is 0, use the default
-	// column size.
-	if rowCount > 0 && colStat.AvgSize == 0 && colStat.NullCount < rowCount {
-		colStat.AvgSize = float64(defaultColSize * colStat.Cols.Len())
-	}
-
 	// Uniformly reduce the size of each histogram bucket so the number of values
 	// is no larger than the row count.
 	if colStat.Histogram != nil {
@@ -3853,7 +3806,6 @@ func (sb *statisticsBuilder) selectivityFromMultiColDistinctCounts(
 	colStat, _ := s.ColStats.Add(multiColSet)
 	colStat.DistinctCount = maxNewDistinct + distinctCountRange*(1-fdStrength)
 	colStat.NullCount = multiColNullCount
-	colStat.AvgSize = inputColStat.AvgSize
 	multiColSelectivity := sb.selectivityFromDistinctCount(colStat, inputColStat, inputStats.RowCount)
 
 	// multiColSelectivity must be at least as large as singleColSelectivity,
@@ -4745,7 +4697,6 @@ func (sb *statisticsBuilder) buildStatsFromCheckConstraints(
 			// up via a ColSet.
 			colStat.DistinctCount = distinctCount
 			colStat.NullCount = nullCount
-			colStat.AvgSize = avgSize
 			if useHistogram {
 				colStat.Histogram = &props.Histogram{}
 				colStat.Histogram.Init(sb.evalCtx, firstColID, histogram)