sql/stats: support rowCountEq = 0 in histogram.adjustCounts

The predicted histograms in statistics forecasts will often have buckets with NumEq = 0, and some predicted histograms will have _all_ buckets with NumEq = 0. This wasn't possible before forecasting, because the histograms produced by `EquiDepthHistogram` never have any buckets with NumEq = 0.

If `adjustCounts` is called on such a histogram, `rowCountEq` and `distinctCountEq` will be zero. `adjustCounts` should still be able to fix such a histogram to have sum(NumRange) = rowCountTotal and sum(DistinctRange) = distinctCountTotal. This patch teaches `adjustCounts` to handle these histograms.

(Similarly, predicted histograms could have all buckets with NumRange = 0, but this is already possible for histograms produced by `EquiDepthHistogram`, so `adjustCounts` already handles these.)

Also, add a few more comments to `adjustCounts`.

Assists: #79872

Release note: None
cockroachdb · Jun 6, 2022 · 5e5edf5 · 5e5edf5
1 parent 2fc45fc
commit 5e5edf5
Show file tree

Hide file tree

Showing 2 changed files with 58 additions and 8 deletions.
diff --git a/pkg/sql/stats/histogram.go b/pkg/sql/stats/histogram.go
@@ -170,8 +170,16 @@ type histogram struct {
 }
 
 // adjustCounts adjusts the row count and number of distinct values per bucket
-// based on the total row count and estimated distinct count.
+// to equal the total row count and estimated distinct count. The total row
+// count and estimated distinct count should not include NULL values, and the
+// histogram should not contain any buckets for NULL values.
 func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctCountTotal float64) {
+	// Empty table cases.
+	if rowCountTotal <= 0 || distinctCountTotal <= 0 {
+		h.buckets = make([]cat.HistogramBucket, 0)
+		return
+	}
+
 	// Calculate the current state of the histogram so we can adjust it as needed.
 	// The number of rows and distinct values represented by the histogram should
 	// be adjusted so they equal rowCountTotal and distinctCountTotal.
@@ -189,13 +197,16 @@ func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctC
 		}
 	}
 
-	if rowCountEq <= 0 {
-		panic(errors.AssertionFailedf("expected a positive value for rowCountEq"))
+	// If the histogram only had empty buckets, we can't adjust it.
+	if rowCountRange+rowCountEq <= 0 || distinctCountRange+distinctCountEq <= 0 {
+		h.buckets = make([]cat.HistogramBucket, 0)
+		return
 	}
 
 	// If the upper bounds account for all distinct values (as estimated by the
 	// sketch), make the histogram consistent by clearing the ranges and adjusting
-	// the NumEq values to add up to the row count.
+	// the NumEq values to add up to the row count. This might be the case for
+	// low-cardinality types like BOOL and ENUM or other low-cardinality data.
 	if distinctCountEq >= distinctCountTotal {
 		adjustmentFactorNumEq := rowCountTotal / rowCountEq
 		for i := range h.buckets {
@@ -209,7 +220,7 @@ func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctC
 	// The upper bounds do not account for all distinct values, so adjust the
 	// NumEq values if needed so they add up to less than the row count.
 	remDistinctCount := distinctCountTotal - distinctCountEq
-	if rowCountEq+remDistinctCount >= rowCountTotal {
+	if rowCountEq > 0 && rowCountEq+remDistinctCount > rowCountTotal {
 		targetRowCountEq := rowCountTotal - remDistinctCount
 		adjustmentFactorNumEq := targetRowCountEq / rowCountEq
 		for i := range h.buckets {
@@ -229,10 +240,10 @@ func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctC
 		lowerBound := h.buckets[0].UpperBound
 		upperBound := h.buckets[len(h.buckets)-1].UpperBound
 		if maxDistinct, ok := tree.MaxDistinctCount(evalCtx, lowerBound, upperBound); ok {
-			// Subtract distinctCountEq to account for the upper bounds of the
+			// Subtract number of buckets to account for the upper bounds of the
 			// buckets, along with the current range distinct count which has already
 			// been accounted for.
-			maxDistinctCountRange = float64(maxDistinct) - distinctCountEq - distinctCountRange
+			maxDistinctCountRange = float64(maxDistinct) - float64(len(h.buckets)) - distinctCountRange
 		}
 
 		// Add distinct values into the histogram if there is space. Increment the
@@ -277,7 +288,10 @@ func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctC
 		)
 	}
 
-	// Adjust the values so the row counts and distinct counts add up correctly.
+	// At this point rowCountRange + rowCountEq >= distinctCountTotal but not
+	// necessarily rowCountTotal, so we've accounted for all distinct values, and
+	// any additional rows we add will be duplicate values. We can spread the
+	// final adjustment proportionately across both NumRange and NumEq.
 	adjustmentFactorDistinctRange := float64(1)
 	if distinctCountRange > 0 {
 		adjustmentFactorDistinctRange = (distinctCountTotal - distinctCountEq) / distinctCountRange

diff --git a/pkg/sql/stats/histogram_test.go b/pkg/sql/stats/histogram_test.go
@@ -524,6 +524,42 @@ func TestAdjustCounts(t *testing.T) {
 				{NumRange: 1551.19, NumEq: 3447.09, DistinctRange: 450, UpperBound: f(1000)},
 			},
 		},
+		{ // Zero rowCount and distinctCount.
+			h: []cat.HistogramBucket{
+				{NumRange: 0, NumEq: 1, DistinctRange: 0, UpperBound: f(1)},
+			},
+			rowCount:      0,
+			distinctCount: 0,
+			expected:      []cat.HistogramBucket{},
+		},
+		{ // Empty initial histogram.
+			h:             []cat.HistogramBucket{},
+			rowCount:      1000,
+			distinctCount: 1000,
+			expected:      []cat.HistogramBucket{},
+		},
+		{ // Empty bucket in initial histogram.
+			h: []cat.HistogramBucket{
+				{NumRange: 0, NumEq: 0, DistinctRange: 0, UpperBound: f(1)},
+			},
+			rowCount:      99,
+			distinctCount: 99,
+			expected:      []cat.HistogramBucket{},
+		},
+		{ // All zero NumEq.
+			h: []cat.HistogramBucket{
+				{NumRange: 0, NumEq: 0, DistinctRange: 0, UpperBound: f(1)},
+				{NumRange: 10, NumEq: 0, DistinctRange: 5, UpperBound: f(100)},
+				{NumRange: 10, NumEq: 0, DistinctRange: 10, UpperBound: f(200)},
+			},
+			rowCount:      100,
+			distinctCount: 60,
+			expected: []cat.HistogramBucket{
+				{NumRange: 0, NumEq: 0, DistinctRange: 0, UpperBound: f(1)},
+				{NumRange: 50, NumEq: 0, DistinctRange: 27.5, UpperBound: f(100)},
+				{NumRange: 50, NumEq: 0, DistinctRange: 32.5, UpperBound: f(200)},
+			},
+		},
 	}
 
 	evalCtx := eval.MakeTestingEvalContext(cluster.MakeTestingClusterSettings())