diff --git a/pkg/sql/stats/histogram.go b/pkg/sql/stats/histogram.go index e5156c00f7ca..a920218eb25e 100644 --- a/pkg/sql/stats/histogram.go +++ b/pkg/sql/stats/histogram.go @@ -170,8 +170,16 @@ type histogram struct { } // adjustCounts adjusts the row count and number of distinct values per bucket -// based on the total row count and estimated distinct count. +// to equal the total row count and estimated distinct count. The total row +// count and estimated distinct count should not include NULL values, and the +// histogram should not contain any buckets for NULL values. func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctCountTotal float64) { + // Empty table cases. + if rowCountTotal <= 0 || distinctCountTotal <= 0 { + h.buckets = make([]cat.HistogramBucket, 0) + return + } + // Calculate the current state of the histogram so we can adjust it as needed. // The number of rows and distinct values represented by the histogram should // be adjusted so they equal rowCountTotal and distinctCountTotal. @@ -189,13 +197,16 @@ func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctC } } - if rowCountEq <= 0 { - panic(errors.AssertionFailedf("expected a positive value for rowCountEq")) + // If the histogram only had empty buckets, we can't adjust it. + if rowCountRange+rowCountEq <= 0 || distinctCountRange+distinctCountEq <= 0 { + h.buckets = make([]cat.HistogramBucket, 0) + return } // If the upper bounds account for all distinct values (as estimated by the // sketch), make the histogram consistent by clearing the ranges and adjusting - // the NumEq values to add up to the row count. + // the NumEq values to add up to the row count. This might be the case for + // low-cardinality types like BOOL and ENUM or other low-cardinality data. if distinctCountEq >= distinctCountTotal { adjustmentFactorNumEq := rowCountTotal / rowCountEq for i := range h.buckets { @@ -209,7 +220,7 @@ func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctC // The upper bounds do not account for all distinct values, so adjust the // NumEq values if needed so they add up to less than the row count. remDistinctCount := distinctCountTotal - distinctCountEq - if rowCountEq+remDistinctCount >= rowCountTotal { + if rowCountEq > 0 && rowCountEq+remDistinctCount > rowCountTotal { targetRowCountEq := rowCountTotal - remDistinctCount adjustmentFactorNumEq := targetRowCountEq / rowCountEq for i := range h.buckets { @@ -229,10 +240,10 @@ func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctC lowerBound := h.buckets[0].UpperBound upperBound := h.buckets[len(h.buckets)-1].UpperBound if maxDistinct, ok := tree.MaxDistinctCount(evalCtx, lowerBound, upperBound); ok { - // Subtract distinctCountEq to account for the upper bounds of the + // Subtract number of buckets to account for the upper bounds of the // buckets, along with the current range distinct count which has already // been accounted for. - maxDistinctCountRange = float64(maxDistinct) - distinctCountEq - distinctCountRange + maxDistinctCountRange = float64(maxDistinct) - float64(len(h.buckets)) - distinctCountRange } // Add distinct values into the histogram if there is space. Increment the @@ -277,7 +288,10 @@ func (h *histogram) adjustCounts(evalCtx *eval.Context, rowCountTotal, distinctC ) } - // Adjust the values so the row counts and distinct counts add up correctly. + // At this point rowCountRange + rowCountEq >= distinctCountTotal but not + // necessarily rowCountTotal, so we've accounted for all distinct values, and + // any additional rows we add will be duplicate values. We can spread the + // final adjustment proportionately across both NumRange and NumEq. adjustmentFactorDistinctRange := float64(1) if distinctCountRange > 0 { adjustmentFactorDistinctRange = (distinctCountTotal - distinctCountEq) / distinctCountRange diff --git a/pkg/sql/stats/histogram_test.go b/pkg/sql/stats/histogram_test.go index c40ec5f8551b..54a74b3eeabe 100644 --- a/pkg/sql/stats/histogram_test.go +++ b/pkg/sql/stats/histogram_test.go @@ -524,6 +524,50 @@ func TestAdjustCounts(t *testing.T) { {NumRange: 1551.19, NumEq: 3447.09, DistinctRange: 450, UpperBound: f(1000)}, }, }, + { // Zero rowCount and distinctCount. + h: []cat.HistogramBucket{ + {NumRange: 0, NumEq: 1, DistinctRange: 0, UpperBound: f(1)}, + }, + rowCount: 0, + distinctCount: 0, + expected: []cat.HistogramBucket{}, + }, + { // Negative rowCount and distinctCount. + h: []cat.HistogramBucket{ + {NumRange: 0, NumEq: 1, DistinctRange: 0, UpperBound: f(1)}, + }, + rowCount: -100, + distinctCount: -90, + expected: []cat.HistogramBucket{}, + }, + { // Empty initial histogram. + h: []cat.HistogramBucket{}, + rowCount: 1000, + distinctCount: 1000, + expected: []cat.HistogramBucket{}, + }, + { // Empty bucket in initial histogram. + h: []cat.HistogramBucket{ + {NumRange: 0, NumEq: 0, DistinctRange: 0, UpperBound: f(1)}, + }, + rowCount: 99, + distinctCount: 99, + expected: []cat.HistogramBucket{}, + }, + { // All zero NumEq. + h: []cat.HistogramBucket{ + {NumRange: 0, NumEq: 0, DistinctRange: 0, UpperBound: f(1)}, + {NumRange: 10, NumEq: 0, DistinctRange: 5, UpperBound: f(100)}, + {NumRange: 10, NumEq: 0, DistinctRange: 10, UpperBound: f(200)}, + }, + rowCount: 100, + distinctCount: 60, + expected: []cat.HistogramBucket{ + {NumRange: 0, NumEq: 0, DistinctRange: 0, UpperBound: f(1)}, + {NumRange: 50, NumEq: 0, DistinctRange: 27.5, UpperBound: f(100)}, + {NumRange: 50, NumEq: 0, DistinctRange: 32.5, UpperBound: f(200)}, + }, + }, } evalCtx := eval.MakeTestingEvalContext(cluster.MakeTestingClusterSettings())