From 1e5909bd8a3da288c8845837bbfaebaf471bf165 Mon Sep 17 00:00:00 2001 From: Michael Erickson Date: Fri, 15 Nov 2024 10:29:45 -0800 Subject: [PATCH] sql/stats: more fully undo addOuterBuckets in stripOuterBuckets Near the end of `stats.(*histogram).adjustCounts`, if there is leftover DistinctCount after adjusting each histogram bucket, we call `addOuterBuckets`. This function creates two "outer" buckets that fully cover the lower and upper ends of the domain with the remaining DistinctCount. These "outer" buckets interfere with partial stats, so we remove them with `stripOuterBuckets` when merging partial stats. `stripOuterBuckets` was simply slicing off the two buckets, but this could potentially leave non-zero DistinctCount in the first bucket (which is added by `addOuterBuckets`). `stripOuterBuckets` needs to also zero the DistinctCount and NumRange of the first bucket to fully undo the actions of `addOuterBuckets`. We were only catching this when trying to forecast using the merged histogram, which would sometimes produce a forecasted histogram with non-zero DistinctRange in the first bucket, leading to the infamous "histogram had first bucket with non-zero NumRange or DistinctRange" Sentry failure. Unfortunately, I don't think this bug fully explains all of the Sentry failures that look like #93892 because many of those predate the introduction of partial stats. I believe this specific failure started with v24.3.0-alpha.0. Informs: #93892 Fixes: #134031 (No release note because automatic collection of partial stats is not yet enabled in any available release.) Release note: None --- .../testdata/logic_test/distsql_stats | 200 ++++++++++++++++++ pkg/sql/stats/merge.go | 6 +- 2 files changed, 205 insertions(+), 1 deletion(-) diff --git a/pkg/sql/logictest/testdata/logic_test/distsql_stats b/pkg/sql/logictest/testdata/logic_test/distsql_stats index de8c652acac9..fa34f6abaa89 100644 --- a/pkg/sql/logictest/testdata/logic_test/distsql_stats +++ b/pkg/sql/logictest/testdata/logic_test/distsql_stats @@ -3412,3 +3412,203 @@ pstat_allindex_partial {a} 3 3 0 pstat_allindex_partial {b} 3 3 0 pstat_allindex_partial {crdb_internal_idx_expr} 3 3 0 pstat_allindex_partial {d} 3 3 0 + +# Regression test for #134031. Check that we still have a valid, forecast-able +# histogram after merging partial stats with full stats that have been modified +# by addOuterBucket. + +statement ok +CREATE TABLE t134031 ( + a INT PRIMARY KEY +) WITH (sql_stats_automatic_collection_enabled = false, sql_stats_histogram_buckets_count = 4) + +# These stats were created with the following statements: +# +# INSERT INTO t134031 SELECT 2 * i * i FROM generate_series(0, 1000) s(i); +# CREATE STATISTICS __auto__ FROM t134031; +# +# INSERT INTO t134031 VALUES (2000002); +# SELECT pg_sleep(1); +# CREATE STATISTICS __auto__ FROM t134031; +# +# INSERT INTO t134031 VALUES (2000004); +# SELECT pg_sleep(1); +# CREATE STATISTICS __auto__ FROM t134031; +# +# INSERT INTO t134031 VALUES (3000000); +# SELECT pg_sleep(2); +# CREATE STATISTICS __auto_partial__ FROM t134031 USING EXTREMES; + +statement ok +ALTER TABLE t134031 INJECT STATISTICS '[ + { + "avg_size": 4, + "columns": [ + "a" + ], + "created_at": "2024-11-15 17:38:35.191236", + "distinct_count": 1001, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "0" + }, + { + "distinct_range": 331.79445036468786, + "num_eq": 1, + "num_range": 332, + "upper_bound": "221778" + }, + { + "distinct_range": 332.043798577731, + "num_eq": 1, + "num_range": 332, + "upper_bound": "887112" + }, + { + "distinct_range": 333.16175105758106, + "num_eq": 1, + "num_range": 333, + "upper_bound": "2000000" + } + ], + "histo_col_type": "INT8", + "histo_version": 3, + "id": 1021122691703046145, + "name": "__auto__", + "null_count": 0, + "row_count": 1001 + }, + { + "avg_size": 4, + "columns": [ + "a" + ], + "created_at": "2024-11-15 17:38:36.240863", + "distinct_count": 1002, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "0" + }, + { + "distinct_range": 331.7944820909316, + "num_eq": 1, + "num_range": 332, + "upper_bound": "221778" + }, + { + "distinct_range": 333.0442332658522, + "num_eq": 1, + "num_range": 333, + "upper_bound": "889778" + }, + { + "distinct_range": 333.16128464321633, + "num_eq": 1, + "num_range": 333, + "upper_bound": "2000002" + } + ], + "histo_col_type": "INT8", + "histo_version": 3, + "id": 1021122695141097473, + "name": "__auto__", + "null_count": 0, + "row_count": 1002 + }, + { + "avg_size": 4, + "columns": [ + "a" + ], + "created_at": "2024-11-15 17:38:37.296779", + "distinct_count": 1003, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 0, + "num_range": 0, + "upper_bound": "-9223372036854775808" + }, + { + "distinct_range": 5.684341886080802E-14, + "num_eq": 1, + "num_range": 0, + "upper_bound": "0" + }, + { + "distinct_range": 332.7947242432267, + "num_eq": 1, + "num_range": 333, + "upper_bound": "223112" + }, + { + "distinct_range": 333.0446396262168, + "num_eq": 1, + "num_range": 333, + "upper_bound": "892448" + }, + { + "distinct_range": 333.16063613055655, + "num_eq": 1, + "num_range": 333, + "upper_bound": "2000004" + }, + { + "distinct_range": 5.684341886080185E-14, + "num_eq": 0, + "num_range": 0, + "upper_bound": "9223372036854775807" + } + ], + "histo_col_type": "INT8", + "histo_version": 3, + "id": 1021122698602217473, + "name": "__auto__", + "null_count": 0, + "row_count": 1003 + }, + { + "avg_size": 4, + "columns": [ + "a" + ], + "created_at": "2024-11-15 17:38:39.337031", + "distinct_count": 1, + "full_statistic_id": 1021122698602217473, + "histo_buckets": [ + { + "distinct_range": 0, + "num_eq": 1, + "num_range": 0, + "upper_bound": "3000000" + } + ], + "histo_col_type": "INT8", + "histo_version": 3, + "id": 1021122705285644289, + "name": "__auto_partial__", + "null_count": 0, + "partial_predicate": "(a IS NULL) OR ((a \u003c 0:::INT8) OR (a \u003e 2000004:::INT8))", + "row_count": 1 + } +]' + +# Make sure the first bucket of both merged and forecast histograms has 0 for +# distinct_range and num_range. +query TT +SELECT stat->>'name', stat->'histo_buckets'->0 +FROM ( + SELECT jsonb_array_elements(statistics) AS stat + FROM [SHOW STATISTICS USING JSON FOR TABLE t134031 WITH MERGE, FORECAST] +) +WHERE stat->'name' <@ '["__merged__", "__forecast__"]' +ORDER BY stat->>'name' DESC +---- +__merged__ {"distinct_range": 0, "num_eq": 1, "num_range": 0, "upper_bound": "0"} +__forecast__ {"distinct_range": 0, "num_eq": 1, "num_range": 0, "upper_bound": "0"} diff --git a/pkg/sql/stats/merge.go b/pkg/sql/stats/merge.go index 8806ab355c9b..db6521be13ee 100644 --- a/pkg/sql/stats/merge.go +++ b/pkg/sql/stats/merge.go @@ -79,6 +79,10 @@ func stripOuterBuckets( endIdx := len(histogram) if histogram[0].UpperBound.IsMin(ctx, evalCtx) && histogram[0].NumEq == 0 { startIdx = 1 + // Set the first range counts to zero to counteract range counts added by + // addOuterBuckets. + histogram[startIdx].NumRange = 0 + histogram[startIdx].DistinctRange = 0 } if histogram[len(histogram)-1].UpperBound.IsMax(ctx, evalCtx) && histogram[len(histogram)-1].NumEq == 0 { endIdx = len(histogram) - 1 @@ -102,7 +106,7 @@ func stripOuterBuckets( // Histogram (format is: {NumEq, NumRange, DistinctRange, UpperBound}): // [{1, 0, 0, 2}, {1, 0, 0, 3}, {1, 0, 0, 4}] // -// Partial Statistic: {row, 8, dist: 4, null: 0, size: 1} +// Partial Statistic: {row: 8, dist: 4, null: 0, size: 1} // CreatedAt: 2022-01-03 // Histogram: [{2, 0, 0, 0}, {2, 0, 0, 1}, {2, 0, 0, 5}, {2, 0, 0, 6}] //