Skip to content

Commit

Permalink
sql/stats: more fully undo addOuterBuckets in stripOuterBuckets
Browse files Browse the repository at this point in the history
Near the end of `stats.(*histogram).adjustCounts`, if there is leftover
DistinctCount after adjusting each histogram bucket, we call
`addOuterBuckets`. This function creates two "outer" buckets that fully
cover the lower and upper ends of the domain with the remaining
DistinctCount.

These "outer" buckets interfere with partial stats, so we remove them
with `stripOuterBuckets` when merging partial stats. `stripOuterBuckets`
was simply slicing off the two buckets, but this could potentially leave
non-zero DistinctCount in the first bucket (which is added by
`addOuterBuckets`). `stripOuterBuckets` needs to also zero the
DistinctCount and NumRange of the first bucket to fully undo the actions
of `addOuterBuckets`.

We were only catching this when trying to forecast using the merged
histogram, which would sometimes produce a forecasted histogram with
non-zero DistinctRange in the first bucket, leading to the infamous
"histogram had first bucket with non-zero NumRange or DistinctRange"
Sentry failure.

Unfortunately, I don't think this bug fully explains all of the Sentry
failures that look like cockroachdb#93892 because many of those predate the
introduction of partial stats. I believe this specific failure started
with v24.3.0-alpha.0.

Informs: cockroachdb#93892
Fixes: cockroachdb#134031

(No release note because automatic collection of partial stats is not yet
enabled in any available release.)

Release note: None
  • Loading branch information
michae2 authored and sambhav-jain-16 committed Nov 20, 2024
1 parent b1f8ff5 commit 6435cc1
Show file tree
Hide file tree
Showing 3 changed files with 206 additions and 2 deletions.
200 changes: 200 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/distsql_stats
Original file line number Diff line number Diff line change
Expand Up @@ -3412,3 +3412,203 @@ pstat_allindex_partial {a} 3 3 0
pstat_allindex_partial {b} 3 3 0
pstat_allindex_partial {crdb_internal_idx_expr} 3 3 0
pstat_allindex_partial {d} 3 3 0

# Regression test for #134031. Check that we still have a valid, forecast-able
# histogram after merging partial stats with full stats that have been modified
# by addOuterBucket.

statement ok
CREATE TABLE t134031 (
a INT PRIMARY KEY
) WITH (sql_stats_automatic_collection_enabled = false, sql_stats_histogram_buckets_count = 4)

# These stats were created with the following statements:
#
# INSERT INTO t134031 SELECT 2 * i * i FROM generate_series(0, 1000) s(i);
# CREATE STATISTICS __auto__ FROM t134031;
#
# INSERT INTO t134031 VALUES (2000002);
# SELECT pg_sleep(1);
# CREATE STATISTICS __auto__ FROM t134031;
#
# INSERT INTO t134031 VALUES (2000004);
# SELECT pg_sleep(1);
# CREATE STATISTICS __auto__ FROM t134031;
#
# INSERT INTO t134031 VALUES (3000000);
# SELECT pg_sleep(2);
# CREATE STATISTICS __auto_partial__ FROM t134031 USING EXTREMES;

statement ok
ALTER TABLE t134031 INJECT STATISTICS '[
{
"avg_size": 4,
"columns": [
"a"
],
"created_at": "2024-11-15 17:38:35.191236",
"distinct_count": 1001,
"histo_buckets": [
{
"distinct_range": 0,
"num_eq": 1,
"num_range": 0,
"upper_bound": "0"
},
{
"distinct_range": 331.79445036468786,
"num_eq": 1,
"num_range": 332,
"upper_bound": "221778"
},
{
"distinct_range": 332.043798577731,
"num_eq": 1,
"num_range": 332,
"upper_bound": "887112"
},
{
"distinct_range": 333.16175105758106,
"num_eq": 1,
"num_range": 333,
"upper_bound": "2000000"
}
],
"histo_col_type": "INT8",
"histo_version": 3,
"id": 1021122691703046145,
"name": "__auto__",
"null_count": 0,
"row_count": 1001
},
{
"avg_size": 4,
"columns": [
"a"
],
"created_at": "2024-11-15 17:38:36.240863",
"distinct_count": 1002,
"histo_buckets": [
{
"distinct_range": 0,
"num_eq": 1,
"num_range": 0,
"upper_bound": "0"
},
{
"distinct_range": 331.7944820909316,
"num_eq": 1,
"num_range": 332,
"upper_bound": "221778"
},
{
"distinct_range": 333.0442332658522,
"num_eq": 1,
"num_range": 333,
"upper_bound": "889778"
},
{
"distinct_range": 333.16128464321633,
"num_eq": 1,
"num_range": 333,
"upper_bound": "2000002"
}
],
"histo_col_type": "INT8",
"histo_version": 3,
"id": 1021122695141097473,
"name": "__auto__",
"null_count": 0,
"row_count": 1002
},
{
"avg_size": 4,
"columns": [
"a"
],
"created_at": "2024-11-15 17:38:37.296779",
"distinct_count": 1003,
"histo_buckets": [
{
"distinct_range": 0,
"num_eq": 0,
"num_range": 0,
"upper_bound": "-9223372036854775808"
},
{
"distinct_range": 5.684341886080802E-14,
"num_eq": 1,
"num_range": 0,
"upper_bound": "0"
},
{
"distinct_range": 332.7947242432267,
"num_eq": 1,
"num_range": 333,
"upper_bound": "223112"
},
{
"distinct_range": 333.0446396262168,
"num_eq": 1,
"num_range": 333,
"upper_bound": "892448"
},
{
"distinct_range": 333.16063613055655,
"num_eq": 1,
"num_range": 333,
"upper_bound": "2000004"
},
{
"distinct_range": 5.684341886080185E-14,
"num_eq": 0,
"num_range": 0,
"upper_bound": "9223372036854775807"
}
],
"histo_col_type": "INT8",
"histo_version": 3,
"id": 1021122698602217473,
"name": "__auto__",
"null_count": 0,
"row_count": 1003
},
{
"avg_size": 4,
"columns": [
"a"
],
"created_at": "2024-11-15 17:38:39.337031",
"distinct_count": 1,
"full_statistic_id": 1021122698602217473,
"histo_buckets": [
{
"distinct_range": 0,
"num_eq": 1,
"num_range": 0,
"upper_bound": "3000000"
}
],
"histo_col_type": "INT8",
"histo_version": 3,
"id": 1021122705285644289,
"name": "__auto_partial__",
"null_count": 0,
"partial_predicate": "(a IS NULL) OR ((a \u003c 0:::INT8) OR (a \u003e 2000004:::INT8))",
"row_count": 1
}
]'

# Make sure the first bucket of both merged and forecast histograms has 0 for
# distinct_range and num_range.
query TT
SELECT stat->>'name', stat->'histo_buckets'->0
FROM (
SELECT jsonb_array_elements(statistics) AS stat
FROM [SHOW STATISTICS USING JSON FOR TABLE t134031 WITH MERGE, FORECAST]
)
WHERE stat->'name' <@ '["__merged__", "__forecast__"]'
ORDER BY stat->>'name' DESC
----
__merged__ {"distinct_range": 0, "num_eq": 1, "num_range": 0, "upper_bound": "0"}
__forecast__ {"distinct_range": 0, "num_eq": 1, "num_range": 0, "upper_bound": "0"}
2 changes: 1 addition & 1 deletion pkg/sql/opt/exec/execbuilder/testdata/partial_stats
Original file line number Diff line number Diff line change
Expand Up @@ -1309,7 +1309,7 @@ LIMIT 3
"upper_bound": "9999"
}
{
"distinct_range": 49.59173044047086,
"distinct_range": 49.591730440470876,
"num_eq": 1,
"num_range": 50,
"upper_bound": "9998"
Expand Down
6 changes: 5 additions & 1 deletion pkg/sql/stats/merge.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ func stripOuterBuckets(
endIdx := len(histogram)
if histogram[0].UpperBound.IsMin(ctx, evalCtx) && histogram[0].NumEq == 0 {
startIdx = 1
// Set the first range counts to zero to counteract range counts added by
// addOuterBuckets.
histogram[startIdx].NumRange = 0
histogram[startIdx].DistinctRange = 0
}
if histogram[len(histogram)-1].UpperBound.IsMax(ctx, evalCtx) && histogram[len(histogram)-1].NumEq == 0 {
endIdx = len(histogram) - 1
Expand All @@ -102,7 +106,7 @@ func stripOuterBuckets(
// Histogram (format is: {NumEq, NumRange, DistinctRange, UpperBound}):
// [{1, 0, 0, 2}, {1, 0, 0, 3}, {1, 0, 0, 4}]
//
// Partial Statistic: {row, 8, dist: 4, null: 0, size: 1}
// Partial Statistic: {row: 8, dist: 4, null: 0, size: 1}
// CreatedAt: 2022-01-03
// Histogram: [{2, 0, 0, 0}, {2, 0, 0, 1}, {2, 0, 0, 5}, {2, 0, 0, 6}]
//
Expand Down

0 comments on commit 6435cc1

Please sign in to comment.