Merge cockroachdb#135310

135310: sql/stats: more fully undo addOuterBuckets in stripOuterBuckets r=yuzefovich,mgartner a=michae2 Near the end of `stats.(*histogram).adjustCounts`, if there is leftover DistinctCount after adjusting each histogram bucket, we call `addOuterBuckets`. This function creates two "outer" buckets that fully cover the lower and upper ends of the domain with the remaining DistinctCount. These "outer" buckets interfere with partial stats, so we remove them with `stripOuterBuckets` when merging partial stats. `stripOuterBuckets` was simply slicing off the two buckets, but this could potentially leave non-zero DistinctCount in the first bucket (which is added by `addOuterBuckets`). `stripOuterBuckets` needs to also zero the DistinctCount and NumRange of the first bucket to fully undo the actions of `addOuterBuckets`. We were only catching this when trying to forecast using the merged histogram, which would sometimes produce a forecasted histogram with non-zero DistinctRange in the first bucket, leading to the infamous "histogram had first bucket with non-zero NumRange or DistinctRange" Sentry failure. Unfortunately, I don't think this bug fully explains all of the Sentry failures that look like cockroachdb#93892 because many of those predate the introduction of partial stats. I believe this specific failure started with v24.3.0-alpha.0. Informs: cockroachdb#93892 Fixes: cockroachdb#134031 (No release note because automatic collection of partial stats is not yet enabled in any available release.) Release note: None Co-authored-by: Michael Erickson <[email protected]>
spilchen · Nov 20, 2024 · 8cdc891 · 8cdc891
2 parents ba472fd + 90e311d
commit 8cdc891
Show file tree

Hide file tree

Showing 3 changed files with 206 additions and 2 deletions.
diff --git a/pkg/sql/logictest/testdata/logic_test/distsql_stats b/pkg/sql/logictest/testdata/logic_test/distsql_stats
@@ -3412,3 +3412,203 @@ pstat_allindex_partial  {a}                       3          3               0
 pstat_allindex_partial  {b}                       3          3               0
 pstat_allindex_partial  {crdb_internal_idx_expr}  3          3               0
 pstat_allindex_partial  {d}                       3          3               0
+
+# Regression test for #134031. Check that we still have a valid, forecast-able
+# histogram after merging partial stats with full stats that have been modified
+# by addOuterBucket.
+
+statement ok
+CREATE TABLE t134031 (
+  a INT PRIMARY KEY
+) WITH (sql_stats_automatic_collection_enabled = false, sql_stats_histogram_buckets_count = 4)
+
+# These stats were created with the following statements:
+#
+#   INSERT INTO t134031 SELECT 2 * i * i FROM generate_series(0, 1000) s(i);
+#   CREATE STATISTICS __auto__ FROM t134031;
+#
+#   INSERT INTO t134031 VALUES (2000002);
+#   SELECT pg_sleep(1);
+#   CREATE STATISTICS __auto__ FROM t134031;
+#
+#   INSERT INTO t134031 VALUES (2000004);
+#   SELECT pg_sleep(1);
+#   CREATE STATISTICS __auto__ FROM t134031;
+#
+#   INSERT INTO t134031 VALUES (3000000);
+#   SELECT pg_sleep(2);
+#   CREATE STATISTICS __auto_partial__ FROM t134031 USING EXTREMES;
+
+statement ok
+ALTER TABLE t134031 INJECT STATISTICS '[
+    {
+        "avg_size": 4,
+        "columns": [
+            "a"
+        ],
+        "created_at": "2024-11-15 17:38:35.191236",
+        "distinct_count": 1001,
+        "histo_buckets": [
+            {
+                "distinct_range": 0,
+                "num_eq": 1,
+                "num_range": 0,
+                "upper_bound": "0"
+            },
+            {
+                "distinct_range": 331.79445036468786,
+                "num_eq": 1,
+                "num_range": 332,
+                "upper_bound": "221778"
+            },
+            {
+                "distinct_range": 332.043798577731,
+                "num_eq": 1,
+                "num_range": 332,
+                "upper_bound": "887112"
+            },
+            {
+                "distinct_range": 333.16175105758106,
+                "num_eq": 1,
+                "num_range": 333,
+                "upper_bound": "2000000"
+            }
+        ],
+        "histo_col_type": "INT8",
+        "histo_version": 3,
+        "id": 1021122691703046145,
+        "name": "__auto__",
+        "null_count": 0,
+        "row_count": 1001
+    },
+    {
+        "avg_size": 4,
+        "columns": [
+            "a"
+        ],
+        "created_at": "2024-11-15 17:38:36.240863",
+        "distinct_count": 1002,
+        "histo_buckets": [
+            {
+                "distinct_range": 0,
+                "num_eq": 1,
+                "num_range": 0,
+                "upper_bound": "0"
+            },
+            {
+                "distinct_range": 331.7944820909316,
+                "num_eq": 1,
+                "num_range": 332,
+                "upper_bound": "221778"
+            },
+            {
+                "distinct_range": 333.0442332658522,
+                "num_eq": 1,
+                "num_range": 333,
+                "upper_bound": "889778"
+            },
+            {
+                "distinct_range": 333.16128464321633,
+                "num_eq": 1,
+                "num_range": 333,
+                "upper_bound": "2000002"
+            }
+        ],
+        "histo_col_type": "INT8",
+        "histo_version": 3,
+        "id": 1021122695141097473,
+        "name": "__auto__",
+        "null_count": 0,
+        "row_count": 1002
+    },
+    {
+        "avg_size": 4,
+        "columns": [
+            "a"
+        ],
+        "created_at": "2024-11-15 17:38:37.296779",
+        "distinct_count": 1003,
+        "histo_buckets": [
+            {
+                "distinct_range": 0,
+                "num_eq": 0,
+                "num_range": 0,
+                "upper_bound": "-9223372036854775808"
+            },
+            {
+                "distinct_range": 5.684341886080802E-14,
+                "num_eq": 1,
+                "num_range": 0,
+                "upper_bound": "0"
+            },
+            {
+                "distinct_range": 332.7947242432267,
+                "num_eq": 1,
+                "num_range": 333,
+                "upper_bound": "223112"
+            },
+            {
+                "distinct_range": 333.0446396262168,
+                "num_eq": 1,
+                "num_range": 333,
+                "upper_bound": "892448"
+            },
+            {
+                "distinct_range": 333.16063613055655,
+                "num_eq": 1,
+                "num_range": 333,
+                "upper_bound": "2000004"
+            },
+            {
+                "distinct_range": 5.684341886080185E-14,
+                "num_eq": 0,
+                "num_range": 0,
+                "upper_bound": "9223372036854775807"
+            }
+        ],
+        "histo_col_type": "INT8",
+        "histo_version": 3,
+        "id": 1021122698602217473,
+        "name": "__auto__",
+        "null_count": 0,
+        "row_count": 1003
+    },
+    {
+        "avg_size": 4,
+        "columns": [
+            "a"
+        ],
+        "created_at": "2024-11-15 17:38:39.337031",
+        "distinct_count": 1,
+        "full_statistic_id": 1021122698602217473,
+        "histo_buckets": [
+            {
+                "distinct_range": 0,
+                "num_eq": 1,
+                "num_range": 0,
+                "upper_bound": "3000000"
+            }
+        ],
+        "histo_col_type": "INT8",
+        "histo_version": 3,
+        "id": 1021122705285644289,
+        "name": "__auto_partial__",
+        "null_count": 0,
+        "partial_predicate": "(a IS NULL) OR ((a \u003c 0:::INT8) OR (a \u003e 2000004:::INT8))",
+        "row_count": 1
+    }
+]'
+
+# Make sure the first bucket of both merged and forecast histograms has 0 for
+# distinct_range and num_range.
+query TT
+SELECT stat->>'name', stat->'histo_buckets'->0
+FROM (
+  SELECT jsonb_array_elements(statistics) AS stat
+  FROM [SHOW STATISTICS USING JSON FOR TABLE t134031 WITH MERGE, FORECAST]
+)
+WHERE stat->'name' <@ '["__merged__", "__forecast__"]'
+ORDER BY stat->>'name' DESC
+----
+__merged__    {"distinct_range": 0, "num_eq": 1, "num_range": 0, "upper_bound": "0"}
+__forecast__  {"distinct_range": 0, "num_eq": 1, "num_range": 0, "upper_bound": "0"}
diff --git a/pkg/sql/opt/exec/execbuilder/testdata/partial_stats b/pkg/sql/opt/exec/execbuilder/testdata/partial_stats
@@ -1309,7 +1309,7 @@ LIMIT 3
     "upper_bound": "9999"
 }
 {
-    "distinct_range": 49.59173044047086,
+    "distinct_range": 49.591730440470876,
     "num_eq": 1,
     "num_range": 50,
     "upper_bound": "9998"

diff --git a/pkg/sql/stats/merge.go b/pkg/sql/stats/merge.go
@@ -79,6 +79,10 @@ func stripOuterBuckets(
 	endIdx := len(histogram)
 	if histogram[0].UpperBound.IsMin(ctx, evalCtx) && histogram[0].NumEq == 0 {
 		startIdx = 1
+		// Set the first range counts to zero to counteract range counts added by
+		// addOuterBuckets.
+		histogram[startIdx].NumRange = 0
+		histogram[startIdx].DistinctRange = 0
 	}
 	if histogram[len(histogram)-1].UpperBound.IsMax(ctx, evalCtx) && histogram[len(histogram)-1].NumEq == 0 {
 		endIdx = len(histogram) - 1
@@ -102,7 +106,7 @@ func stripOuterBuckets(
 // Histogram (format is: {NumEq, NumRange, DistinctRange, UpperBound}):
 // [{1, 0, 0, 2}, {1, 0, 0, 3}, {1, 0, 0, 4}]
 //
-// Partial Statistic: {row, 8, dist: 4, null: 0, size: 1}
+// Partial Statistic: {row: 8, dist: 4, null: 0, size: 1}
 // CreatedAt: 2022-01-03
 // Histogram: [{2, 0, 0, 0}, {2, 0, 0, 1}, {2, 0, 0, 5}, {2, 0, 0, 6}]
 //