sql/stats: store non-NULL histograms for empty tables

We have been storing NULL histograms / using nil HistogramData for all of the following cases:

1. regular stats, GenerateHistogram=false
2. regular stats, GenerateHistogram=true, empty table
3. regular stats, GenerateHistogram=true, all NULL values
4. inverted stats, no inverted index
5. inverted stats, with inverted index, empty table
6. inverted stats, with inverted index, all NULL values

When predicting histograms for statistics forecasts, we need to distinguish between case 1 and cases 2 and 3. In case 1 we cannot predict histograms, but in cases 2 and 3 we can (and the emptiness of the histogram is important). So, for cases 2 and 3 we now store an empty histogram instead of NULL, and correspondingly use an initialized HistogramData with 0-length Buckets instead of nil HistogramData. This also helps with testing statistics forecasts.

(The inability to distinguish cases 4-6 doesn't matter, because we cannot predict histograms for inverted indexes anyway. I tried to change cases 5 and 6 to be non-NULL for consistency but ran into some problems, so I'll leave them as they are.)

Release note: None
cockroachdb · May 24, 2022 · 37a2e9a · 37a2e9a
1 parent e2c163b
commit 37a2e9a
Show file tree

Hide file tree

Showing 5 changed files with 209 additions and 3 deletions.
diff --git a/pkg/sql/logictest/testdata/logic_test/distsql_stats b/pkg/sql/logictest/testdata/logic_test/distsql_stats
@@ -1219,6 +1219,7 @@ FROM [SHOW STATISTICS USING JSON FOR TABLE all_null]
         ],
         "distinct_count": 1,
         "histo_col_type": "INT8",
+        "histo_version": 1,
         "name": "s",
         "null_count": 1,
         "row_count": 1
@@ -1375,3 +1376,200 @@ ANALYZE system.jobs
 # Collecting stats on system.scheduled_jobs is disallowed.
 statement error pq: cannot create statistics on system.scheduled_jobs
 ANALYZE system.scheduled_jobs
+
+# Collecting stats on empty tables should result in empty (but not NULL)
+# histograms.
+statement ok
+CREATE TABLE tabula (r INT, a INT, sa INT, PRIMARY KEY (r), INDEX (a, sa))
+
+statement ok
+CREATE STATISTICS aristotle FROM tabula
+
+query TTIB colnames
+SELECT statistics_name, column_names, row_count, histogram_id IS NOT NULL AS has_histogram
+FROM [SHOW STATISTICS FOR TABLE tabula]
+ORDER BY statistics_name, column_names::STRING
+----
+statistics_name  column_names  row_count  has_histogram
+aristotle        {a,sa}        0          false
+aristotle        {a}           0          true
+aristotle        {r}           0          true
+aristotle        {sa}          0          true
+
+let $hist_id_1
+SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE tabula]
+WHERE statistics_name = 'aristotle' AND column_names = '{a}'
+
+# This histogram should be empty.
+query TIRI colnames
+SHOW HISTOGRAM $hist_id_1
+----
+upper_bound  range_rows  distinct_range_rows  equal_rows
+
+query T
+SELECT jsonb_pretty(COALESCE(json_agg(stat), '[]'))
+  FROM (SELECT json_array_elements(statistics) - 'created_at' AS stat
+        FROM [SHOW STATISTICS USING JSON FOR TABLE tabula])
+----
+[
+    {
+        "avg_size": 0,
+        "columns": [
+            "r"
+        ],
+        "distinct_count": 0,
+        "histo_col_type": "INT8",
+        "histo_version": 1,
+        "name": "aristotle",
+        "null_count": 0,
+        "row_count": 0
+    },
+    {
+        "avg_size": 0,
+        "columns": [
+            "a"
+        ],
+        "distinct_count": 0,
+        "histo_col_type": "INT8",
+        "histo_version": 1,
+        "name": "aristotle",
+        "null_count": 0,
+        "row_count": 0
+    },
+    {
+        "avg_size": 0,
+        "columns": [
+            "sa"
+        ],
+        "distinct_count": 0,
+        "histo_col_type": "INT8",
+        "histo_version": 1,
+        "name": "aristotle",
+        "null_count": 0,
+        "row_count": 0
+    },
+    {
+        "avg_size": 0,
+        "columns": [
+            "a",
+            "sa"
+        ],
+        "distinct_count": 0,
+        "histo_col_type": "",
+        "name": "aristotle",
+        "null_count": 0,
+        "row_count": 0
+    }
+]
+
+# Collecting stats on columns with all NULL values should also result in empty
+# (but not NULL) histograms.
+statement ok
+INSERT INTO tabula VALUES (11, 12, NULL)
+
+statement ok
+CREATE STATISTICS locke FROM tabula
+
+query TTIIB colnames
+SELECT statistics_name, column_names, row_count, null_count, histogram_id IS NOT NULL AS has_histogram
+FROM [SHOW STATISTICS FOR TABLE tabula]
+ORDER BY statistics_name, column_names::STRING
+----
+statistics_name  column_names  row_count  null_count  has_histogram
+locke            {a,sa}        1          0           false
+locke            {a}           1          0           true
+locke            {r}           1          0           true
+locke            {sa}          1          1           true
+
+let $hist_id_1
+SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE tabula]
+WHERE statistics_name = 'locke' AND column_names = '{a}'
+
+# This histogram should *not* be empty.
+query TIRI colnames
+SHOW HISTOGRAM $hist_id_1
+----
+upper_bound  range_rows  distinct_range_rows  equal_rows
+12           0           0                    1
+
+let $hist_id_1
+SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE tabula]
+WHERE statistics_name = 'locke' AND column_names = '{sa}'
+
+# This histogram *should* be empty.
+query TIRI colnames
+SHOW HISTOGRAM $hist_id_1
+----
+upper_bound  range_rows  distinct_range_rows  equal_rows
+
+query T
+SELECT jsonb_pretty(COALESCE(json_agg(stat), '[]'))
+  FROM (SELECT json_array_elements(statistics) - 'created_at' AS stat
+        FROM [SHOW STATISTICS USING JSON FOR TABLE tabula])
+----
+[
+    {
+        "avg_size": 1,
+        "columns": [
+            "r"
+        ],
+        "distinct_count": 1,
+        "histo_buckets": [
+            {
+                "distinct_range": 0,
+                "num_eq": 1,
+                "num_range": 0,
+                "upper_bound": "11"
+            }
+        ],
+        "histo_col_type": "INT8",
+        "histo_version": 1,
+        "name": "locke",
+        "null_count": 0,
+        "row_count": 1
+    },
+    {
+        "avg_size": 2,
+        "columns": [
+            "a"
+        ],
+        "distinct_count": 1,
+        "histo_buckets": [
+            {
+                "distinct_range": 0,
+                "num_eq": 1,
+                "num_range": 0,
+                "upper_bound": "12"
+            }
+        ],
+        "histo_col_type": "INT8",
+        "histo_version": 1,
+        "name": "locke",
+        "null_count": 0,
+        "row_count": 1
+    },
+    {
+        "avg_size": 0,
+        "columns": [
+            "sa"
+        ],
+        "distinct_count": 1,
+        "histo_col_type": "INT8",
+        "histo_version": 1,
+        "name": "locke",
+        "null_count": 1,
+        "row_count": 1
+    },
+    {
+        "avg_size": 2,
+        "columns": [
+            "a",
+            "sa"
+        ],
+        "distinct_count": 1,
+        "histo_col_type": "",
+        "name": "locke",
+        "null_count": 0,
+        "row_count": 1
+    }
+]
diff --git a/pkg/sql/rowexec/sample_aggregator.go b/pkg/sql/rowexec/sample_aggregator.go
@@ -436,7 +436,7 @@ func (s *sampleAggregator) writeResults(ctx context.Context) error {
 	if err := s.FlowCtx.Cfg.DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
 		for _, si := range s.sketches {
 			var histogram *stats.HistogramData
-			if si.spec.GenerateHistogram && len(s.sr.Get()) != 0 {
+			if si.spec.GenerateHistogram {
 				colIdx := int(si.spec.Columns[0])
 				typ := s.inTypes[colIdx]
 

diff --git a/pkg/sql/stats/histogram.go b/pkg/sql/stats/histogram.go
@@ -87,7 +87,9 @@ func EquiDepthHistogram(
 ) (HistogramData, []cat.HistogramBucket, error) {
 	numSamples := len(samples)
 	if numSamples == 0 {
-		return HistogramData{ColumnType: colType}, nil, nil
+		return HistogramData{
+			ColumnType: colType, Buckets: make([]HistogramData_Bucket, 0), Version: histVersion,
+		}, nil, nil
 	}
 	if maxBuckets < 2 {
 		return HistogramData{}, nil, errors.Errorf("histogram requires at least two buckets")

diff --git a/pkg/sql/stats/histogram_test.go b/pkg/sql/stats/histogram_test.go
@@ -245,6 +245,12 @@ func TestEquiDepthHistogram(t *testing.T) {
 			if err != nil {
 				t.Fatal(err)
 			}
+			if h.Version != histVersion {
+				t.Errorf("Invalid histogram version %d expected %d", h.Version, histVersion)
+			}
+			if (h.Buckets == nil) != (tc.buckets == nil) {
+				t.Fatalf("Invalid bucket == nil: %v, expected %v", h.Buckets == nil, tc.buckets == nil)
+			}
 			if len(h.Buckets) != len(tc.buckets) {
 				t.Fatalf("Invalid number of buckets %d, expected %d", len(h.Buckets), len(tc.buckets))
 			}

diff --git a/pkg/sql/stats/json.go b/pkg/sql/stats/json.go
@@ -130,7 +130,7 @@ func (js *JSONStatistic) DecodeAndSetHistogram(
 func (js *JSONStatistic) GetHistogram(
 	semaCtx *tree.SemaContext, evalCtx *eval.Context,
 ) (*HistogramData, error) {
-	if len(js.HistogramBuckets) == 0 {
+	if js.HistogramColumnType == "" {
 		return nil, nil
 	}
 	h := &HistogramData{}