Skip to content

Commit

Permalink
sql/stats: store non-NULL histograms for empty tables
Browse files Browse the repository at this point in the history
We have been storing NULL histograms / using nil HistogramData for all
the following cases:
1. regular stats, GenerateHistogram=false
2. regular stats, GenerateHistogram=true, empty table
3. regular stats, GenerateHistogram=true, all NULL values
4. inverted stats, no inverted index
5. inverted stats, yes inverted index, empty table
6. inverted stats, yes inverted index, all NULL values

When predicting histograms for statistics forecasts, we need to
distinguish between case 1 and cases 2 and 3. In case 1 we cannot
predict histograms, but in cases 2 and 3 we can (and the emptiness of
the histogram is important).

So, for cases 2 and 3 we now store an empty histogram instead of NULL,
and correspondingly use an initialized HistogramData with 0-length
Buckets instead of nil HistogramData.

This also helps with testing statistics forecasts.

(The inability to distinguish cases 4-6 doesn't matter, because we
cannot predict histograms for inverted indexes anyway. I tried to change
cases 5 and 6 to be non-NULL for consistency but ran into some
problems, so I'll leave them as they are.)

Release note: None
  • Loading branch information
michae2 committed May 24, 2022
1 parent e2c163b commit 37a2e9a
Show file tree
Hide file tree
Showing 5 changed files with 209 additions and 3 deletions.
198 changes: 198 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/distsql_stats
Original file line number Diff line number Diff line change
Expand Up @@ -1219,6 +1219,7 @@ FROM [SHOW STATISTICS USING JSON FOR TABLE all_null]
],
"distinct_count": 1,
"histo_col_type": "INT8",
"histo_version": 1,
"name": "s",
"null_count": 1,
"row_count": 1
Expand Down Expand Up @@ -1375,3 +1376,200 @@ ANALYZE system.jobs
# Collecting stats on system.scheduled_jobs is disallowed.
statement error pq: cannot create statistics on system.scheduled_jobs
ANALYZE system.scheduled_jobs

# Collecting stats on empty tables should result in empty (but not NULL)
# histograms.
statement ok
CREATE TABLE tabula (r INT, a INT, sa INT, PRIMARY KEY (r), INDEX (a, sa))

statement ok
CREATE STATISTICS aristotle FROM tabula

query TTIB colnames
SELECT statistics_name, column_names, row_count, histogram_id IS NOT NULL AS has_histogram
FROM [SHOW STATISTICS FOR TABLE tabula]
ORDER BY statistics_name, column_names::STRING
----
statistics_name column_names row_count has_histogram
aristotle {a,sa} 0 false
aristotle {a} 0 true
aristotle {r} 0 true
aristotle {sa} 0 true

let $hist_id_1
SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE tabula]
WHERE statistics_name = 'aristotle' AND column_names = '{a}'

# This histogram should be empty.
query TIRI colnames
SHOW HISTOGRAM $hist_id_1
----
upper_bound range_rows distinct_range_rows equal_rows

query T
SELECT jsonb_pretty(COALESCE(json_agg(stat), '[]'))
FROM (SELECT json_array_elements(statistics) - 'created_at' AS stat
FROM [SHOW STATISTICS USING JSON FOR TABLE tabula])
----
[
{
"avg_size": 0,
"columns": [
"r"
],
"distinct_count": 0,
"histo_col_type": "INT8",
"histo_version": 1,
"name": "aristotle",
"null_count": 0,
"row_count": 0
},
{
"avg_size": 0,
"columns": [
"a"
],
"distinct_count": 0,
"histo_col_type": "INT8",
"histo_version": 1,
"name": "aristotle",
"null_count": 0,
"row_count": 0
},
{
"avg_size": 0,
"columns": [
"sa"
],
"distinct_count": 0,
"histo_col_type": "INT8",
"histo_version": 1,
"name": "aristotle",
"null_count": 0,
"row_count": 0
},
{
"avg_size": 0,
"columns": [
"a",
"sa"
],
"distinct_count": 0,
"histo_col_type": "",
"name": "aristotle",
"null_count": 0,
"row_count": 0
}
]

# Collecting stats on columns with all NULL values should also result in empty
# (but not NULL) histograms.
statement ok
INSERT INTO tabula VALUES (11, 12, NULL)

statement ok
CREATE STATISTICS locke FROM tabula

query TTIIB colnames
SELECT statistics_name, column_names, row_count, null_count, histogram_id IS NOT NULL AS has_histogram
FROM [SHOW STATISTICS FOR TABLE tabula]
ORDER BY statistics_name, column_names::STRING
----
statistics_name column_names row_count null_count has_histogram
locke {a,sa} 1 0 false
locke {a} 1 0 true
locke {r} 1 0 true
locke {sa} 1 1 true

let $hist_id_1
SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE tabula]
WHERE statistics_name = 'locke' AND column_names = '{a}'

# This histogram should *not* be empty.
query TIRI colnames
SHOW HISTOGRAM $hist_id_1
----
upper_bound range_rows distinct_range_rows equal_rows
12 0 0 1

let $hist_id_1
SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE tabula]
WHERE statistics_name = 'locke' AND column_names = '{sa}'

# This histogram *should* be empty.
query TIRI colnames
SHOW HISTOGRAM $hist_id_1
----
upper_bound range_rows distinct_range_rows equal_rows

query T
SELECT jsonb_pretty(COALESCE(json_agg(stat), '[]'))
FROM (SELECT json_array_elements(statistics) - 'created_at' AS stat
FROM [SHOW STATISTICS USING JSON FOR TABLE tabula])
----
[
{
"avg_size": 1,
"columns": [
"r"
],
"distinct_count": 1,
"histo_buckets": [
{
"distinct_range": 0,
"num_eq": 1,
"num_range": 0,
"upper_bound": "11"
}
],
"histo_col_type": "INT8",
"histo_version": 1,
"name": "locke",
"null_count": 0,
"row_count": 1
},
{
"avg_size": 2,
"columns": [
"a"
],
"distinct_count": 1,
"histo_buckets": [
{
"distinct_range": 0,
"num_eq": 1,
"num_range": 0,
"upper_bound": "12"
}
],
"histo_col_type": "INT8",
"histo_version": 1,
"name": "locke",
"null_count": 0,
"row_count": 1
},
{
"avg_size": 0,
"columns": [
"sa"
],
"distinct_count": 1,
"histo_col_type": "INT8",
"histo_version": 1,
"name": "locke",
"null_count": 1,
"row_count": 1
},
{
"avg_size": 2,
"columns": [
"a",
"sa"
],
"distinct_count": 1,
"histo_col_type": "",
"name": "locke",
"null_count": 0,
"row_count": 1
}
]
2 changes: 1 addition & 1 deletion pkg/sql/rowexec/sample_aggregator.go
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,7 @@ func (s *sampleAggregator) writeResults(ctx context.Context) error {
if err := s.FlowCtx.Cfg.DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
for _, si := range s.sketches {
var histogram *stats.HistogramData
if si.spec.GenerateHistogram && len(s.sr.Get()) != 0 {
if si.spec.GenerateHistogram {
colIdx := int(si.spec.Columns[0])
typ := s.inTypes[colIdx]

Expand Down
4 changes: 3 additions & 1 deletion pkg/sql/stats/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,9 @@ func EquiDepthHistogram(
) (HistogramData, []cat.HistogramBucket, error) {
numSamples := len(samples)
if numSamples == 0 {
return HistogramData{ColumnType: colType}, nil, nil
return HistogramData{
ColumnType: colType, Buckets: make([]HistogramData_Bucket, 0), Version: histVersion,
}, nil, nil
}
if maxBuckets < 2 {
return HistogramData{}, nil, errors.Errorf("histogram requires at least two buckets")
Expand Down
6 changes: 6 additions & 0 deletions pkg/sql/stats/histogram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,12 @@ func TestEquiDepthHistogram(t *testing.T) {
if err != nil {
t.Fatal(err)
}
if h.Version != histVersion {
t.Errorf("Invalid histogram version %d expected %d", h.Version, histVersion)
}
if (h.Buckets == nil) != (tc.buckets == nil) {
t.Fatalf("Invalid bucket == nil: %v, expected %v", h.Buckets == nil, tc.buckets == nil)
}
if len(h.Buckets) != len(tc.buckets) {
t.Fatalf("Invalid number of buckets %d, expected %d", len(h.Buckets), len(tc.buckets))
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/stats/json.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ func (js *JSONStatistic) DecodeAndSetHistogram(
func (js *JSONStatistic) GetHistogram(
semaCtx *tree.SemaContext, evalCtx *eval.Context,
) (*HistogramData, error) {
if len(js.HistogramBuckets) == 0 {
if js.HistogramColumnType == "" {
return nil, nil
}
h := &HistogramData{}
Expand Down

0 comments on commit 37a2e9a

Please sign in to comment.