Skip to content

Commit

Permalink
opt: add cluster setting to enable histograms
Browse files Browse the repository at this point in the history
This commit adds the cluster setting sql.stats.histogram_collection.enabled
to enable or disable histogram collection. When enabled, histograms are
collected by default for all index columns (specifically the first column
in each index) during automatic statistics collection. If a single-column
statistic is explicitly requested through a manual invocation of CREATE
STATISTICS, a histogram is collected regardless of whether the column is part
of an index.

When sql.stats.histogram_collection.enabled is set to false, histograms are
never collected, whether by automatic statistics collection or by a manual
invocation of CREATE STATISTICS.

sql.stats.histogram_collection.enabled currently defaults to false.

Release note (sql change): Added cluster setting
sql.stats.histogram_collection.enabled to enable collection of histograms
during statistics collection.
  • Loading branch information
rytaft committed Jul 13, 2019
1 parent 092e60a commit 8994904
Show file tree
Hide file tree
Showing 9 changed files with 15,465 additions and 284 deletions.
1 change: 1 addition & 0 deletions docs/generated/settings/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@
<tr><td><code>sql.stats.automatic_collection.fraction_stale_rows</code></td><td>float</td><td><code>0.2</code></td><td>target fraction of stale rows per table that will trigger a statistics refresh</td></tr>
<tr><td><code>sql.stats.automatic_collection.max_fraction_idle</code></td><td>float</td><td><code>0.9</code></td><td>maximum fraction of time that automatic statistics sampler processors are idle</td></tr>
<tr><td><code>sql.stats.automatic_collection.min_stale_rows</code></td><td>integer</td><td><code>500</code></td><td>target minimum number of stale rows per table that will trigger a statistics refresh</td></tr>
<tr><td><code>sql.stats.histogram_collection.enabled</code></td><td>boolean</td><td><code>false</code></td><td>histogram collection mode</td></tr>
<tr><td><code>sql.stats.max_timestamp_age</code></td><td>duration</td><td><code>5m0s</code></td><td>maximum age of timestamp during table statistics collection</td></tr>
<tr><td><code>sql.stats.post_events.enabled</code></td><td>boolean</td><td><code>false</code></td><td>if set, an event is shown for every CREATE STATISTICS job</td></tr>
<tr><td><code>sql.tablecache.lease.refresh_limit</code></td><td>integer</td><td><code>50</code></td><td>maximum number of tables to periodically refresh leases for</td></tr>
Expand Down
477 changes: 257 additions & 220 deletions pkg/jobs/jobspb/jobs.pb.go

Large diffs are not rendered by default.

11 changes: 7 additions & 4 deletions pkg/jobs/jobspb/jobs.proto
Original file line number Diff line number Diff line change
Expand Up @@ -200,15 +200,18 @@ message ChangefeedProgress {
// collects table statistics, which contain info such as the number of rows in
// the table or the number of distinct values in a column.
message CreateStatsDetails {
message ColList {
repeated uint32 ids = 1 [
(gogoproto.customname) = "IDs",
message ColStat {
repeated uint32 column_ids = 1 [
(gogoproto.customname) = "ColumnIDs",
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/sql/sqlbase.ColumnID"
];

// Indicates whether this column stat should include a histogram.
bool has_histogram = 2;
}
string name = 1;
sqlbase.TableDescriptor table = 2 [(gogoproto.nullable) = false];
repeated ColList column_lists = 3 [(gogoproto.nullable) = false];
repeated ColStat column_stats = 3 [(gogoproto.nullable) = false];
string statement = 4;
util.hlc.Timestamp as_of = 5;
double max_fraction_idle = 7;
Expand Down
40 changes: 25 additions & 15 deletions pkg/sql/create_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,9 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, erro
}

// Identify which columns we should create statistics for.
var createStatsColLists []jobspb.CreateStatsDetails_ColList
var colStats []jobspb.CreateStatsDetails_ColStat
if len(n.ColumnNames) == 0 {
if createStatsColLists, err = createStatsDefaultColumns(tableDesc); err != nil {
if colStats, err = createStatsDefaultColumns(tableDesc); err != nil {
return nil, err
}
} else {
Expand All @@ -202,7 +202,12 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, erro
}
columnIDs[i] = columns[i].ID
}
createStatsColLists = []jobspb.CreateStatsDetails_ColList{{IDs: columnIDs}}
colStats = []jobspb.CreateStatsDetails_ColStat{{ColumnIDs: columnIDs, HasHistogram: false}}
if len(columnIDs) == 1 {
// By default, create histograms on all explicitly requested column stats
// with a single column.
colStats[0].HasHistogram = true
}
}

// Evaluate the AS OF time, if any.
Expand Down Expand Up @@ -235,7 +240,7 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, erro
Name: string(n.Name),
FQTableName: fqTableName,
Table: tableDesc.TableDescriptor,
ColumnLists: createStatsColLists,
ColumnStats: colStats,
Statement: n.String(),
AsOf: asOf,
MaxFractionIdle: n.Options.Throttling,
Expand All @@ -260,20 +265,23 @@ const maxNonIndexCols = 100
// collect statistics on a, {a, b}, b, and {b, c}.
//
// In addition to the index columns, we collect stats on up to maxNonIndexCols
// other columns from the table.
// other columns from the table. We only collect histograms for index columns.
//
// TODO(rytaft): This currently only generates one single-column stat per
// index. Add code to collect multi-column stats once they are supported.
func createStatsDefaultColumns(
desc *ImmutableTableDescriptor,
) ([]jobspb.CreateStatsDetails_ColList, error) {
columns := make([]jobspb.CreateStatsDetails_ColList, 0, len(desc.Indexes)+1)
) ([]jobspb.CreateStatsDetails_ColStat, error) {
colStats := make([]jobspb.CreateStatsDetails_ColStat, 0, len(desc.Indexes)+1)

var requestedCols util.FastIntSet

// Add a column for the primary key.
pkCol := desc.PrimaryIndex.ColumnIDs[0]
columns = append(columns, jobspb.CreateStatsDetails_ColList{IDs: []sqlbase.ColumnID{pkCol}})
colStats = append(colStats, jobspb.CreateStatsDetails_ColStat{
ColumnIDs: []sqlbase.ColumnID{pkCol},
HasHistogram: true,
})
requestedCols.Add(int(pkCol))

// Add columns for each secondary index.
Expand All @@ -284,9 +292,10 @@ func createStatsDefaultColumns(
}
idxCol := desc.Indexes[i].ColumnIDs[0]
if !requestedCols.Contains(int(idxCol)) {
columns = append(
columns, jobspb.CreateStatsDetails_ColList{IDs: []sqlbase.ColumnID{idxCol}},
)
colStats = append(colStats, jobspb.CreateStatsDetails_ColStat{
ColumnIDs: []sqlbase.ColumnID{idxCol},
HasHistogram: true,
})
requestedCols.Add(int(idxCol))
}
}
Expand All @@ -296,14 +305,15 @@ func createStatsDefaultColumns(
for i := 0; i < len(desc.Columns) && nonIdxCols < maxNonIndexCols; i++ {
col := &desc.Columns[i]
if col.Type.Family() != types.JsonFamily && !requestedCols.Contains(int(col.ID)) {
columns = append(
columns, jobspb.CreateStatsDetails_ColList{IDs: []sqlbase.ColumnID{col.ID}},
)
colStats = append(colStats, jobspb.CreateStatsDetails_ColStat{
ColumnIDs: []sqlbase.ColumnID{col.ID},
HasHistogram: false,
})
nonIdxCols++
}
}

return columns, nil
return colStats, nil
}

// makePlanForExplainDistSQL is part of the distSQLExplainable interface.
Expand Down
9 changes: 4 additions & 5 deletions pkg/sql/distsql_plan_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,13 +204,12 @@ func (dsp *DistSQLPlanner) createPlanForCreateStats(
planCtx *PlanningCtx, job *jobs.Job,
) (PhysicalPlan, error) {
details := job.Details().(jobspb.CreateStatsDetails)
reqStats := make([]requestedStat, len(details.ColumnLists))
reqStats := make([]requestedStat, len(details.ColumnStats))
histogramCollectionEnabled := stats.HistogramClusterMode.Get(&dsp.st.SV)
for i := 0; i < len(reqStats); i++ {
// Currently we do not use histograms, so don't bother creating one.
// When this changes, we can only use it for single-column stats.
histogram := false
histogram := details.ColumnStats[i].HasHistogram && histogramCollectionEnabled
reqStats[i] = requestedStat{
columns: details.ColumnLists[i].IDs,
columns: details.ColumnStats[i].ColumnIDs,
histogram: histogram,
histogramMaxBuckets: histogramBuckets,
name: details.Name,
Expand Down
120 changes: 80 additions & 40 deletions pkg/sql/logictest/testdata/logic_test/distsql_stats
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
statement ok
SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false

statement ok
SET CLUSTER SETTING sql.stats.histogram_collection.enabled = false

statement ok
CREATE TABLE data (a INT, b INT, c FLOAT, d DECIMAL, PRIMARY KEY (a, b, c, d), INDEX c_idx (c, d))

Expand Down Expand Up @@ -43,41 +46,68 @@ NULL /1 {1} 1
statement ok
CREATE STATISTICS s1 ON a FROM data

query TTIII colnames
SELECT statistics_name, column_names, row_count, distinct_count, null_count FROM [SHOW STATISTICS FOR TABLE data]
query TTIIII colnames
SELECT statistics_name, column_names, row_count, distinct_count, null_count, histogram_id
FROM [SHOW STATISTICS FOR TABLE data]
----
statistics_name column_names row_count distinct_count null_count
s1 {a} 10000 10 0
statistics_name column_names row_count distinct_count null_count histogram_id
s1 {a} 10000 10 0 NULL

# TODO(radu): reenable when we support histograms.
#
# let $hist_id_1
# SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE data] WHERE statistics_name = 's1'
#
# query TII colnames
# SHOW HISTOGRAM $hist_id_1
# ----
# upper_bound range_rows equal_rows
# 1 0 1000
# 2 0 1000
# 3 0 1000
# 4 0 1000
# 5 0 1000
# 6 0 1000
# 7 0 1000
# 8 0 1000
# 9 0 1000
# 10 0 1000
statement ok
SET CLUSTER SETTING sql.stats.histogram_collection.enabled = true

statement ok
CREATE STATISTICS s1 ON a FROM data

query TTIIIB colnames
SELECT
statistics_name,
column_names,
row_count,
distinct_count,
null_count,
histogram_id IS NOT NULL AS has_histogram
FROM
[SHOW STATISTICS FOR TABLE data];
----
statistics_name column_names row_count distinct_count null_count has_histogram
s1 {a} 10000 10 0 true

let $hist_id_1
SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE data] WHERE statistics_name = 's1'

query TII colnames
SHOW HISTOGRAM $hist_id_1
----
upper_bound range_rows equal_rows
1 0 1000
2 0 1000
3 0 1000
4 0 1000
5 0 1000
6 0 1000
7 0 1000
8 0 1000
9 0 1000
10 0 1000

statement ok
CREATE STATISTICS "" ON b FROM data

query TTIII colnames
SELECT statistics_name, column_names, row_count, distinct_count, null_count FROM [SHOW STATISTICS FOR TABLE data]
query TTIIIB colnames
SELECT
statistics_name,
column_names,
row_count,
distinct_count,
null_count,
histogram_id IS NOT NULL AS has_histogram
FROM
[SHOW STATISTICS FOR TABLE data];
----
statistics_name column_names row_count distinct_count null_count
s1 {a} 10000 10 0
NULL {b} 10000 10 0
statistics_name column_names row_count distinct_count null_count has_histogram
s1 {a} 10000 10 0 true
NULL {b} 10000 10 0 true

# Verify that we can package statistics into a json object and later restore them.
let $json_stats
Expand All @@ -89,12 +119,20 @@ DELETE FROM system.table_statistics
statement ok
ALTER TABLE data INJECT STATISTICS '$json_stats'

query TTIII colnames
SELECT statistics_name, column_names, row_count, distinct_count, null_count FROM [SHOW STATISTICS FOR TABLE data]
query TTIIIB colnames
SELECT
statistics_name,
column_names,
row_count,
distinct_count,
null_count,
histogram_id IS NOT NULL AS has_histogram
FROM
[SHOW STATISTICS FOR TABLE data];
----
statistics_name column_names row_count distinct_count null_count
s1 {a} 10000 10 0
NULL {b} 10000 10 0
statistics_name column_names row_count distinct_count null_count has_histogram
s1 {a} 10000 10 0 true
NULL {b} 10000 10 0 true

# Verify that any other statistics are blown away when we INJECT.
statement ok
Expand Down Expand Up @@ -140,16 +178,18 @@ s2 {a} 10000 10 0
statement ok
CREATE STATISTICS s3 FROM data

query TIII colnames
SELECT column_names, row_count, distinct_count, null_count
# With default column statistics, only index columns have a histogram_id
# (specifically the first column in each index).
query TIIIB colnames
SELECT column_names, row_count, distinct_count, null_count, histogram_id IS NOT NULL AS has_histogram
FROM [SHOW STATISTICS FOR TABLE data]
WHERE statistics_name = 's3'
----
column_names row_count distinct_count null_count
{a} 10000 10 0
{c} 10000 10 0
{b} 10000 10 0
{d} 10000 10 0
column_names row_count distinct_count null_count has_histogram
{a} 10000 10 0 true
{c} 10000 10 0 true
{b} 10000 10 0 false
{d} 10000 10 0 false

# Add indexes, including duplicate index on column c.
statement ok
Expand Down
Loading

0 comments on commit 8994904

Please sign in to comment.