Merge #38812

38812: opt: add cluster setting to enable histograms r=rytaft a=rytaft

This commit adds the cluster setting `sql.stats.histogram_collection.enabled` to enable or disable histogram collection. When enabled, histograms are collected by default for all index columns (specifically the first column in each index) during automatic statistics collection. If a single column statistic is explicitly requested using manual invocation of `CREATE STATISTICS`, a histogram will be collected, regardless of whether or not the column is part of an index.

When `sql.stats.histogram_collection.enabled` is disabled, histograms are never collected, either as part of automatic statistics collection or manual invocation of `CREATE STATISTICS`.

`sql.stats.histogram_collection.enabled` is currently disabled by default.

Release note (sql change): Added cluster setting `sql.stats.histogram_collection.enabled` to enable collection of histograms during statistics collection.

Co-authored-by: Rebecca Taft <[email protected]>
cockroachdb · Jul 15, 2019 · 71d49de · 71d49de
2 parents 1fa1205 + 8994904
commit 71d49de
Show file tree

Hide file tree

Showing 9 changed files with 15,465 additions and 284 deletions.
diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html
@@ -107,6 +107,7 @@
 <tr><td><code>sql.stats.automatic_collection.fraction_stale_rows</code></td><td>float</td><td><code>0.2</code></td><td>target fraction of stale rows per table that will trigger a statistics refresh</td></tr>
 <tr><td><code>sql.stats.automatic_collection.max_fraction_idle</code></td><td>float</td><td><code>0.9</code></td><td>maximum fraction of time that automatic statistics sampler processors are idle</td></tr>
 <tr><td><code>sql.stats.automatic_collection.min_stale_rows</code></td><td>integer</td><td><code>500</code></td><td>target minimum number of stale rows per table that will trigger a statistics refresh</td></tr>
+<tr><td><code>sql.stats.histogram_collection.enabled</code></td><td>boolean</td><td><code>false</code></td><td>histogram collection mode</td></tr>
 <tr><td><code>sql.stats.max_timestamp_age</code></td><td>duration</td><td><code>5m0s</code></td><td>maximum age of timestamp during table statistics collection</td></tr>
 <tr><td><code>sql.stats.post_events.enabled</code></td><td>boolean</td><td><code>false</code></td><td>if set, an event is shown for every CREATE STATISTICS job</td></tr>
 <tr><td><code>sql.tablecache.lease.refresh_limit</code></td><td>integer</td><td><code>50</code></td><td>maximum number of tables to periodically refresh leases for</td></tr>

diff --git a/pkg/jobs/jobspb/jobs.pb.go b/pkg/jobs/jobspb/jobs.pb.go
diff --git a/pkg/jobs/jobspb/jobs.proto b/pkg/jobs/jobspb/jobs.proto
@@ -200,15 +200,18 @@ message ChangefeedProgress {
 // collects table statistics, which contain info such as the number of rows in
 // the table or the number of distinct values in a column.
 message CreateStatsDetails {
-  message ColList {
-    repeated uint32 ids = 1 [
-      (gogoproto.customname) = "IDs",
+  message ColStat {
+    repeated uint32 column_ids = 1 [
+      (gogoproto.customname) = "ColumnIDs",
       (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/sql/sqlbase.ColumnID"
     ];
+
+    // Indicates whether this column stat should include a histogram.
+    bool has_histogram = 2;
   }
   string name = 1;
   sqlbase.TableDescriptor table = 2 [(gogoproto.nullable) = false];
-  repeated ColList column_lists = 3 [(gogoproto.nullable) = false];
+  repeated ColStat column_stats = 3 [(gogoproto.nullable) = false];
   string statement = 4;
   util.hlc.Timestamp as_of = 5;
   double max_fraction_idle = 7;

diff --git a/pkg/sql/create_stats.go b/pkg/sql/create_stats.go
@@ -183,9 +183,9 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, erro
 	}
 
 	// Identify which columns we should create statistics for.
-	var createStatsColLists []jobspb.CreateStatsDetails_ColList
+	var colStats []jobspb.CreateStatsDetails_ColStat
 	if len(n.ColumnNames) == 0 {
-		if createStatsColLists, err = createStatsDefaultColumns(tableDesc); err != nil {
+		if colStats, err = createStatsDefaultColumns(tableDesc); err != nil {
 			return nil, err
 		}
 	} else {
@@ -202,7 +202,12 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, erro
 			}
 			columnIDs[i] = columns[i].ID
 		}
-		createStatsColLists = []jobspb.CreateStatsDetails_ColList{{IDs: columnIDs}}
+		colStats = []jobspb.CreateStatsDetails_ColStat{{ColumnIDs: columnIDs, HasHistogram: false}}
+		if len(columnIDs) == 1 {
+			// By default, create histograms on all explicitly requested column stats
+			// with a single column.
+			colStats[0].HasHistogram = true
+		}
 	}
 
 	// Evaluate the AS OF time, if any.
@@ -235,7 +240,7 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, erro
 			Name:            string(n.Name),
 			FQTableName:     fqTableName,
 			Table:           tableDesc.TableDescriptor,
-			ColumnLists:     createStatsColLists,
+			ColumnStats:     colStats,
 			Statement:       n.String(),
 			AsOf:            asOf,
 			MaxFractionIdle: n.Options.Throttling,
@@ -260,20 +265,23 @@ const maxNonIndexCols = 100
 // collect statistics on a, {a, b}, b, and {b, c}.
 //
 // In addition to the index columns, we collect stats on up to maxNonIndexCols
-// other columns from the table.
+// other columns from the table. We only collect histograms for index columns.
 //
 // TODO(rytaft): This currently only generates one single-column stat per
 // index. Add code to collect multi-column stats once they are supported.
 func createStatsDefaultColumns(
 	desc *ImmutableTableDescriptor,
-) ([]jobspb.CreateStatsDetails_ColList, error) {
-	columns := make([]jobspb.CreateStatsDetails_ColList, 0, len(desc.Indexes)+1)
+) ([]jobspb.CreateStatsDetails_ColStat, error) {
+	colStats := make([]jobspb.CreateStatsDetails_ColStat, 0, len(desc.Indexes)+1)
 
 	var requestedCols util.FastIntSet
 
 	// Add a column for the primary key.
 	pkCol := desc.PrimaryIndex.ColumnIDs[0]
-	columns = append(columns, jobspb.CreateStatsDetails_ColList{IDs: []sqlbase.ColumnID{pkCol}})
+	colStats = append(colStats, jobspb.CreateStatsDetails_ColStat{
+		ColumnIDs:    []sqlbase.ColumnID{pkCol},
+		HasHistogram: true,
+	})
 	requestedCols.Add(int(pkCol))
 
 	// Add columns for each secondary index.
@@ -284,9 +292,10 @@ func createStatsDefaultColumns(
 		}
 		idxCol := desc.Indexes[i].ColumnIDs[0]
 		if !requestedCols.Contains(int(idxCol)) {
-			columns = append(
-				columns, jobspb.CreateStatsDetails_ColList{IDs: []sqlbase.ColumnID{idxCol}},
-			)
+			colStats = append(colStats, jobspb.CreateStatsDetails_ColStat{
+				ColumnIDs:    []sqlbase.ColumnID{idxCol},
+				HasHistogram: true,
+			})
 			requestedCols.Add(int(idxCol))
 		}
 	}
@@ -296,14 +305,15 @@ func createStatsDefaultColumns(
 	for i := 0; i < len(desc.Columns) && nonIdxCols < maxNonIndexCols; i++ {
 		col := &desc.Columns[i]
 		if col.Type.Family() != types.JsonFamily && !requestedCols.Contains(int(col.ID)) {
-			columns = append(
-				columns, jobspb.CreateStatsDetails_ColList{IDs: []sqlbase.ColumnID{col.ID}},
-			)
+			colStats = append(colStats, jobspb.CreateStatsDetails_ColStat{
+				ColumnIDs:    []sqlbase.ColumnID{col.ID},
+				HasHistogram: false,
+			})
 			nonIdxCols++
 		}
 	}
 
-	return columns, nil
+	return colStats, nil
 }
 
 // makePlanForExplainDistSQL is part of the distSQLExplainable interface.

diff --git a/pkg/sql/distsql_plan_stats.go b/pkg/sql/distsql_plan_stats.go
@@ -204,13 +204,12 @@ func (dsp *DistSQLPlanner) createPlanForCreateStats(
 	planCtx *PlanningCtx, job *jobs.Job,
 ) (PhysicalPlan, error) {
 	details := job.Details().(jobspb.CreateStatsDetails)
-	reqStats := make([]requestedStat, len(details.ColumnLists))
+	reqStats := make([]requestedStat, len(details.ColumnStats))
+	histogramCollectionEnabled := stats.HistogramClusterMode.Get(&dsp.st.SV)
 	for i := 0; i < len(reqStats); i++ {
-		// Currently we do not use histograms, so don't bother creating one.
-		// When this changes, we can only use it for single-column stats.
-		histogram := false
+		histogram := details.ColumnStats[i].HasHistogram && histogramCollectionEnabled
 		reqStats[i] = requestedStat{
-			columns:             details.ColumnLists[i].IDs,
+			columns:             details.ColumnStats[i].ColumnIDs,
 			histogram:           histogram,
 			histogramMaxBuckets: histogramBuckets,
 			name:                details.Name,

diff --git a/pkg/sql/logictest/testdata/logic_test/distsql_stats b/pkg/sql/logictest/testdata/logic_test/distsql_stats
@@ -4,6 +4,9 @@
 statement ok
 SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false
 
+statement ok
+SET CLUSTER SETTING sql.stats.histogram_collection.enabled = false
+
 statement ok
 CREATE TABLE data (a INT, b INT, c FLOAT, d DECIMAL, PRIMARY KEY (a, b, c, d), INDEX c_idx (c, d))
 
@@ -43,41 +46,68 @@ NULL       /1       {1}       1
 statement ok
 CREATE STATISTICS s1 ON a FROM data
 
-query TTIII colnames
-SELECT statistics_name, column_names, row_count, distinct_count, null_count FROM [SHOW STATISTICS FOR TABLE data]
+query TTIIII colnames
+SELECT statistics_name, column_names, row_count, distinct_count, null_count, histogram_id
+FROM [SHOW STATISTICS FOR TABLE data]
 ----
-statistics_name  column_names  row_count  distinct_count  null_count
-s1               {a}           10000      10              0
+statistics_name  column_names  row_count  distinct_count  null_count  histogram_id
+s1               {a}           10000      10              0           NULL
 
-# TODO(radu): reenable when we support histograms.
-#
-# let $hist_id_1
-# SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE data] WHERE statistics_name = 's1'
-# 
-# query TII colnames
-# SHOW HISTOGRAM $hist_id_1
-# ----
-# upper_bound  range_rows  equal_rows
-# 1            0           1000
-# 2            0           1000
-# 3            0           1000
-# 4            0           1000
-# 5            0           1000
-# 6            0           1000
-# 7            0           1000
-# 8            0           1000
-# 9            0           1000
-# 10           0           1000
+statement ok
+SET CLUSTER SETTING sql.stats.histogram_collection.enabled = true
+
+statement ok
+CREATE STATISTICS s1 ON a FROM data
+
+query TTIIIB colnames
+SELECT
+	statistics_name,
+	column_names,
+	row_count,
+	distinct_count,
+	null_count,
+	histogram_id IS NOT NULL AS has_histogram
+FROM
+	[SHOW STATISTICS FOR TABLE data];
+----
+statistics_name  column_names  row_count  distinct_count  null_count  has_histogram
+s1               {a}           10000      10              0           true
+
+let $hist_id_1
+SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE data] WHERE statistics_name = 's1'
+
+query TII colnames
+SHOW HISTOGRAM $hist_id_1
+----
+upper_bound  range_rows  equal_rows
+1            0           1000
+2            0           1000
+3            0           1000
+4            0           1000
+5            0           1000
+6            0           1000
+7            0           1000
+8            0           1000
+9            0           1000
+10           0           1000
 
 statement ok
 CREATE STATISTICS "" ON b FROM data
 
-query TTIII colnames
-SELECT statistics_name, column_names, row_count, distinct_count, null_count FROM [SHOW STATISTICS FOR TABLE data]
+query TTIIIB colnames
+SELECT
+	statistics_name,
+	column_names,
+	row_count,
+	distinct_count,
+	null_count,
+	histogram_id IS NOT NULL AS has_histogram
+FROM
+	[SHOW STATISTICS FOR TABLE data];
 ----
-statistics_name  column_names  row_count  distinct_count  null_count
-s1               {a}           10000      10              0
-NULL             {b}           10000      10              0
+statistics_name  column_names  row_count  distinct_count  null_count  has_histogram
+s1               {a}           10000      10              0           true
+NULL             {b}           10000      10              0           true
 
 # Verify that we can package statistics into a json object and later restore them.
 let $json_stats
@@ -89,12 +119,20 @@ DELETE FROM system.table_statistics
 statement ok
 ALTER TABLE data INJECT STATISTICS '$json_stats'
 
-query TTIII colnames
-SELECT statistics_name, column_names, row_count, distinct_count, null_count FROM [SHOW STATISTICS FOR TABLE data]
+query TTIIIB colnames
+SELECT
+	statistics_name,
+	column_names,
+	row_count,
+	distinct_count,
+	null_count,
+	histogram_id IS NOT NULL AS has_histogram
+FROM
+	[SHOW STATISTICS FOR TABLE data];
 ----
-statistics_name  column_names  row_count  distinct_count  null_count
-s1               {a}           10000      10              0
-NULL             {b}           10000      10              0
+statistics_name  column_names  row_count  distinct_count  null_count  has_histogram
+s1               {a}           10000      10              0           true
+NULL             {b}           10000      10              0           true
 
 # Verify that any other statistics are blown away when we INJECT.
 statement ok
@@ -140,16 +178,18 @@ s2               {a}           10000      10              0
 statement ok
 CREATE STATISTICS s3 FROM data
 
-query TIII colnames
-SELECT column_names, row_count, distinct_count, null_count
+# With default column statistics, only index columns have a histogram_id
+# (specifically the first column in each index).
+query TIIIB colnames
+SELECT column_names, row_count, distinct_count, null_count, histogram_id IS NOT NULL AS has_histogram
 FROM [SHOW STATISTICS FOR TABLE data]
 WHERE statistics_name = 's3'
 ----
-column_names  row_count  distinct_count  null_count
-{a}           10000      10              0
-{c}           10000      10              0
-{b}           10000      10              0
-{d}           10000      10              0
+column_names  row_count  distinct_count  null_count  has_histogram
+{a}           10000      10              0           true
+{c}           10000      10              0           true
+{b}           10000      10              0           false
+{d}           10000      10              0           false
 
 # Add indexes, including duplicate index on column c.
 statement ok