From fe3cc5f5e8a5312de4e1b2db0e34847ac8b7aa70 Mon Sep 17 00:00:00 2001 From: Mark Sirek Date: Mon, 25 Apr 2022 11:17:44 -0700 Subject: [PATCH] stats: enable optimizer stats use on system tables for query planning Informs #80123 Previously, statistics could be collected on system tables, but their use in the optimizer for planning queries was disabled. This was inadequate because without stats, full table scan might be chosen by the optimizer for queries involving system tables in cases where index access is actually cheaper. Also, it is confusing that one could ANALYZE a system table, but the collected statistics would not be used when querying the table, as shown in the EXPLAIN output. To address this, this patch enables stats usage on all system tables except system.table_statistics and system.lease, as doing so may cause hangs. A new tenant-writable cluster setting is added, `sql.stats.system_tables.enabled`, which when true allows statistics on system tables to be used by the optimizer for costing and planning queries. The default value of this setting is `true`. Collection of statistics on `system.table_statistics` and `system.lease` has been disabled because it would be confusing to allow stats to be collected that would never be used by the optimizer. Release note: none --- .../settings/settings-for-tenants.txt | 1 + docs/generated/settings/settings.html | 1 + .../testdata/logic_test/crdb_internal_tenant | 1 + pkg/sql/create_stats.go | 13 ++++++++ .../testdata/logic_test/distsql_stats | 22 +++++++++++++ pkg/sql/stats/automatic_stats.go | 16 ++++++++- pkg/sql/stats/stats_cache.go | 33 +++++++++++++++++-- 7 files changed, 83 insertions(+), 4 deletions(-) diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt index 651c41902924..9a07ace80897 100644 --- a/docs/generated/settings/settings-for-tenants.txt +++ b/docs/generated/settings/settings-for-tenants.txt @@ -173,6 +173,7 @@ sql.stats.multi_column_collection.enabled boolean true multi-column statistics c sql.stats.persisted_rows.max integer 1000000 maximum number of rows of statement and transaction statistics that will be persisted in the system tables sql.stats.post_events.enabled boolean false if set, an event is logged for every CREATE STATISTICS job sql.stats.response.max integer 20000 the maximum number of statements and transaction stats returned in a CombinedStatements request +sql.stats.system_tables.enabled boolean true when true, enables use of statistics on system tables by the query optimizer sql.telemetry.query_sampling.enabled boolean false when set to true, executed queries will emit an event on the telemetry logging channel sql.temp_object_cleaner.cleanup_interval duration 30m0s how often to clean up orphaned temporary objects sql.temp_object_cleaner.wait_interval duration 30m0s how long after creation a temporary object will be cleaned up diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index 2f5ac811597d..2680ed606c13 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -189,6 +189,7 @@ sql.stats.persisted_rows.maxinteger1000000maximum number of rows of statement and transaction statistics that will be persisted in the system tables sql.stats.post_events.enabledbooleanfalseif set, an event is logged for every CREATE STATISTICS job sql.stats.response.maxinteger20000the maximum number of statements and transaction stats returned in a CombinedStatements request +sql.stats.system_tables.enabledbooleantruewhen true, enables use of statistics on system tables by the query optimizer sql.telemetry.query_sampling.enabledbooleanfalsewhen set to true, executed queries will emit an event on the telemetry logging channel sql.temp_object_cleaner.cleanup_intervalduration30m0show often to clean up orphaned temporary objects sql.temp_object_cleaner.wait_intervalduration30m0show long after creation a temporary object will be cleaned up diff --git a/pkg/ccl/logictestccl/testdata/logic_test/crdb_internal_tenant b/pkg/ccl/logictestccl/testdata/logic_test/crdb_internal_tenant index daeaa524be6b..651716bc0e34 100644 --- a/pkg/ccl/logictestccl/testdata/logic_test/crdb_internal_tenant +++ b/pkg/ccl/logictestccl/testdata/logic_test/crdb_internal_tenant @@ -18,6 +18,7 @@ SELECT node_id, name FROM crdb_internal.leases ORDER BY name 0 scheduled_jobs 0 settings 0 statement_diagnostics_requests +0 table_statistics 0 test 0 users diff --git a/pkg/sql/create_stats.go b/pkg/sql/create_stats.go index c4bdd560cd53..a6000c26c790 100644 --- a/pkg/sql/create_stats.go +++ b/pkg/sql/create_stats.go @@ -17,6 +17,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/featureflag" "github.com/cockroachdb/cockroach/pkg/jobs" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv" "github.com/cockroachdb/cockroach/pkg/server/telemetry" "github.com/cockroachdb/cockroach/pkg/settings" @@ -229,6 +230,18 @@ func (n *createStatsNode) makeJobRecord(ctx context.Context) (*jobs.Record, erro ) } + if tableDesc.GetID() == keys.TableStatisticsTableID { + return nil, pgerror.New( + pgcode.WrongObjectType, "cannot create statistics on system.table_statistics", + ) + } + + if tableDesc.GetID() == keys.LeaseTableID { + return nil, pgerror.New( + pgcode.WrongObjectType, "cannot create statistics on system.lease", + ) + } + if err := n.p.CheckPrivilege(ctx, tableDesc, privilege.SELECT); err != nil { return nil, err } diff --git a/pkg/sql/logictest/testdata/logic_test/distsql_stats b/pkg/sql/logictest/testdata/logic_test/distsql_stats index 6f06afd9b229..869a41935836 100644 --- a/pkg/sql/logictest/testdata/logic_test/distsql_stats +++ b/pkg/sql/logictest/testdata/logic_test/distsql_stats @@ -1345,3 +1345,25 @@ CREATE TABLE t76867 ( statement ok ANALYZE t76867 + +# Regression tests for #80123. Collecting stats on system tables is allowed. +statement ok +ANALYZE system.locations + +# EXPLAIN output should indicate stats collected on system.locations. +query T +SELECT * FROM [EXPLAIN SELECT * FROM system.locations] OFFSET 2 +---- +· +• scan + estimated row count: 5 (100% of the table; stats collected ago) + table: locations@primary + spans: FULL SCAN + +# Collecting stats on system.lease is disallowed. +statement error pq: cannot create statistics on system.lease +ANALYZE system.lease + +# Collecting stats on system.table_statistics is disallowed. +statement error pq: cannot create statistics on system.table_statistics +ANALYZE system.table_statistics diff --git a/pkg/sql/stats/automatic_stats.go b/pkg/sql/stats/automatic_stats.go index 36c9093d2210..2986a2f11906 100644 --- a/pkg/sql/stats/automatic_stats.go +++ b/pkg/sql/stats/automatic_stats.go @@ -38,6 +38,10 @@ import ( // cluster setting. const AutoStatsClusterSettingName = "sql.stats.automatic_collection.enabled" +// UseStatsOnSystemTables is the name of the use statistics on system tables +// cluster setting. +const UseStatsOnSystemTables = "sql.stats.system_tables.enabled" + // AutomaticStatisticsClusterMode controls the cluster setting for enabling // automatic table statistics collection. var AutomaticStatisticsClusterMode = settings.RegisterBoolSetting( @@ -47,6 +51,16 @@ var AutomaticStatisticsClusterMode = settings.RegisterBoolSetting( true, ).WithPublic() +// UseStatisticsOnSystemTables controls the cluster setting for enabling +// statistics usage by the optimizer for planning queries involving system +// tables. +var UseStatisticsOnSystemTables = settings.RegisterBoolSetting( + settings.TenantWritable, + UseStatsOnSystemTables, + "when true, enables use of statistics on system tables by the query optimizer", + true, +).WithPublic() + // MultiColumnStatisticsClusterMode controls the cluster setting for enabling // automatic collection of multi-column statistics. var MultiColumnStatisticsClusterMode = settings.RegisterBoolSetting( @@ -389,7 +403,7 @@ func (r *Refresher) NotifyMutation(table catalog.TableDescriptor, rowsAffected i // Automatic stats are disabled. return } - if !hasStatistics(table) { + if !autostatsCollectionAllowed(table) { // Don't collect stats for this kind of table: system, virtual, view, etc. return } diff --git a/pkg/sql/stats/stats_cache.go b/pkg/sql/stats/stats_cache.go index 1d072e9ea997..6fcb55777d43 100644 --- a/pkg/sql/stats/stats_cache.go +++ b/pkg/sql/stats/stats_cache.go @@ -214,19 +214,46 @@ func decodeTableStatisticsKV( func (sc *TableStatisticsCache) GetTableStats( ctx context.Context, table catalog.TableDescriptor, ) ([]*TableStatistic, error) { - if !hasStatistics(table) { + if !statsUsageAllowed(table, sc.Settings) { return nil, nil } return sc.getTableStatsFromCache(ctx, table.GetID()) } -// hasStatistics returns true if the table can have statistics collected for it. -func hasStatistics(table catalog.TableDescriptor) bool { +// statsUsageAllowed returns true if statistics on `table` are allowed to be +// used by the query optimizer. +func statsUsageAllowed(table catalog.TableDescriptor, clusterSettings *cluster.Settings) bool { + if catalog.IsSystemDescriptor(table) { + // Disable stats usage on system.table_statistics and system.lease. Looking + // up stats on system.lease is known to cause hangs, and the same could + // happen with system.table_statistics. + if table.GetID() == keys.TableStatisticsTableID || + table.GetID() == keys.LeaseTableID { + return false + } + // Return whether the optimizer is allowed to use stats on system tables. + return UseStatisticsOnSystemTables.Get(&clusterSettings.SV) + } + return tableTypeCanHaveStats(table) +} + +// autostatsCollectionAllowed returns true if statistics are allowed to be +// automatically collected on the table. +func autostatsCollectionAllowed(table catalog.TableDescriptor) bool { if catalog.IsSystemDescriptor(table) { // Don't try to get statistics for system tables (most importantly, // for table_statistics itself). return false } + return tableTypeCanHaveStats(table) +} + +// tableTypeCanHaveStats returns true if manual collection of statistics on the +// table type via CREATE STATISTICS or ANALYZE is allowed. Note that specific +// system tables may have stats collection disabled in create_stats.go. This +// function just indicates if the type of table may have stats manually +// collected. +func tableTypeCanHaveStats(table catalog.TableDescriptor) bool { if table.IsVirtualTable() { // Don't try to get statistics for virtual tables. return false