From af1866ec32602a90b0fd06f8085e5fc061a0e169 Mon Sep 17 00:00:00 2001 From: Azhng Date: Thu, 14 Oct 2021 22:46:23 -0400 Subject: [PATCH] sql: remove diagnostics.sql_stat_reset.interval Previously, the SQL Stats system reset its in-memory SQL Stats periodically, determined by the diagnostics.sql_stat_reset.interval cluster setting. This periodic reset loop was introduced early in 21.1 prior to persisted SQL Stats. In 21.2, we introduced persisted SQL Stats and along with a perodic task to flush in-memory SQL Stats into system tables. These two async tasks' coexistence introduced unwanted race conditions. If the SQL Stats reset task reset the in-memory store before the flusher could flush it to the system table, those particular stats would be lost forever. This commit changes the .Start() method in PersistedSQLStats to avoid this race condition. However, reported SQL Stats still periodically resets its in-memory storage in case the telemetry server goes offline. As the result, diagnostics.sql_stats_reset.interval cluster setting is removed. diagnostics.forced_sql_stats_reset.interval cluster setting now only controls the reset of the *reported* SQL Stats if it is not collected by the telemetry reporter. Release note (sql change): Previously, in-memory SQL Stats was reset periodically. This behavior is now removed since persisted SQL Stats is introduced in 21.2. As the result, diagnostics.sql_stats_reset.interval cluster setting is removed. diagnostics.forced_sql_stats_reset.interval now only controls the reset of the reported SQL Stats if it is not collected by the telemetry reporter. --- .../settings/settings-for-tenants.txt | 3 +-- docs/generated/settings/settings.html | 3 +-- pkg/server/stats_test.go | 2 +- pkg/settings/registry.go | 1 + pkg/sql/conn_executor.go | 11 ++++------- pkg/sql/sqlstats/cluster_settings.go | 16 +++------------- .../sqlstats/persistedsqlstats/provider.go | 1 - pkg/sql/sqlstats/sslocal/sql_stats.go | 6 ------ pkg/sql/sqlstats/sslocal/sql_stats_test.go | 1 - pkg/sql/sqlstats/sslocal/sslocal_provider.go | 19 +++---------------- 10 files changed, 14 insertions(+), 49 deletions(-) diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt index cc5072a805b3..f67f8f89b23c 100644 --- a/docs/generated/settings/settings-for-tenants.txt +++ b/docs/generated/settings/settings-for-tenants.txt @@ -11,10 +11,9 @@ cloudstorage.http.custom_ca string custom root CA (appended to system's default cloudstorage.timeout duration 10m0s the timeout for import/export storage operations cluster.organization string organization name cluster.preserve_downgrade_option string disable (automatic or manual) cluster version upgrade from the specified version until reset -diagnostics.forced_sql_stat_reset.interval duration 2h0m0s interval after which SQL statement statistics are refreshed even if not collected (should be more than diagnostics.sql_stat_reset.interval). It has a max value of 24H. +diagnostics.forced_sql_stat_reset.interval duration 2h0m0s interval after which the reported SQL Stats are reset even if not collected by telemetry reporter. It has a max value of 24H. diagnostics.reporting.enabled boolean true enable reporting diagnostic metrics to cockroach labs diagnostics.reporting.interval duration 1h0m0s interval at which diagnostics data should be reported -diagnostics.sql_stat_reset.interval duration 1h0m0s interval controlling how often SQL statement statistics should be reset (should be less than diagnostics.forced_sql_stat_reset.interval). It has a max value of 24H. enterprise.license string the encoded cluster license external.graphite.endpoint string if nonempty, push server metrics to the Graphite or Carbon server at the specified host:port external.graphite.interval duration 10s the interval at which metrics are pushed to Graphite (if enabled) diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index 7b4d00058fc9..b55645d5e701 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -14,10 +14,9 @@ cluster.organizationstringorganization name cluster.preserve_downgrade_optionstringdisable (automatic or manual) cluster version upgrade from the specified version until reset diagnostics.active_query_dumps.enabledbooleantrueexperimental: enable dumping of anonymized active queries to disk when node is under memory pressure -diagnostics.forced_sql_stat_reset.intervalduration2h0m0sinterval after which SQL statement statistics are refreshed even if not collected (should be more than diagnostics.sql_stat_reset.interval). It has a max value of 24H. +diagnostics.forced_sql_stat_reset.intervalduration2h0m0sinterval after which the reported SQL Stats are reset even if not collected by telemetry reporter. It has a max value of 24H. diagnostics.reporting.enabledbooleantrueenable reporting diagnostic metrics to cockroach labs diagnostics.reporting.intervalduration1h0m0sinterval at which diagnostics data should be reported -diagnostics.sql_stat_reset.intervalduration1h0m0sinterval controlling how often SQL statement statistics should be reset (should be less than diagnostics.forced_sql_stat_reset.interval). It has a max value of 24H. enterprise.licensestringthe encoded cluster license external.graphite.endpointstringif nonempty, push server metrics to the Graphite or Carbon server at the specified host:port external.graphite.intervalduration10sthe interval at which metrics are pushed to Graphite (if enabled) diff --git a/pkg/server/stats_test.go b/pkg/server/stats_test.go index 81f4cd991152..cd165579ef79 100644 --- a/pkg/server/stats_test.go +++ b/pkg/server/stats_test.go @@ -140,7 +140,7 @@ func TestEnsureSQLStatsAreFlushedForTelemetry(t *testing.T) { statusServer := s.(*TestServer).status sqlServer := s.(*TestServer).Server.sqlServer.pgServer.SQLServer - sqlServer.GetSQLStatsController().ResetLocalSQLStats(ctx) + sqlServer.GetSQLStatsProvider().(*persistedsqlstats.PersistedSQLStats).Flush(ctx) testutils.SucceedsSoon(t, func() error { // Get the diagnostic info. res, err := statusServer.Diagnostics(ctx, &serverpb.DiagnosticsRequest{NodeId: "local"}) diff --git a/pkg/settings/registry.go b/pkg/settings/registry.go index 6221d7c61736..5ece20241cf6 100644 --- a/pkg/settings/registry.go +++ b/pkg/settings/registry.go @@ -103,6 +103,7 @@ var retiredSettings = map[string]struct{}{ "kv.closed_timestamp.close_fraction": {}, "sql.telemetry.query_sampling.qps_threshold": {}, "sql.telemetry.query_sampling.sample_rate": {}, + "diagnostics.sql_stat_reset.interval": {}, // removed as of 22.1. "sql.defaults.drop_enum_value.enabled": {}, diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index f632c492da4c..8008106237f5 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -326,7 +326,6 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server { metrics.StatsMetrics.ReportedSQLStatsMemoryCurBytesCount, metrics.StatsMetrics.ReportedSQLStatsMemoryMaxBytesHist, pool, - nil, /* resetInterval */ nil, /* reportedProvider */ cfg.SQLStatsTestingKnobs, ) @@ -339,7 +338,6 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server { metrics.StatsMetrics.SQLStatsMemoryCurBytesCount, metrics.StatsMetrics.SQLStatsMemoryMaxBytesHist, pool, - sqlstats.SQLStatReset, reportedSQLStats, cfg.SQLStatsTestingKnobs, ) @@ -446,12 +444,11 @@ func makeMetrics(cfg *ExecutorConfig, internal bool) Metrics { // Start starts the Server's background processing. func (s *Server) Start(ctx context.Context, stopper *stop.Stopper) { s.indexUsageStats.Start(ctx, stopper) - // Start a loop to clear SQL stats at the max reset interval. This is - // to ensure that we always have some worker clearing SQL stats to avoid - // continually allocating space for the SQL stats. Additionally, spawn - // a loop to clear the reported stats at the same large interval just - // in case the telemetry worker fails. s.sqlStats.Start(ctx, stopper) + + // reportedStats is periodically cleared to prevent too many SQL Stats + // accumulated in the reporter when the telemetry server fails. + // Usually it is telemetry's reporter's job to clear the reporting SQL Stats. s.reportedStats.Start(ctx, stopper) } diff --git a/pkg/sql/sqlstats/cluster_settings.go b/pkg/sql/sqlstats/cluster_settings.go index d966b6d8819c..8655dd0e68b6 100644 --- a/pkg/sql/sqlstats/cluster_settings.go +++ b/pkg/sql/sqlstats/cluster_settings.go @@ -137,22 +137,12 @@ var MaxSQLStatsStmtFingerprintsPerExplicitTxn = settings.RegisterIntSetting( 2000, ) -// SQLStatReset is the cluster setting that controls at what interval SQL -// statement statistics should be reset. -var SQLStatReset = settings.RegisterDurationSetting( - "diagnostics.sql_stat_reset.interval", - "interval controlling how often SQL statement statistics should "+ - "be reset (should be less than diagnostics.forced_sql_stat_reset.interval). It has a max value of 24H.", - time.Hour, - settings.NonNegativeDurationWithMaximum(time.Hour*24), -).WithPublic() - // MaxSQLStatReset is the cluster setting that controls at what interval SQL // statement statistics must be flushed within. var MaxSQLStatReset = settings.RegisterDurationSetting( "diagnostics.forced_sql_stat_reset.interval", - "interval after which SQL statement statistics are refreshed even "+ - "if not collected (should be more than diagnostics.sql_stat_reset.interval). It has a max value of 24H.", - time.Hour*2, // 2 x diagnostics.sql_stat_reset.interval + "interval after which the reported SQL Stats are reset even "+ + "if not collected by telemetry reporter. It has a max value of 24H.", + time.Hour*2, settings.NonNegativeDurationWithMaximum(time.Hour*24), ).WithPublic() diff --git a/pkg/sql/sqlstats/persistedsqlstats/provider.go b/pkg/sql/sqlstats/persistedsqlstats/provider.go index 9e9f8ad7285e..911d00b6c8ed 100644 --- a/pkg/sql/sqlstats/persistedsqlstats/provider.go +++ b/pkg/sql/sqlstats/persistedsqlstats/provider.go @@ -110,7 +110,6 @@ func New(cfg *Config, memSQLStats *sslocal.SQLStats) *PersistedSQLStats { // Start implements sqlstats.Provider interface. func (s *PersistedSQLStats) Start(ctx context.Context, stopper *stop.Stopper) { - s.SQLStats.Start(ctx, stopper) s.startSQLStatsFlushLoop(ctx, stopper) s.jobMonitor.start(ctx, stopper) } diff --git a/pkg/sql/sqlstats/sslocal/sql_stats.go b/pkg/sql/sqlstats/sslocal/sql_stats.go index f609c8a4e559..be91205b8ce2 100644 --- a/pkg/sql/sqlstats/sslocal/sql_stats.go +++ b/pkg/sql/sqlstats/sslocal/sql_stats.go @@ -38,10 +38,6 @@ type SQLStats struct { // fingerprints we can store in memory. uniqueTxnFingerprintLimit *settings.IntSetting - // resetInterval is the interval of how long before the in-memory stats are - // being reset. - resetInterval *settings.DurationSetting - mu struct { syncutil.Mutex @@ -78,7 +74,6 @@ func newSQLStats( curMemBytesCount *metric.Gauge, maxMemBytesHist *metric.Histogram, parentMon *mon.BytesMonitor, - resetInterval *settings.DurationSetting, flushTarget Sink, knobs *sqlstats.TestingKnobs, ) *SQLStats { @@ -95,7 +90,6 @@ func newSQLStats( st: st, uniqueStmtFingerprintLimit: uniqueStmtFingerprintLimit, uniqueTxnFingerprintLimit: uniqueTxnFingerprintLimit, - resetInterval: resetInterval, flushTarget: flushTarget, knobs: knobs, } diff --git a/pkg/sql/sqlstats/sslocal/sql_stats_test.go b/pkg/sql/sqlstats/sslocal/sql_stats_test.go index 20525dc7dc2d..3ed7a0d390a6 100644 --- a/pkg/sql/sqlstats/sslocal/sql_stats_test.go +++ b/pkg/sql/sqlstats/sslocal/sql_stats_test.go @@ -437,7 +437,6 @@ func TestExplicitTxnFingerprintAccounting(t *testing.T) { nil, /* curMemoryBytesCount */ nil, /* maxMemoryBytesHist */ monitor, - nil, /* resetInterval */ nil, /* reportingSink */ nil, /* knobs */ ) diff --git a/pkg/sql/sqlstats/sslocal/sslocal_provider.go b/pkg/sql/sqlstats/sslocal/sslocal_provider.go index 691ed72982fc..c79d7314b88d 100644 --- a/pkg/sql/sqlstats/sslocal/sslocal_provider.go +++ b/pkg/sql/sqlstats/sslocal/sslocal_provider.go @@ -38,12 +38,11 @@ func New( curMemoryBytesCount *metric.Gauge, maxMemoryBytesHist *metric.Histogram, pool *mon.BytesMonitor, - resetInterval *settings.DurationSetting, reportingSink Sink, knobs *sqlstats.TestingKnobs, ) *SQLStats { return newSQLStats(settings, maxStmtFingerprints, maxTxnFingerprints, - curMemoryBytesCount, maxMemoryBytesHist, pool, resetInterval, + curMemoryBytesCount, maxMemoryBytesHist, pool, reportingSink, knobs) } @@ -59,18 +58,6 @@ func (s *SQLStats) GetController( // Start implements sqlstats.Provider interface. func (s *SQLStats) Start(ctx context.Context, stopper *stop.Stopper) { - if s.resetInterval != nil { - s.periodicallyClearSQLStats(ctx, stopper, s.resetInterval) - } - // Start a loop to clear SQL stats at the max reset interval. This is - // to ensure that we always have some worker clearing SQL stats to avoid - // continually allocating space for the SQL stats. - s.periodicallyClearSQLStats(ctx, stopper, sqlstats.MaxSQLStatReset) -} - -func (s *SQLStats) periodicallyClearSQLStats( - ctx context.Context, stopper *stop.Stopper, resetInterval *settings.DurationSetting, -) { // We run a periodic async job to clean up the in-memory stats. _ = stopper.RunAsyncTask(ctx, "sql-stats-clearer", func(ctx context.Context) { var timer timeutil.Timer @@ -79,13 +66,13 @@ func (s *SQLStats) periodicallyClearSQLStats( last := s.mu.lastReset s.mu.Unlock() - next := last.Add(resetInterval.Get(&s.st.SV)) + next := last.Add(sqlstats.MaxSQLStatReset.Get(&s.st.SV)) wait := next.Sub(timeutil.Now()) if wait < 0 { err := s.Reset(ctx) if err != nil { if log.V(1) { - log.Warningf(ctx, "reported SQL stats memory limit has been exceeded, some fingerprints stats are discarded: %s", err) + log.Warningf(ctx, "unexpected error: %s", err) } } } else {