From af1866ec32602a90b0fd06f8085e5fc061a0e169 Mon Sep 17 00:00:00 2001
From: Azhng <archer.xn@gmail.com>
Date: Thu, 14 Oct 2021 22:46:23 -0400
Subject: [PATCH] sql: remove diagnostics.sql_stat_reset.interval

Previously, the SQL Stats system reset its in-memory SQL Stats periodically,
determined by the diagnostics.sql_stat_reset.interval cluster setting. This
periodic reset loop was introduced early in 21.1 prior to persisted SQL Stats.
In 21.2, we introduced persisted SQL Stats and along with a perodic task to
flush in-memory SQL Stats into system tables. These two async tasks'
coexistence introduced unwanted race conditions. If the SQL Stats reset task
reset the in-memory store before the flusher could flush it to the system
table, those particular stats would be lost forever.

This commit changes the .Start() method in PersistedSQLStats to avoid this
race condition. However, reported SQL Stats still periodically resets its
in-memory storage in case the telemetry server goes offline. As the result,
diagnostics.sql_stats_reset.interval cluster setting is removed.
diagnostics.forced_sql_stats_reset.interval cluster setting now only
controls the reset of the *reported* SQL Stats if it is not collected
by the telemetry reporter.

Release note (sql change): Previously, in-memory SQL Stats was reset
periodically. This behavior is now removed since persisted SQL Stats
is introduced in 21.2. As the result, diagnostics.sql_stats_reset.interval
cluster setting is removed. diagnostics.forced_sql_stats_reset.interval
now only controls the reset of the reported SQL Stats if it is not
collected by the telemetry reporter.
---
 .../settings/settings-for-tenants.txt         |  3 +--
 docs/generated/settings/settings.html         |  3 +--
 pkg/server/stats_test.go                      |  2 +-
 pkg/settings/registry.go                      |  1 +
 pkg/sql/conn_executor.go                      | 11 ++++-------
 pkg/sql/sqlstats/cluster_settings.go          | 16 +++-------------
 .../sqlstats/persistedsqlstats/provider.go    |  1 -
 pkg/sql/sqlstats/sslocal/sql_stats.go         |  6 ------
 pkg/sql/sqlstats/sslocal/sql_stats_test.go    |  1 -
 pkg/sql/sqlstats/sslocal/sslocal_provider.go  | 19 +++----------------
 10 files changed, 14 insertions(+), 49 deletions(-)

diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt
index cc5072a805b3..f67f8f89b23c 100644
--- a/docs/generated/settings/settings-for-tenants.txt
+++ b/docs/generated/settings/settings-for-tenants.txt
@@ -11,10 +11,9 @@ cloudstorage.http.custom_ca	string		custom root CA (appended to system's default
 cloudstorage.timeout	duration	10m0s	the timeout for import/export storage operations
 cluster.organization	string		organization name
 cluster.preserve_downgrade_option	string		disable (automatic or manual) cluster version upgrade from the specified version until reset
-diagnostics.forced_sql_stat_reset.interval	duration	2h0m0s	interval after which SQL statement statistics are refreshed even if not collected (should be more than diagnostics.sql_stat_reset.interval). It has a max value of 24H.
+diagnostics.forced_sql_stat_reset.interval	duration	2h0m0s	interval after which the reported SQL Stats are reset even if not collected by telemetry reporter. It has a max value of 24H.
 diagnostics.reporting.enabled	boolean	true	enable reporting diagnostic metrics to cockroach labs
 diagnostics.reporting.interval	duration	1h0m0s	interval at which diagnostics data should be reported
-diagnostics.sql_stat_reset.interval	duration	1h0m0s	interval controlling how often SQL statement statistics should be reset (should be less than diagnostics.forced_sql_stat_reset.interval). It has a max value of 24H.
 enterprise.license	string		the encoded cluster license
 external.graphite.endpoint	string		if nonempty, push server metrics to the Graphite or Carbon server at the specified host:port
 external.graphite.interval	duration	10s	the interval at which metrics are pushed to Graphite (if enabled)
diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html
index 7b4d00058fc9..b55645d5e701 100644
--- a/docs/generated/settings/settings.html
+++ b/docs/generated/settings/settings.html
@@ -14,10 +14,9 @@
 <tr><td><code>cluster.organization</code></td><td>string</td><td><code></code></td><td>organization name</td></tr>
 <tr><td><code>cluster.preserve_downgrade_option</code></td><td>string</td><td><code></code></td><td>disable (automatic or manual) cluster version upgrade from the specified version until reset</td></tr>
 <tr><td><code>diagnostics.active_query_dumps.enabled</code></td><td>boolean</td><td><code>true</code></td><td>experimental: enable dumping of anonymized active queries to disk when node is under memory pressure</td></tr>
-<tr><td><code>diagnostics.forced_sql_stat_reset.interval</code></td><td>duration</td><td><code>2h0m0s</code></td><td>interval after which SQL statement statistics are refreshed even if not collected (should be more than diagnostics.sql_stat_reset.interval). It has a max value of 24H.</td></tr>
+<tr><td><code>diagnostics.forced_sql_stat_reset.interval</code></td><td>duration</td><td><code>2h0m0s</code></td><td>interval after which the reported SQL Stats are reset even if not collected by telemetry reporter. It has a max value of 24H.</td></tr>
 <tr><td><code>diagnostics.reporting.enabled</code></td><td>boolean</td><td><code>true</code></td><td>enable reporting diagnostic metrics to cockroach labs</td></tr>
 <tr><td><code>diagnostics.reporting.interval</code></td><td>duration</td><td><code>1h0m0s</code></td><td>interval at which diagnostics data should be reported</td></tr>
-<tr><td><code>diagnostics.sql_stat_reset.interval</code></td><td>duration</td><td><code>1h0m0s</code></td><td>interval controlling how often SQL statement statistics should be reset (should be less than diagnostics.forced_sql_stat_reset.interval). It has a max value of 24H.</td></tr>
 <tr><td><code>enterprise.license</code></td><td>string</td><td><code></code></td><td>the encoded cluster license</td></tr>
 <tr><td><code>external.graphite.endpoint</code></td><td>string</td><td><code></code></td><td>if nonempty, push server metrics to the Graphite or Carbon server at the specified host:port</td></tr>
 <tr><td><code>external.graphite.interval</code></td><td>duration</td><td><code>10s</code></td><td>the interval at which metrics are pushed to Graphite (if enabled)</td></tr>
diff --git a/pkg/server/stats_test.go b/pkg/server/stats_test.go
index 81f4cd991152..cd165579ef79 100644
--- a/pkg/server/stats_test.go
+++ b/pkg/server/stats_test.go
@@ -140,7 +140,7 @@ func TestEnsureSQLStatsAreFlushedForTelemetry(t *testing.T) {
 
 	statusServer := s.(*TestServer).status
 	sqlServer := s.(*TestServer).Server.sqlServer.pgServer.SQLServer
-	sqlServer.GetSQLStatsController().ResetLocalSQLStats(ctx)
+	sqlServer.GetSQLStatsProvider().(*persistedsqlstats.PersistedSQLStats).Flush(ctx)
 	testutils.SucceedsSoon(t, func() error {
 		// Get the diagnostic info.
 		res, err := statusServer.Diagnostics(ctx, &serverpb.DiagnosticsRequest{NodeId: "local"})
diff --git a/pkg/settings/registry.go b/pkg/settings/registry.go
index 6221d7c61736..5ece20241cf6 100644
--- a/pkg/settings/registry.go
+++ b/pkg/settings/registry.go
@@ -103,6 +103,7 @@ var retiredSettings = map[string]struct{}{
 	"kv.closed_timestamp.close_fraction":                             {},
 	"sql.telemetry.query_sampling.qps_threshold":                     {},
 	"sql.telemetry.query_sampling.sample_rate":                       {},
+	"diagnostics.sql_stat_reset.interval":                            {},
 
 	// removed as of 22.1.
 	"sql.defaults.drop_enum_value.enabled":                             {},
diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go
index f632c492da4c..8008106237f5 100644
--- a/pkg/sql/conn_executor.go
+++ b/pkg/sql/conn_executor.go
@@ -326,7 +326,6 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
 		metrics.StatsMetrics.ReportedSQLStatsMemoryCurBytesCount,
 		metrics.StatsMetrics.ReportedSQLStatsMemoryMaxBytesHist,
 		pool,
-		nil, /* resetInterval */
 		nil, /* reportedProvider */
 		cfg.SQLStatsTestingKnobs,
 	)
@@ -339,7 +338,6 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
 		metrics.StatsMetrics.SQLStatsMemoryCurBytesCount,
 		metrics.StatsMetrics.SQLStatsMemoryMaxBytesHist,
 		pool,
-		sqlstats.SQLStatReset,
 		reportedSQLStats,
 		cfg.SQLStatsTestingKnobs,
 	)
@@ -446,12 +444,11 @@ func makeMetrics(cfg *ExecutorConfig, internal bool) Metrics {
 // Start starts the Server's background processing.
 func (s *Server) Start(ctx context.Context, stopper *stop.Stopper) {
 	s.indexUsageStats.Start(ctx, stopper)
-	// Start a loop to clear SQL stats at the max reset interval. This is
-	// to ensure that we always have some worker clearing SQL stats to avoid
-	// continually allocating space for the SQL stats. Additionally, spawn
-	// a loop to clear the reported stats at the same large interval just
-	// in case the telemetry worker fails.
 	s.sqlStats.Start(ctx, stopper)
+
+	// reportedStats is periodically cleared to prevent too many SQL Stats
+	// accumulated in the reporter when the telemetry server fails.
+	// Usually it is telemetry's reporter's job to clear the reporting SQL Stats.
 	s.reportedStats.Start(ctx, stopper)
 }
 
diff --git a/pkg/sql/sqlstats/cluster_settings.go b/pkg/sql/sqlstats/cluster_settings.go
index d966b6d8819c..8655dd0e68b6 100644
--- a/pkg/sql/sqlstats/cluster_settings.go
+++ b/pkg/sql/sqlstats/cluster_settings.go
@@ -137,22 +137,12 @@ var MaxSQLStatsStmtFingerprintsPerExplicitTxn = settings.RegisterIntSetting(
 	2000,
 )
 
-// SQLStatReset is the cluster setting that controls at what interval SQL
-// statement statistics should be reset.
-var SQLStatReset = settings.RegisterDurationSetting(
-	"diagnostics.sql_stat_reset.interval",
-	"interval controlling how often SQL statement statistics should "+
-		"be reset (should be less than diagnostics.forced_sql_stat_reset.interval). It has a max value of 24H.",
-	time.Hour,
-	settings.NonNegativeDurationWithMaximum(time.Hour*24),
-).WithPublic()
-
 // MaxSQLStatReset is the cluster setting that controls at what interval SQL
 // statement statistics must be flushed within.
 var MaxSQLStatReset = settings.RegisterDurationSetting(
 	"diagnostics.forced_sql_stat_reset.interval",
-	"interval after which SQL statement statistics are refreshed even "+
-		"if not collected (should be more than diagnostics.sql_stat_reset.interval). It has a max value of 24H.",
-	time.Hour*2, // 2 x diagnostics.sql_stat_reset.interval
+	"interval after which the reported SQL Stats are reset even "+
+		"if not collected by telemetry reporter. It has a max value of 24H.",
+	time.Hour*2,
 	settings.NonNegativeDurationWithMaximum(time.Hour*24),
 ).WithPublic()
diff --git a/pkg/sql/sqlstats/persistedsqlstats/provider.go b/pkg/sql/sqlstats/persistedsqlstats/provider.go
index 9e9f8ad7285e..911d00b6c8ed 100644
--- a/pkg/sql/sqlstats/persistedsqlstats/provider.go
+++ b/pkg/sql/sqlstats/persistedsqlstats/provider.go
@@ -110,7 +110,6 @@ func New(cfg *Config, memSQLStats *sslocal.SQLStats) *PersistedSQLStats {
 
 // Start implements sqlstats.Provider interface.
 func (s *PersistedSQLStats) Start(ctx context.Context, stopper *stop.Stopper) {
-	s.SQLStats.Start(ctx, stopper)
 	s.startSQLStatsFlushLoop(ctx, stopper)
 	s.jobMonitor.start(ctx, stopper)
 }
diff --git a/pkg/sql/sqlstats/sslocal/sql_stats.go b/pkg/sql/sqlstats/sslocal/sql_stats.go
index f609c8a4e559..be91205b8ce2 100644
--- a/pkg/sql/sqlstats/sslocal/sql_stats.go
+++ b/pkg/sql/sqlstats/sslocal/sql_stats.go
@@ -38,10 +38,6 @@ type SQLStats struct {
 	// fingerprints we can store in memory.
 	uniqueTxnFingerprintLimit *settings.IntSetting
 
-	// resetInterval is the interval of how long before the in-memory stats are
-	// being reset.
-	resetInterval *settings.DurationSetting
-
 	mu struct {
 		syncutil.Mutex
 
@@ -78,7 +74,6 @@ func newSQLStats(
 	curMemBytesCount *metric.Gauge,
 	maxMemBytesHist *metric.Histogram,
 	parentMon *mon.BytesMonitor,
-	resetInterval *settings.DurationSetting,
 	flushTarget Sink,
 	knobs *sqlstats.TestingKnobs,
 ) *SQLStats {
@@ -95,7 +90,6 @@ func newSQLStats(
 		st:                         st,
 		uniqueStmtFingerprintLimit: uniqueStmtFingerprintLimit,
 		uniqueTxnFingerprintLimit:  uniqueTxnFingerprintLimit,
-		resetInterval:              resetInterval,
 		flushTarget:                flushTarget,
 		knobs:                      knobs,
 	}
diff --git a/pkg/sql/sqlstats/sslocal/sql_stats_test.go b/pkg/sql/sqlstats/sslocal/sql_stats_test.go
index 20525dc7dc2d..3ed7a0d390a6 100644
--- a/pkg/sql/sqlstats/sslocal/sql_stats_test.go
+++ b/pkg/sql/sqlstats/sslocal/sql_stats_test.go
@@ -437,7 +437,6 @@ func TestExplicitTxnFingerprintAccounting(t *testing.T) {
 		nil, /* curMemoryBytesCount */
 		nil, /* maxMemoryBytesHist */
 		monitor,
-		nil, /* resetInterval */
 		nil, /* reportingSink */
 		nil, /* knobs */
 	)
diff --git a/pkg/sql/sqlstats/sslocal/sslocal_provider.go b/pkg/sql/sqlstats/sslocal/sslocal_provider.go
index 691ed72982fc..c79d7314b88d 100644
--- a/pkg/sql/sqlstats/sslocal/sslocal_provider.go
+++ b/pkg/sql/sqlstats/sslocal/sslocal_provider.go
@@ -38,12 +38,11 @@ func New(
 	curMemoryBytesCount *metric.Gauge,
 	maxMemoryBytesHist *metric.Histogram,
 	pool *mon.BytesMonitor,
-	resetInterval *settings.DurationSetting,
 	reportingSink Sink,
 	knobs *sqlstats.TestingKnobs,
 ) *SQLStats {
 	return newSQLStats(settings, maxStmtFingerprints, maxTxnFingerprints,
-		curMemoryBytesCount, maxMemoryBytesHist, pool, resetInterval,
+		curMemoryBytesCount, maxMemoryBytesHist, pool,
 		reportingSink, knobs)
 }
 
@@ -59,18 +58,6 @@ func (s *SQLStats) GetController(
 
 // Start implements sqlstats.Provider interface.
 func (s *SQLStats) Start(ctx context.Context, stopper *stop.Stopper) {
-	if s.resetInterval != nil {
-		s.periodicallyClearSQLStats(ctx, stopper, s.resetInterval)
-	}
-	// Start a loop to clear SQL stats at the max reset interval. This is
-	// to ensure that we always have some worker clearing SQL stats to avoid
-	// continually allocating space for the SQL stats.
-	s.periodicallyClearSQLStats(ctx, stopper, sqlstats.MaxSQLStatReset)
-}
-
-func (s *SQLStats) periodicallyClearSQLStats(
-	ctx context.Context, stopper *stop.Stopper, resetInterval *settings.DurationSetting,
-) {
 	// We run a periodic async job to clean up the in-memory stats.
 	_ = stopper.RunAsyncTask(ctx, "sql-stats-clearer", func(ctx context.Context) {
 		var timer timeutil.Timer
@@ -79,13 +66,13 @@ func (s *SQLStats) periodicallyClearSQLStats(
 			last := s.mu.lastReset
 			s.mu.Unlock()
 
-			next := last.Add(resetInterval.Get(&s.st.SV))
+			next := last.Add(sqlstats.MaxSQLStatReset.Get(&s.st.SV))
 			wait := next.Sub(timeutil.Now())
 			if wait < 0 {
 				err := s.Reset(ctx)
 				if err != nil {
 					if log.V(1) {
-						log.Warningf(ctx, "reported SQL stats memory limit has been exceeded, some fingerprints stats are discarded: %s", err)
+						log.Warningf(ctx, "unexpected error: %s", err)
 					}
 				}
 			} else {