sql: remove diagnostics.sql_stat_reset.interval

Previously, the SQL Stats system reset its in-memory SQL Stats periodically, determined by the diagnostics.sql_stat_reset.interval cluster setting. This periodic reset loop was introduced early in 21.1 prior to persisted SQL Stats. In 21.2, we introduced persisted SQL Stats and along with a perodic task to flush in-memory SQL Stats into system tables. These two async tasks' coexistence introduced unwanted race conditions. If the SQL Stats reset task reset the in-memory store before the flusher could flush it to the system table, those particular stats would be lost forever. This commit changes the .Start() method in PersistedSQLStats to avoid this race condition. However, reported SQL Stats still periodically resets its in-memory storage in case the telemetry server goes offline. As the result, diagnostics.sql_stats_reset.interval cluster setting is removed. diagnostics.forced_sql_stats_reset.interval cluster setting now only controls the reset of the *reported* SQL Stats if it is not collected by the telemetry reporter. Release note (sql change): Previously, in-memory SQL Stats was reset periodically. This behavior is now removed since persisted SQL Stats is introduced in 21.2. As the result, diagnostics.sql_stats_reset.interval cluster setting is removed. diagnostics.forced_sql_stats_reset.interval now only controls the reset of the reported SQL Stats if it is not collected by the telemetry reporter.
cockroachdb · Nov 9, 2021 · af1866e · af1866e
1 parent 6c996e3
commit af1866e
Show file tree

Hide file tree

Showing 10 changed files with 14 additions and 49 deletions.
diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt
@@ -11,10 +11,9 @@ cloudstorage.http.custom_ca	string		custom root CA (appended to system's default
 cloudstorage.timeout	duration	10m0s	the timeout for import/export storage operations
 cluster.organization	string		organization name
 cluster.preserve_downgrade_option	string		disable (automatic or manual) cluster version upgrade from the specified version until reset
-diagnostics.forced_sql_stat_reset.interval	duration	2h0m0s	interval after which SQL statement statistics are refreshed even if not collected (should be more than diagnostics.sql_stat_reset.interval). It has a max value of 24H.
+diagnostics.forced_sql_stat_reset.interval	duration	2h0m0s	interval after which the reported SQL Stats are reset even if not collected by telemetry reporter. It has a max value of 24H.
 diagnostics.reporting.enabled	boolean	true	enable reporting diagnostic metrics to cockroach labs
 diagnostics.reporting.interval	duration	1h0m0s	interval at which diagnostics data should be reported
-diagnostics.sql_stat_reset.interval	duration	1h0m0s	interval controlling how often SQL statement statistics should be reset (should be less than diagnostics.forced_sql_stat_reset.interval). It has a max value of 24H.
 enterprise.license	string		the encoded cluster license
 external.graphite.endpoint	string		if nonempty, push server metrics to the Graphite or Carbon server at the specified host:port
 external.graphite.interval	duration	10s	the interval at which metrics are pushed to Graphite (if enabled)

diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html
@@ -14,10 +14,9 @@
 <tr><td><code>cluster.organization</code></td><td>string</td><td><code></code></td><td>organization name</td></tr>
 <tr><td><code>cluster.preserve_downgrade_option</code></td><td>string</td><td><code></code></td><td>disable (automatic or manual) cluster version upgrade from the specified version until reset</td></tr>
 <tr><td><code>diagnostics.active_query_dumps.enabled</code></td><td>boolean</td><td><code>true</code></td><td>experimental: enable dumping of anonymized active queries to disk when node is under memory pressure</td></tr>
-<tr><td><code>diagnostics.forced_sql_stat_reset.interval</code></td><td>duration</td><td><code>2h0m0s</code></td><td>interval after which SQL statement statistics are refreshed even if not collected (should be more than diagnostics.sql_stat_reset.interval). It has a max value of 24H.</td></tr>
+<tr><td><code>diagnostics.forced_sql_stat_reset.interval</code></td><td>duration</td><td><code>2h0m0s</code></td><td>interval after which the reported SQL Stats are reset even if not collected by telemetry reporter. It has a max value of 24H.</td></tr>
 <tr><td><code>diagnostics.reporting.enabled</code></td><td>boolean</td><td><code>true</code></td><td>enable reporting diagnostic metrics to cockroach labs</td></tr>
 <tr><td><code>diagnostics.reporting.interval</code></td><td>duration</td><td><code>1h0m0s</code></td><td>interval at which diagnostics data should be reported</td></tr>
-<tr><td><code>diagnostics.sql_stat_reset.interval</code></td><td>duration</td><td><code>1h0m0s</code></td><td>interval controlling how often SQL statement statistics should be reset (should be less than diagnostics.forced_sql_stat_reset.interval). It has a max value of 24H.</td></tr>
 <tr><td><code>enterprise.license</code></td><td>string</td><td><code></code></td><td>the encoded cluster license</td></tr>
 <tr><td><code>external.graphite.endpoint</code></td><td>string</td><td><code></code></td><td>if nonempty, push server metrics to the Graphite or Carbon server at the specified host:port</td></tr>
 <tr><td><code>external.graphite.interval</code></td><td>duration</td><td><code>10s</code></td><td>the interval at which metrics are pushed to Graphite (if enabled)</td></tr>

diff --git a/pkg/server/stats_test.go b/pkg/server/stats_test.go
@@ -140,7 +140,7 @@ func TestEnsureSQLStatsAreFlushedForTelemetry(t *testing.T) {
 
 	statusServer := s.(*TestServer).status
 	sqlServer := s.(*TestServer).Server.sqlServer.pgServer.SQLServer
-	sqlServer.GetSQLStatsController().ResetLocalSQLStats(ctx)
+	sqlServer.GetSQLStatsProvider().(*persistedsqlstats.PersistedSQLStats).Flush(ctx)
 	testutils.SucceedsSoon(t, func() error {
 		// Get the diagnostic info.
 		res, err := statusServer.Diagnostics(ctx, &serverpb.DiagnosticsRequest{NodeId: "local"})

diff --git a/pkg/settings/registry.go b/pkg/settings/registry.go
@@ -103,6 +103,7 @@ var retiredSettings = map[string]struct{}{
 	"kv.closed_timestamp.close_fraction":                             {},
 	"sql.telemetry.query_sampling.qps_threshold":                     {},
 	"sql.telemetry.query_sampling.sample_rate":                       {},
+	"diagnostics.sql_stat_reset.interval":                            {},
 
 	// removed as of 22.1.
 	"sql.defaults.drop_enum_value.enabled":                             {},

diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go
@@ -326,7 +326,6 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
 		metrics.StatsMetrics.ReportedSQLStatsMemoryCurBytesCount,
 		metrics.StatsMetrics.ReportedSQLStatsMemoryMaxBytesHist,
 		pool,
-		nil, /* resetInterval */
 		nil, /* reportedProvider */
 		cfg.SQLStatsTestingKnobs,
 	)
@@ -339,7 +338,6 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
 		metrics.StatsMetrics.SQLStatsMemoryCurBytesCount,
 		metrics.StatsMetrics.SQLStatsMemoryMaxBytesHist,
 		pool,
-		sqlstats.SQLStatReset,
 		reportedSQLStats,
 		cfg.SQLStatsTestingKnobs,
 	)
@@ -446,12 +444,11 @@ func makeMetrics(cfg *ExecutorConfig, internal bool) Metrics {
 // Start starts the Server's background processing.
 func (s *Server) Start(ctx context.Context, stopper *stop.Stopper) {
 	s.indexUsageStats.Start(ctx, stopper)
-	// Start a loop to clear SQL stats at the max reset interval. This is
-	// to ensure that we always have some worker clearing SQL stats to avoid
-	// continually allocating space for the SQL stats. Additionally, spawn
-	// a loop to clear the reported stats at the same large interval just
-	// in case the telemetry worker fails.
 	s.sqlStats.Start(ctx, stopper)
+
+	// reportedStats is periodically cleared to prevent too many SQL Stats
+	// accumulated in the reporter when the telemetry server fails.
+	// Usually it is telemetry's reporter's job to clear the reporting SQL Stats.
 	s.reportedStats.Start(ctx, stopper)
 }
 

diff --git a/pkg/sql/sqlstats/cluster_settings.go b/pkg/sql/sqlstats/cluster_settings.go
@@ -137,22 +137,12 @@ var MaxSQLStatsStmtFingerprintsPerExplicitTxn = settings.RegisterIntSetting(
 	2000,
 )
 
-// SQLStatReset is the cluster setting that controls at what interval SQL
-// statement statistics should be reset.
-var SQLStatReset = settings.RegisterDurationSetting(
-	"diagnostics.sql_stat_reset.interval",
-	"interval controlling how often SQL statement statistics should "+
-		"be reset (should be less than diagnostics.forced_sql_stat_reset.interval). It has a max value of 24H.",
-	time.Hour,
-	settings.NonNegativeDurationWithMaximum(time.Hour*24),
-).WithPublic()
-
 // MaxSQLStatReset is the cluster setting that controls at what interval SQL
 // statement statistics must be flushed within.
 var MaxSQLStatReset = settings.RegisterDurationSetting(
 	"diagnostics.forced_sql_stat_reset.interval",
-	"interval after which SQL statement statistics are refreshed even "+
-		"if not collected (should be more than diagnostics.sql_stat_reset.interval). It has a max value of 24H.",
-	time.Hour*2, // 2 x diagnostics.sql_stat_reset.interval
+	"interval after which the reported SQL Stats are reset even "+
+		"if not collected by telemetry reporter. It has a max value of 24H.",
+	time.Hour*2,
 	settings.NonNegativeDurationWithMaximum(time.Hour*24),
 ).WithPublic()
diff --git a/pkg/sql/sqlstats/persistedsqlstats/provider.go b/pkg/sql/sqlstats/persistedsqlstats/provider.go
@@ -110,7 +110,6 @@ func New(cfg *Config, memSQLStats *sslocal.SQLStats) *PersistedSQLStats {
 
 // Start implements sqlstats.Provider interface.
 func (s *PersistedSQLStats) Start(ctx context.Context, stopper *stop.Stopper) {
-	s.SQLStats.Start(ctx, stopper)
 	s.startSQLStatsFlushLoop(ctx, stopper)
 	s.jobMonitor.start(ctx, stopper)
 }

diff --git a/pkg/sql/sqlstats/sslocal/sql_stats.go b/pkg/sql/sqlstats/sslocal/sql_stats.go
@@ -38,10 +38,6 @@ type SQLStats struct {
 	// fingerprints we can store in memory.
 	uniqueTxnFingerprintLimit *settings.IntSetting
 
-	// resetInterval is the interval of how long before the in-memory stats are
-	// being reset.
-	resetInterval *settings.DurationSetting
-
 	mu struct {
 		syncutil.Mutex
 
@@ -78,7 +74,6 @@ func newSQLStats(
 	curMemBytesCount *metric.Gauge,
 	maxMemBytesHist *metric.Histogram,
 	parentMon *mon.BytesMonitor,
-	resetInterval *settings.DurationSetting,
 	flushTarget Sink,
 	knobs *sqlstats.TestingKnobs,
 ) *SQLStats {
@@ -95,7 +90,6 @@ func newSQLStats(
 		st:                         st,
 		uniqueStmtFingerprintLimit: uniqueStmtFingerprintLimit,
 		uniqueTxnFingerprintLimit:  uniqueTxnFingerprintLimit,
-		resetInterval:              resetInterval,
 		flushTarget:                flushTarget,
 		knobs:                      knobs,
 	}

diff --git a/pkg/sql/sqlstats/sslocal/sql_stats_test.go b/pkg/sql/sqlstats/sslocal/sql_stats_test.go
@@ -437,7 +437,6 @@ func TestExplicitTxnFingerprintAccounting(t *testing.T) {
 		nil, /* curMemoryBytesCount */
 		nil, /* maxMemoryBytesHist */
 		monitor,
-		nil, /* resetInterval */
 		nil, /* reportingSink */
 		nil, /* knobs */
 	)

diff --git a/pkg/sql/sqlstats/sslocal/sslocal_provider.go b/pkg/sql/sqlstats/sslocal/sslocal_provider.go
@@ -38,12 +38,11 @@ func New(
 	curMemoryBytesCount *metric.Gauge,
 	maxMemoryBytesHist *metric.Histogram,
 	pool *mon.BytesMonitor,
-	resetInterval *settings.DurationSetting,
 	reportingSink Sink,
 	knobs *sqlstats.TestingKnobs,
 ) *SQLStats {
 	return newSQLStats(settings, maxStmtFingerprints, maxTxnFingerprints,
-		curMemoryBytesCount, maxMemoryBytesHist, pool, resetInterval,
+		curMemoryBytesCount, maxMemoryBytesHist, pool,
 		reportingSink, knobs)
 }
 
@@ -59,18 +58,6 @@ func (s *SQLStats) GetController(
 
 // Start implements sqlstats.Provider interface.
 func (s *SQLStats) Start(ctx context.Context, stopper *stop.Stopper) {
-	if s.resetInterval != nil {
-		s.periodicallyClearSQLStats(ctx, stopper, s.resetInterval)
-	}
-	// Start a loop to clear SQL stats at the max reset interval. This is
-	// to ensure that we always have some worker clearing SQL stats to avoid
-	// continually allocating space for the SQL stats.
-	s.periodicallyClearSQLStats(ctx, stopper, sqlstats.MaxSQLStatReset)
-}
-
-func (s *SQLStats) periodicallyClearSQLStats(
-	ctx context.Context, stopper *stop.Stopper, resetInterval *settings.DurationSetting,
-) {
 	// We run a periodic async job to clean up the in-memory stats.
 	_ = stopper.RunAsyncTask(ctx, "sql-stats-clearer", func(ctx context.Context) {
 		var timer timeutil.Timer
@@ -79,13 +66,13 @@ func (s *SQLStats) periodicallyClearSQLStats(
 			last := s.mu.lastReset
 			s.mu.Unlock()
 
-			next := last.Add(resetInterval.Get(&s.st.SV))
+			next := last.Add(sqlstats.MaxSQLStatReset.Get(&s.st.SV))
 			wait := next.Sub(timeutil.Now())
 			if wait < 0 {
 				err := s.Reset(ctx)
 				if err != nil {
 					if log.V(1) {
-						log.Warningf(ctx, "reported SQL stats memory limit has been exceeded, some fingerprints stats are discarded: %s", err)
+						log.Warningf(ctx, "unexpected error: %s", err)
 					}
 				}
 			} else {