Skip to content

Commit

Permalink
sql: remove diagnostics.sql_stat_reset.interval
Browse files Browse the repository at this point in the history
Previously, SQL Stats resets its in-memory SQL Stats
periodically (determined by diagnostics.sql_stat_reset.interval
cluster setting). The periodic reset loop was introduced early
in 21.1 prior to persisted SQL Stats. In 21.2, we introduced
persisted SQL Stats and also runs perodic task that flushes
in-memory SQL Stats into system tables. This two async tasks'
coexistence introduced unwanted race conditions. If the SQL
Stats reset task resets the in-memory store before the
flusher can flush it to system table, then that particular stats
would be lost forever.
This commit change the .Start() method in PersistedSQLStats to
avoid this race condition. However, reported SQL Stats still
periodically resets its in-memory storage in case the telemetry
server goes offline.

Release note (sql change): diagnostics.sql_stats_reset.interval
is removed.
  • Loading branch information
Azhng committed Oct 15, 2021
1 parent 0984f87 commit d337fdd
Show file tree
Hide file tree
Showing 10 changed files with 9 additions and 44 deletions.
1 change: 0 additions & 1 deletion docs/generated/settings/settings-for-tenants.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ cluster.preserve_downgrade_option string disable (automatic or manual) cluster
diagnostics.forced_sql_stat_reset.interval duration 2h0m0s interval after which SQL statement statistics are refreshed even if not collected (should be more than diagnostics.sql_stat_reset.interval). It has a max value of 24H.
diagnostics.reporting.enabled boolean true enable reporting diagnostic metrics to cockroach labs
diagnostics.reporting.interval duration 1h0m0s interval at which diagnostics data should be reported
diagnostics.sql_stat_reset.interval duration 1h0m0s interval controlling how often SQL statement statistics should be reset (should be less than diagnostics.forced_sql_stat_reset.interval). It has a max value of 24H.
enterprise.license string the encoded cluster license
external.graphite.endpoint string if nonempty, push server metrics to the Graphite or Carbon server at the specified host:port
external.graphite.interval duration 10s the interval at which metrics are pushed to Graphite (if enabled)
Expand Down
1 change: 0 additions & 1 deletion docs/generated/settings/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
<tr><td><code>diagnostics.forced_sql_stat_reset.interval</code></td><td>duration</td><td><code>2h0m0s</code></td><td>interval after which SQL statement statistics are refreshed even if not collected (should be more than diagnostics.sql_stat_reset.interval). It has a max value of 24H.</td></tr>
<tr><td><code>diagnostics.reporting.enabled</code></td><td>boolean</td><td><code>true</code></td><td>enable reporting diagnostic metrics to cockroach labs</td></tr>
<tr><td><code>diagnostics.reporting.interval</code></td><td>duration</td><td><code>1h0m0s</code></td><td>interval at which diagnostics data should be reported</td></tr>
<tr><td><code>diagnostics.sql_stat_reset.interval</code></td><td>duration</td><td><code>1h0m0s</code></td><td>interval controlling how often SQL statement statistics should be reset (should be less than diagnostics.forced_sql_stat_reset.interval). It has a max value of 24H.</td></tr>
<tr><td><code>enterprise.license</code></td><td>string</td><td><code></code></td><td>the encoded cluster license</td></tr>
<tr><td><code>external.graphite.endpoint</code></td><td>string</td><td><code></code></td><td>if nonempty, push server metrics to the Graphite or Carbon server at the specified host:port</td></tr>
<tr><td><code>external.graphite.interval</code></td><td>duration</td><td><code>10s</code></td><td>the interval at which metrics are pushed to Graphite (if enabled)</td></tr>
Expand Down
2 changes: 1 addition & 1 deletion pkg/server/stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ func TestEnsureSQLStatsAreFlushedForTelemetry(t *testing.T) {

statusServer := s.(*TestServer).status
sqlServer := s.(*TestServer).Server.sqlServer.pgServer.SQLServer
sqlServer.GetSQLStatsController().ResetLocalSQLStats(ctx)
sqlServer.GetSQLStatsProvider().(*persistedsqlstats.PersistedSQLStats).Flush(ctx)
testutils.SucceedsSoon(t, func() error {
// Get the diagnostic info.
res, err := statusServer.Diagnostics(ctx, &serverpb.DiagnosticsRequest{NodeId: "local"})
Expand Down
1 change: 1 addition & 0 deletions pkg/settings/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ var retiredSettings = map[string]struct{}{
"kv.closed_timestamp.close_fraction": {},
"sql.telemetry.query_sampling.qps_threshold": {},
"sql.telemetry.query_sampling.sample_rate": {},
"diagnostics.sql_stat_reset.interval": {},

// removed as of 22.1.
"sql.defaults.drop_enum_value.enabled": {},
Expand Down
11 changes: 4 additions & 7 deletions pkg/sql/conn_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,6 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
metrics.StatsMetrics.ReportedSQLStatsMemoryCurBytesCount,
metrics.StatsMetrics.ReportedSQLStatsMemoryMaxBytesHist,
pool,
nil, /* resetInterval */
nil, /* reportedProvider */
cfg.SQLStatsTestingKnobs,
)
Expand All @@ -338,7 +337,6 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
metrics.StatsMetrics.SQLStatsMemoryCurBytesCount,
metrics.StatsMetrics.SQLStatsMemoryMaxBytesHist,
pool,
sqlstats.SQLStatReset,
reportedSQLStats,
cfg.SQLStatsTestingKnobs,
)
Expand Down Expand Up @@ -444,12 +442,11 @@ func makeMetrics(cfg *ExecutorConfig, internal bool) Metrics {
// Start starts the Server's background processing.
func (s *Server) Start(ctx context.Context, stopper *stop.Stopper) {
s.indexUsageStats.Start(ctx, stopper)
// Start a loop to clear SQL stats at the max reset interval. This is
// to ensure that we always have some worker clearing SQL stats to avoid
// continually allocating space for the SQL stats. Additionally, spawn
// a loop to clear the reported stats at the same large interval just
// in case the telemetry worker fails.
s.sqlStats.Start(ctx, stopper)

// reportedStats is periodically cleared to prevent too much SQL Stats
// accumulated in the reporter when the telemetry server fails.
// Usually it is telemetry's reporter's job to clear the reporting SQL Stats.
s.reportedStats.Start(ctx, stopper)
}

Expand Down
10 changes: 0 additions & 10 deletions pkg/sql/sqlstats/cluster_settings.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,16 +137,6 @@ var MaxSQLStatsStmtFingerprintsPerExplicitTxn = settings.RegisterIntSetting(
2000,
)

// SQLStatReset is the cluster setting that controls at what interval SQL
// statement statistics should be reset.
var SQLStatReset = settings.RegisterDurationSetting(
"diagnostics.sql_stat_reset.interval",
"interval controlling how often SQL statement statistics should "+
"be reset (should be less than diagnostics.forced_sql_stat_reset.interval). It has a max value of 24H.",
time.Hour,
settings.NonNegativeDurationWithMaximum(time.Hour*24),
).WithPublic()

// MaxSQLStatReset is the cluster setting that controls at what interval SQL
// statement statistics must be flushed within.
var MaxSQLStatReset = settings.RegisterDurationSetting(
Expand Down
1 change: 0 additions & 1 deletion pkg/sql/sqlstats/persistedsqlstats/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,6 @@ func New(cfg *Config, memSQLStats *sslocal.SQLStats) *PersistedSQLStats {

// Start implements sqlstats.Provider interface.
func (s *PersistedSQLStats) Start(ctx context.Context, stopper *stop.Stopper) {
s.SQLStats.Start(ctx, stopper)
s.startSQLStatsFlushLoop(ctx, stopper)
s.jobMonitor.start(ctx, stopper)
}
Expand Down
6 changes: 0 additions & 6 deletions pkg/sql/sqlstats/sslocal/sql_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,6 @@ type SQLStats struct {
// fingerprints we can store in memory.
uniqueTxnFingerprintLimit *settings.IntSetting

// resetInterval is the interval of how long before the in-memory stats are
// being reset.
resetInterval *settings.DurationSetting

mu struct {
syncutil.Mutex

Expand Down Expand Up @@ -78,7 +74,6 @@ func newSQLStats(
curMemBytesCount *metric.Gauge,
maxMemBytesHist *metric.Histogram,
parentMon *mon.BytesMonitor,
resetInterval *settings.DurationSetting,
flushTarget Sink,
knobs *sqlstats.TestingKnobs,
) *SQLStats {
Expand All @@ -95,7 +90,6 @@ func newSQLStats(
st: st,
uniqueStmtFingerprintLimit: uniqueStmtFingerprintLimit,
uniqueTxnFingerprintLimit: uniqueTxnFingerprintLimit,
resetInterval: resetInterval,
flushTarget: flushTarget,
knobs: knobs,
}
Expand Down
1 change: 0 additions & 1 deletion pkg/sql/sqlstats/sslocal/sql_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,6 @@ func TestExplicitTxnFingerprintAccounting(t *testing.T) {
nil, /* curMemoryBytesCount */
nil, /* maxMemoryBytesHist */
monitor,
nil, /* resetInterval */
nil, /* reportingSink */
nil, /* knobs */
)
Expand Down
19 changes: 3 additions & 16 deletions pkg/sql/sqlstats/sslocal/sslocal_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,11 @@ func New(
curMemoryBytesCount *metric.Gauge,
maxMemoryBytesHist *metric.Histogram,
pool *mon.BytesMonitor,
resetInterval *settings.DurationSetting,
reportingSink Sink,
knobs *sqlstats.TestingKnobs,
) *SQLStats {
return newSQLStats(settings, maxStmtFingerprints, maxTxnFingerprints,
curMemoryBytesCount, maxMemoryBytesHist, pool, resetInterval,
curMemoryBytesCount, maxMemoryBytesHist, pool,
reportingSink, knobs)
}

Expand All @@ -59,18 +58,6 @@ func (s *SQLStats) GetController(

// Start implements sqlstats.Provider interface.
func (s *SQLStats) Start(ctx context.Context, stopper *stop.Stopper) {
if s.resetInterval != nil {
s.periodicallyClearSQLStats(ctx, stopper, s.resetInterval)
}
// Start a loop to clear SQL stats at the max reset interval. This is
// to ensure that we always have some worker clearing SQL stats to avoid
// continually allocating space for the SQL stats.
s.periodicallyClearSQLStats(ctx, stopper, sqlstats.MaxSQLStatReset)
}

func (s *SQLStats) periodicallyClearSQLStats(
ctx context.Context, stopper *stop.Stopper, resetInterval *settings.DurationSetting,
) {
// We run a periodic async job to clean up the in-memory stats.
_ = stopper.RunAsyncTask(ctx, "sql-stats-clearer", func(ctx context.Context) {
var timer timeutil.Timer
Expand All @@ -79,13 +66,13 @@ func (s *SQLStats) periodicallyClearSQLStats(
last := s.mu.lastReset
s.mu.Unlock()

next := last.Add(resetInterval.Get(&s.st.SV))
next := last.Add(sqlstats.MaxSQLStatReset.Get(&s.st.SV))
wait := next.Sub(timeutil.Now())
if wait < 0 {
err := s.Reset(ctx)
if err != nil {
if log.V(1) {
log.Warningf(ctx, "reported SQL stats memory limit has been exceeded, some fingerprints stats are discarded: %s", err)
log.Warningf(ctx, "unexpected error: %s", err)
}
}
} else {
Expand Down

0 comments on commit d337fdd

Please sign in to comment.