Skip to content

Commit

Permalink
pkg/sql/sqlstats: counter metric for ignore flush done signals
Browse files Browse the repository at this point in the history
Addresses: cockroachdb#119779

We've had escalations recently involving the SQL activity update job
running for extended periods of time, such that the signal made to the
job indicating a flush has completed was not received because there was
no listener.

While we've added a default case to prevent this from hanging the flush
job, and some logging to go with it, a counter metric indicating when
this occurs would also be useful to have when debugging.

This patch adds such a counter.

Release note (ops change): A new counter metric,
`sql.stats.flush.done_signals.ignored`, has been introduced. The metric
tracks the number of times the SQL Stats activity update job ignored
the signal sent to it indicating a flush has completed. This may
indicate that the SQL Activity update job is taking longer than expected
to complete.
  • Loading branch information
abarganier committed Mar 18, 2024
1 parent d4fdd3f commit 2c4078e
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 15 deletions.
1 change: 1 addition & 0 deletions docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -1469,6 +1469,7 @@
<tr><td>APPLICATION</td><td>sql.stats.cleanup.rows_removed</td><td>Number of stale statistics rows that are removed</td><td>SQL Stats Cleanup</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.discarded.current</td><td>Number of fingerprint statistics being discarded</td><td>Discarded SQL Stats</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.count</td><td>Number of times SQL Stats are flushed to persistent storage</td><td>SQL Stats Flush</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.done_signals.ignored</td><td>Number of times the SQL Stats activity update job ignored the signal sent to it indicating a flush has completed</td><td>flush done signals ignored</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.duration</td><td>Time took to in nanoseconds to complete SQL Stats flush</td><td>SQL Stats Flush</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.error</td><td>Number of errors encountered when flushing SQL Stats</td><td>SQL Stats Flush</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.mem.current</td><td>Current memory usage for fingerprint storage</td><td>Memory</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
Expand Down
19 changes: 11 additions & 8 deletions pkg/sql/conn_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -484,13 +484,14 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
DB: NewInternalDB(
s, MemoryMetrics{}, sqlStatsInternalExecutorMonitor,
),
ClusterID: s.cfg.NodeInfo.LogicalClusterID,
SQLIDContainer: cfg.NodeInfo.NodeID,
JobRegistry: s.cfg.JobRegistry,
Knobs: cfg.SQLStatsTestingKnobs,
FlushCounter: serverMetrics.StatsMetrics.SQLStatsFlushStarted,
FailureCounter: serverMetrics.StatsMetrics.SQLStatsFlushFailure,
FlushDuration: serverMetrics.StatsMetrics.SQLStatsFlushDuration,
ClusterID: s.cfg.NodeInfo.LogicalClusterID,
SQLIDContainer: cfg.NodeInfo.NodeID,
JobRegistry: s.cfg.JobRegistry,
Knobs: cfg.SQLStatsTestingKnobs,
FlushCounter: serverMetrics.StatsMetrics.SQLStatsFlushStarted,
FlushDoneSignalsIgnored: serverMetrics.StatsMetrics.SQLStatsFlushDoneSignalsIgnored,
FailureCounter: serverMetrics.StatsMetrics.SQLStatsFlushFailure,
FlushDuration: serverMetrics.StatsMetrics.SQLStatsFlushDuration,
}, memSQLStats)

s.sqlStats = persistedSQLStats
Expand Down Expand Up @@ -588,7 +589,9 @@ func makeServerMetrics(cfg *ExecutorConfig) ServerMetrics {
ReportedSQLStatsMemoryCurBytesCount: metric.NewGauge(MetaReportedSQLStatsMemCurBytes),
DiscardedStatsCount: metric.NewCounter(MetaDiscardedSQLStats),
SQLStatsFlushStarted: metric.NewCounter(MetaSQLStatsFlushStarted),
SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure),
SQLStatsFlushDoneSignalsIgnored: metric.NewCounter(MetaSQLStatsFlushDoneSignalsIgnored),

SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure),
SQLStatsFlushDuration: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: MetaSQLStatsFlushDuration,
Expand Down
9 changes: 9 additions & 0 deletions pkg/sql/exec_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
io_prometheus_client "github.com/prometheus/client_model/go"
)

func init() {
Expand Down Expand Up @@ -1093,6 +1094,14 @@ var (
Measurement: "SQL Stats Flush",
Unit: metric.Unit_COUNT,
}
MetaSQLStatsFlushDoneSignalsIgnored = metric.Metadata{
Name: "sql.stats.flush.done_signals.ignored",
Help: "Number of times the SQL Stats activity update job ignored the signal sent to it indicating " +
"a flush has completed",
Measurement: "flush done signals ignored",
Unit: metric.Unit_COUNT,
MetricType: io_prometheus_client.MetricType_COUNTER,
}
MetaSQLStatsFlushFailure = metric.Metadata{
Name: "sql.stats.flush.error",
Help: "Number of errors encountered when flushing SQL Stats",
Expand Down
9 changes: 5 additions & 4 deletions pkg/sql/executor_statement_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,11 @@ type StatsMetrics struct {

DiscardedStatsCount *metric.Counter

SQLStatsFlushStarted *metric.Counter
SQLStatsFlushFailure *metric.Counter
SQLStatsFlushDuration metric.IHistogram
SQLStatsRemovedRows *metric.Counter
SQLStatsFlushStarted *metric.Counter
SQLStatsFlushDoneSignalsIgnored *metric.Counter
SQLStatsFlushFailure *metric.Counter
SQLStatsFlushDuration metric.IHistogram
SQLStatsRemovedRows *metric.Counter

SQLTxnStatsCollectionOverhead metric.IHistogram
}
Expand Down
8 changes: 5 additions & 3 deletions pkg/sql/sqlstats/persistedsqlstats/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,10 @@ type Config struct {
JobRegistry *jobs.Registry

// Metrics.
FlushCounter *metric.Counter
FlushDuration metric.IHistogram
FailureCounter *metric.Counter
FlushCounter *metric.Counter
FlushDuration metric.IHistogram
FlushDoneSignalsIgnored *metric.Counter
FailureCounter *metric.Counter

// Testing knobs.
Knobs *sqlstats.TestingKnobs
Expand Down Expand Up @@ -211,6 +212,7 @@ func (s *PersistedSQLStats) startSQLStatsFlushLoop(ctx context.Context, stopper
// Don't block the flush loop if the sql activity update job is not
// ready to receive. We should at least continue to collect and flush
// stats for this node.
s.cfg.FlushDoneSignalsIgnored.Inc(1)
log.Warning(ctx, "sql-stats-worker: unable to signal flush completion")
}
}
Expand Down

0 comments on commit 2c4078e

Please sign in to comment.