diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html
index ca5dff454aa0..86008a8bd984 100644
--- a/docs/generated/metrics/metrics.html
+++ b/docs/generated/metrics/metrics.html
@@ -1469,6 +1469,7 @@
APPLICATION | sql.stats.cleanup.rows_removed | Number of stale statistics rows that are removed | SQL Stats Cleanup | COUNTER | COUNT | AVG | NON_NEGATIVE_DERIVATIVE |
APPLICATION | sql.stats.discarded.current | Number of fingerprint statistics being discarded | Discarded SQL Stats | COUNTER | COUNT | AVG | NON_NEGATIVE_DERIVATIVE |
APPLICATION | sql.stats.flush.count | Number of times SQL Stats are flushed to persistent storage | SQL Stats Flush | COUNTER | COUNT | AVG | NON_NEGATIVE_DERIVATIVE |
+APPLICATION | sql.stats.flush.done_signals.ignored | Number of times the SQL Stats activity update job ignored the signal sent to it indicating a flush has completed | flush done signals ignored | COUNTER | COUNT | AVG | NON_NEGATIVE_DERIVATIVE |
APPLICATION | sql.stats.flush.duration | Time took to in nanoseconds to complete SQL Stats flush | SQL Stats Flush | HISTOGRAM | NANOSECONDS | AVG | NONE |
APPLICATION | sql.stats.flush.error | Number of errors encountered when flushing SQL Stats | SQL Stats Flush | COUNTER | COUNT | AVG | NON_NEGATIVE_DERIVATIVE |
APPLICATION | sql.stats.mem.current | Current memory usage for fingerprint storage | Memory | GAUGE | BYTES | AVG | NONE |
diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go
index 35a661d5a526..f3128abd4872 100644
--- a/pkg/sql/conn_executor.go
+++ b/pkg/sql/conn_executor.go
@@ -484,13 +484,14 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
DB: NewInternalDB(
s, MemoryMetrics{}, sqlStatsInternalExecutorMonitor,
),
- ClusterID: s.cfg.NodeInfo.LogicalClusterID,
- SQLIDContainer: cfg.NodeInfo.NodeID,
- JobRegistry: s.cfg.JobRegistry,
- Knobs: cfg.SQLStatsTestingKnobs,
- FlushCounter: serverMetrics.StatsMetrics.SQLStatsFlushStarted,
- FailureCounter: serverMetrics.StatsMetrics.SQLStatsFlushFailure,
- FlushDuration: serverMetrics.StatsMetrics.SQLStatsFlushDuration,
+ ClusterID: s.cfg.NodeInfo.LogicalClusterID,
+ SQLIDContainer: cfg.NodeInfo.NodeID,
+ JobRegistry: s.cfg.JobRegistry,
+ Knobs: cfg.SQLStatsTestingKnobs,
+ FlushCounter: serverMetrics.StatsMetrics.SQLStatsFlushStarted,
+ FlushDoneSignalsIgnored: serverMetrics.StatsMetrics.SQLStatsFlushDoneSignalsIgnored,
+ FailureCounter: serverMetrics.StatsMetrics.SQLStatsFlushFailure,
+ FlushDuration: serverMetrics.StatsMetrics.SQLStatsFlushDuration,
}, memSQLStats)
s.sqlStats = persistedSQLStats
@@ -588,7 +589,9 @@ func makeServerMetrics(cfg *ExecutorConfig) ServerMetrics {
ReportedSQLStatsMemoryCurBytesCount: metric.NewGauge(MetaReportedSQLStatsMemCurBytes),
DiscardedStatsCount: metric.NewCounter(MetaDiscardedSQLStats),
SQLStatsFlushStarted: metric.NewCounter(MetaSQLStatsFlushStarted),
- SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure),
+ SQLStatsFlushDoneSignalsIgnored: metric.NewCounter(MetaSQLStatsFlushDoneSignalsIgnored),
+
+ SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure),
SQLStatsFlushDuration: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: MetaSQLStatsFlushDuration,
diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go
index 7536ff992e33..84759a418956 100644
--- a/pkg/sql/exec_util.go
+++ b/pkg/sql/exec_util.go
@@ -125,6 +125,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
+ io_prometheus_client "github.com/prometheus/client_model/go"
)
func init() {
@@ -1093,6 +1094,14 @@ var (
Measurement: "SQL Stats Flush",
Unit: metric.Unit_COUNT,
}
+ MetaSQLStatsFlushDoneSignalsIgnored = metric.Metadata{ // Metadata for the counter bumped when the flush loop cannot signal flush completion.
+ Name: "sql.stats.flush.done_signals.ignored",
+ Help: "Number of times the SQL Stats activity update job ignored the signal sent to it indicating " +
+ "a flush has completed",
+ Measurement: "flush done signals ignored",
+ Unit: metric.Unit_COUNT,
+ MetricType: io_prometheus_client.MetricType_COUNTER, // Set explicitly here; NOTE(review): confirm whether metric.NewCounter already sets this.
+ }
MetaSQLStatsFlushFailure = metric.Metadata{
Name: "sql.stats.flush.error",
Help: "Number of errors encountered when flushing SQL Stats",
diff --git a/pkg/sql/executor_statement_metrics.go b/pkg/sql/executor_statement_metrics.go
index 948dce70a590..cb7f8314c346 100644
--- a/pkg/sql/executor_statement_metrics.go
+++ b/pkg/sql/executor_statement_metrics.go
@@ -75,10 +75,11 @@ type StatsMetrics struct {
DiscardedStatsCount *metric.Counter
- SQLStatsFlushStarted *metric.Counter
- SQLStatsFlushFailure *metric.Counter
- SQLStatsFlushDuration metric.IHistogram
- SQLStatsRemovedRows *metric.Counter
+ SQLStatsFlushStarted *metric.Counter
+ SQLStatsFlushDoneSignalsIgnored *metric.Counter
+ SQLStatsFlushFailure *metric.Counter
+ SQLStatsFlushDuration metric.IHistogram
+ SQLStatsRemovedRows *metric.Counter
SQLTxnStatsCollectionOverhead metric.IHistogram
}
diff --git a/pkg/sql/sqlstats/persistedsqlstats/provider.go b/pkg/sql/sqlstats/persistedsqlstats/provider.go
index e78b06c8bbb6..5f038501396b 100644
--- a/pkg/sql/sqlstats/persistedsqlstats/provider.go
+++ b/pkg/sql/sqlstats/persistedsqlstats/provider.go
@@ -46,9 +46,10 @@ type Config struct {
JobRegistry *jobs.Registry
// Metrics.
- FlushCounter *metric.Counter
- FlushDuration metric.IHistogram
- FailureCounter *metric.Counter
+ FlushCounter *metric.Counter
+ FlushDuration metric.IHistogram
+ FlushDoneSignalsIgnored *metric.Counter
+ FailureCounter *metric.Counter
// Testing knobs.
Knobs *sqlstats.TestingKnobs
@@ -211,6 +212,7 @@ func (s *PersistedSQLStats) startSQLStatsFlushLoop(ctx context.Context, stopper
// Don't block the flush loop if the sql activity update job is not
// ready to receive. We should at least continue to collect and flush
// stats for this node.
+ s.cfg.FlushDoneSignalsIgnored.Inc(1)
log.Warning(ctx, "sql-stats-worker: unable to signal flush completion")
}
}