diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html index ca5dff454aa0..86008a8bd984 100644 --- a/docs/generated/metrics/metrics.html +++ b/docs/generated/metrics/metrics.html @@ -1469,6 +1469,7 @@ APPLICATIONsql.stats.cleanup.rows_removedNumber of stale statistics rows that are removedSQL Stats CleanupCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.stats.discarded.currentNumber of fingerprint statistics being discardedDiscarded SQL StatsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.stats.flush.countNumber of times SQL Stats are flushed to persistent storageSQL Stats FlushCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE +APPLICATIONsql.stats.flush.done_signals.ignoredNumber of times the SQL Stats activity update job ignored the signal sent to it indicating a flush has completedflush done signals ignoredCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.stats.flush.durationTime took to in nanoseconds to complete SQL Stats flushSQL Stats FlushHISTOGRAMNANOSECONDSAVGNONE APPLICATIONsql.stats.flush.errorNumber of errors encountered when flushing SQL StatsSQL Stats FlushCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.stats.mem.currentCurrent memory usage for fingerprint storageMemoryGAUGEBYTESAVGNONE diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index 35a661d5a526..f3128abd4872 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -484,13 +484,14 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server { DB: NewInternalDB( s, MemoryMetrics{}, sqlStatsInternalExecutorMonitor, ), - ClusterID: s.cfg.NodeInfo.LogicalClusterID, - SQLIDContainer: cfg.NodeInfo.NodeID, - JobRegistry: s.cfg.JobRegistry, - Knobs: cfg.SQLStatsTestingKnobs, - FlushCounter: serverMetrics.StatsMetrics.SQLStatsFlushStarted, - FailureCounter: serverMetrics.StatsMetrics.SQLStatsFlushFailure, - FlushDuration: serverMetrics.StatsMetrics.SQLStatsFlushDuration, + ClusterID: s.cfg.NodeInfo.LogicalClusterID, + SQLIDContainer: cfg.NodeInfo.NodeID, + JobRegistry: s.cfg.JobRegistry, + Knobs: cfg.SQLStatsTestingKnobs, + FlushCounter: serverMetrics.StatsMetrics.SQLStatsFlushStarted, + FlushDoneSignalsIgnored: serverMetrics.StatsMetrics.SQLStatsFlushDoneSignalsIgnored, + FailureCounter: serverMetrics.StatsMetrics.SQLStatsFlushFailure, + FlushDuration: serverMetrics.StatsMetrics.SQLStatsFlushDuration, }, memSQLStats) s.sqlStats = persistedSQLStats @@ -588,7 +589,9 @@ func makeServerMetrics(cfg *ExecutorConfig) ServerMetrics { ReportedSQLStatsMemoryCurBytesCount: metric.NewGauge(MetaReportedSQLStatsMemCurBytes), DiscardedStatsCount: metric.NewCounter(MetaDiscardedSQLStats), SQLStatsFlushStarted: metric.NewCounter(MetaSQLStatsFlushStarted), - SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure), + SQLStatsFlushDoneSignalsIgnored: metric.NewCounter(MetaSQLStatsFlushDoneSignalsIgnored), + + SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure), SQLStatsFlushDuration: metric.NewHistogram(metric.HistogramOptions{ Mode: metric.HistogramModePreferHdrLatency, Metadata: MetaSQLStatsFlushDuration, diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go index 7536ff992e33..84759a418956 100644 --- a/pkg/sql/exec_util.go +++ b/pkg/sql/exec_util.go @@ -125,6 +125,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" "github.com/cockroachdb/redact" + io_prometheus_client "github.com/prometheus/client_model/go" ) func init() { @@ -1093,6 +1094,14 @@ var ( Measurement: "SQL Stats Flush", Unit: metric.Unit_COUNT, } + MetaSQLStatsFlushDoneSignalsIgnored = metric.Metadata{ + Name: "sql.stats.flush.done_signals.ignored", + Help: "Number of times the SQL Stats activity update job ignored the signal sent to it indicating " + + "a flush has completed", + Measurement: "flush done signals ignored", + Unit: metric.Unit_COUNT, + MetricType: io_prometheus_client.MetricType_COUNTER, + } MetaSQLStatsFlushFailure = metric.Metadata{ Name: "sql.stats.flush.error", Help: "Number of errors encountered when flushing SQL Stats", diff --git a/pkg/sql/executor_statement_metrics.go b/pkg/sql/executor_statement_metrics.go index 948dce70a590..cb7f8314c346 100644 --- a/pkg/sql/executor_statement_metrics.go +++ b/pkg/sql/executor_statement_metrics.go @@ -75,10 +75,11 @@ type StatsMetrics struct { DiscardedStatsCount *metric.Counter - SQLStatsFlushStarted *metric.Counter - SQLStatsFlushFailure *metric.Counter - SQLStatsFlushDuration metric.IHistogram - SQLStatsRemovedRows *metric.Counter + SQLStatsFlushStarted *metric.Counter + SQLStatsFlushDoneSignalsIgnored *metric.Counter + SQLStatsFlushFailure *metric.Counter + SQLStatsFlushDuration metric.IHistogram + SQLStatsRemovedRows *metric.Counter SQLTxnStatsCollectionOverhead metric.IHistogram } diff --git a/pkg/sql/sqlstats/persistedsqlstats/provider.go b/pkg/sql/sqlstats/persistedsqlstats/provider.go index e78b06c8bbb6..5f038501396b 100644 --- a/pkg/sql/sqlstats/persistedsqlstats/provider.go +++ b/pkg/sql/sqlstats/persistedsqlstats/provider.go @@ -46,9 +46,10 @@ type Config struct { JobRegistry *jobs.Registry // Metrics. - FlushCounter *metric.Counter - FlushDuration metric.IHistogram - FailureCounter *metric.Counter + FlushCounter *metric.Counter + FlushDuration metric.IHistogram + FlushDoneSignalsIgnored *metric.Counter + FailureCounter *metric.Counter // Testing knobs. Knobs *sqlstats.TestingKnobs @@ -211,6 +212,7 @@ func (s *PersistedSQLStats) startSQLStatsFlushLoop(ctx context.Context, stopper // Don't block the flush loop if the sql activity update job is not // ready to receive. We should at least continue to collect and flush // stats for this node. + s.cfg.FlushDoneSignalsIgnored.Inc(1) log.Warning(ctx, "sql-stats-worker: unable to signal flush completion") } }