From 7ee54cb3b2f463ce72b1a48eeac6119f08873cb3 Mon Sep 17 00:00:00 2001 From: Alex Barganier Date: Tue, 19 Mar 2024 13:35:54 -0400 Subject: [PATCH 1/2] pkg/sql/sqlstats: rework SQL stats flush metrics Informs: https://github.com/cockroachdb/cockroach/issues/119779 This patch renames a few of the core metrics used to track the SQL stats flush job to be more in-line with how the SQL activity update job's metrics work. Primarily, it renames them for consistency. It also changes the general `count` metric to only track successful executions, so we can more easily discern between failed and successful executions. Release note (ops change): Metrics used to track the SQL stats subsystem's task that flushes in-memory stats to persisted storage have been reworked slightly to be more consistent with other metrics used in the subsystem. The metrics are: - `sql.stats.flushes.successful`: Number of times SQL Stats are flushed successfully to persistent storage - `sql.stats.flushes.failed`: Number of attempted SQL Stats flushes that failed with errors - `sql.stats.flush.latency`: The latency of SQL Stats flushes to persistent storage. Includes failed flush attempts --- docs/generated/metrics/metrics.html | 6 ++--- pkg/sql/conn_executor.go | 14 +++++------ pkg/sql/exec_util.go | 24 +++++++++---------- pkg/sql/executor_statement_metrics.go | 6 ++--- pkg/sql/sqlstats/persistedsqlstats/flush.go | 7 +++--- .../sqlstats/persistedsqlstats/provider.go | 6 ++--- 6 files changed, 32 insertions(+), 31 deletions(-) diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html index c420e3e73441..7507941d10e2 100644 --- a/docs/generated/metrics/metrics.html +++ b/docs/generated/metrics/metrics.html @@ -1478,11 +1478,11 @@ APPLICATIONsql.stats.activity.updates.successfulNumber of successful updates made by the SQL activity updater jobsuccessful updatesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.stats.cleanup.rows_removedNumber of stale statistics rows that are removedSQL Stats CleanupCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.stats.discarded.currentNumber of fingerprint statistics being discardedDiscarded SQL StatsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE -APPLICATIONsql.stats.flush.countNumber of times SQL Stats are flushed to persistent storageSQL Stats FlushCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.stats.flush.done_signals.ignoredNumber of times the SQL Stats activity update job ignored the signal sent to it indicating a flush has completedflush done signals ignoredCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE -APPLICATIONsql.stats.flush.durationTime took to in nanoseconds to complete SQL Stats flushSQL Stats FlushHISTOGRAMNANOSECONDSAVGNONE -APPLICATIONsql.stats.flush.errorNumber of errors encountered when flushing SQL StatsSQL Stats FlushCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.stats.flush.fingerprint.countThe number of unique statement and transaction fingerprints included in the SQL Stats flushstatement & transaction fingerprintsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE +APPLICATIONsql.stats.flush.latencyThe latency of SQL Stats flushes to persistent storage. Includes failed flush attemptsnanosecondsHISTOGRAMNANOSECONDSAVGNONE +APPLICATIONsql.stats.flushes.failedNumber of attempted SQL Stats flushes that failed with errorsfailed flushesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE +APPLICATIONsql.stats.flushes.successfulNumber of times SQL Stats are flushed successfully to persistent storagesuccessful flushesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.stats.mem.currentCurrent memory usage for fingerprint storageMemoryGAUGEBYTESAVGNONE APPLICATIONsql.stats.mem.maxMemory usage for fingerprint storageMemoryHISTOGRAMBYTESAVGNONE APPLICATIONsql.stats.reported.mem.currentCurrent memory usage for reported fingerprint storageMemoryGAUGEBYTESAVGNONE diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index e3ee3063d2bc..864ad7eaab9b 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -488,11 +488,11 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server { SQLIDContainer: cfg.NodeInfo.NodeID, JobRegistry: s.cfg.JobRegistry, Knobs: cfg.SQLStatsTestingKnobs, - FlushCounter: serverMetrics.StatsMetrics.SQLStatsFlushStarted, + FlushesSuccessful: serverMetrics.StatsMetrics.SQLStatsFlushesSuccessful, FlushDoneSignalsIgnored: serverMetrics.StatsMetrics.SQLStatsFlushDoneSignalsIgnored, FlushedFingerprintCount: serverMetrics.StatsMetrics.SQLStatsFlushFingerprintCount, - FailureCounter: serverMetrics.StatsMetrics.SQLStatsFlushFailure, - FlushDuration: serverMetrics.StatsMetrics.SQLStatsFlushDuration, + FlushesFailed: serverMetrics.StatsMetrics.SQLStatsFlushesFailed, + FlushLatency: serverMetrics.StatsMetrics.SQLStatsFlushLatency, }, memSQLStats) s.sqlStats = persistedSQLStats @@ -589,14 +589,14 @@ func makeServerMetrics(cfg *ExecutorConfig) ServerMetrics { }), ReportedSQLStatsMemoryCurBytesCount: metric.NewGauge(MetaReportedSQLStatsMemCurBytes), DiscardedStatsCount: metric.NewCounter(MetaDiscardedSQLStats), - SQLStatsFlushStarted: metric.NewCounter(MetaSQLStatsFlushStarted), + SQLStatsFlushesSuccessful: metric.NewCounter(MetaSQLStatsFlushesSuccessful), SQLStatsFlushDoneSignalsIgnored: metric.NewCounter(MetaSQLStatsFlushDoneSignalsIgnored), SQLStatsFlushFingerprintCount: metric.NewCounter(MetaSQLStatsFlushFingerprintCount), - SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure), - SQLStatsFlushDuration: metric.NewHistogram(metric.HistogramOptions{ + SQLStatsFlushesFailed: metric.NewCounter(MetaSQLStatsFlushesFailed), + SQLStatsFlushLatency: metric.NewHistogram(metric.HistogramOptions{ Mode: metric.HistogramModePreferHdrLatency, - Metadata: MetaSQLStatsFlushDuration, + Metadata: MetaSQLStatsFlushLatency, Duration: 6 * metricsSampleInterval, BucketConfig: metric.IOLatencyBuckets, }), diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go index 41e6835e2f05..0c106886a664 100644 --- a/pkg/sql/exec_util.go +++ b/pkg/sql/exec_util.go @@ -1088,10 +1088,10 @@ var ( Measurement: "Discarded SQL Stats", Unit: metric.Unit_COUNT, } - MetaSQLStatsFlushStarted = metric.Metadata{ - Name: "sql.stats.flush.count", - Help: "Number of times SQL Stats are flushed to persistent storage", - Measurement: "SQL Stats Flush", + MetaSQLStatsFlushesSuccessful = metric.Metadata{ + Name: "sql.stats.flushes.successful", + Help: "Number of times SQL Stats are flushed successfully to persistent storage", + Measurement: "successful flushes", Unit: metric.Unit_COUNT, } MetaSQLStatsFlushFingerprintCount = metric.Metadata{ @@ -1108,16 +1108,16 @@ var ( Unit: metric.Unit_COUNT, MetricType: io_prometheus_client.MetricType_COUNTER, } - MetaSQLStatsFlushFailure = metric.Metadata{ - Name: "sql.stats.flush.error", - Help: "Number of errors encountered when flushing SQL Stats", - Measurement: "SQL Stats Flush", + MetaSQLStatsFlushesFailed = metric.Metadata{ + Name: "sql.stats.flushes.failed", + Help: "Number of attempted SQL Stats flushes that failed with errors", + Measurement: "failed flushes", Unit: metric.Unit_COUNT, } - MetaSQLStatsFlushDuration = metric.Metadata{ - Name: "sql.stats.flush.duration", - Help: "Time took to in nanoseconds to complete SQL Stats flush", - Measurement: "SQL Stats Flush", + MetaSQLStatsFlushLatency = metric.Metadata{ + Name: "sql.stats.flush.latency", + Help: "The latency of SQL Stats flushes to persistent storage. Includes failed flush attempts", + Measurement: "nanoseconds", Unit: metric.Unit_NANOSECONDS, } MetaSQLStatsRemovedRows = metric.Metadata{ diff --git a/pkg/sql/executor_statement_metrics.go b/pkg/sql/executor_statement_metrics.go index 7a796e5dab4a..0f924f0a1a1b 100644 --- a/pkg/sql/executor_statement_metrics.go +++ b/pkg/sql/executor_statement_metrics.go @@ -75,11 +75,11 @@ type StatsMetrics struct { DiscardedStatsCount *metric.Counter - SQLStatsFlushStarted *metric.Counter + SQLStatsFlushesSuccessful *metric.Counter SQLStatsFlushDoneSignalsIgnored *metric.Counter SQLStatsFlushFingerprintCount *metric.Counter - SQLStatsFlushFailure *metric.Counter - SQLStatsFlushDuration metric.IHistogram + SQLStatsFlushesFailed *metric.Counter + SQLStatsFlushLatency metric.IHistogram SQLStatsRemovedRows *metric.Counter SQLTxnStatsCollectionOverhead metric.IHistogram diff --git a/pkg/sql/sqlstats/persistedsqlstats/flush.go b/pkg/sql/sqlstats/persistedsqlstats/flush.go index 81655ba4eb5a..b2b77af9da91 100644 --- a/pkg/sql/sqlstats/persistedsqlstats/flush.go +++ b/pkg/sql/sqlstats/persistedsqlstats/flush.go @@ -168,12 +168,13 @@ func (s *PersistedSQLStats) doFlush(ctx context.Context, workFn func() error, er defer func() { if err != nil { - s.cfg.FailureCounter.Inc(1) + s.cfg.FlushesFailed.Inc(1) log.Warningf(ctx, "%s: %s", errMsg, err) + } else { + s.cfg.FlushesSuccessful.Inc(1) } flushDuration := s.getTimeNow().Sub(flushBegin) - s.cfg.FlushDuration.RecordValue(flushDuration.Nanoseconds()) - s.cfg.FlushCounter.Inc(1) + s.cfg.FlushLatency.RecordValue(flushDuration.Nanoseconds()) }() err = workFn() diff --git a/pkg/sql/sqlstats/persistedsqlstats/provider.go b/pkg/sql/sqlstats/persistedsqlstats/provider.go index b06d467c3c96..0ab0f9d6d3f7 100644 --- a/pkg/sql/sqlstats/persistedsqlstats/provider.go +++ b/pkg/sql/sqlstats/persistedsqlstats/provider.go @@ -46,10 +46,10 @@ type Config struct { JobRegistry *jobs.Registry // Metrics. - FlushCounter *metric.Counter - FlushDuration metric.IHistogram + FlushesSuccessful *metric.Counter + FlushLatency metric.IHistogram FlushDoneSignalsIgnored *metric.Counter - FailureCounter *metric.Counter + FlushesFailed *metric.Counter FlushedFingerprintCount *metric.Counter // Testing knobs. From 78fee7a2755f0eb94d6512de271f3567e817d273 Mon Sep 17 00:00:00 2001 From: Alex Barganier Date: Tue, 19 Mar 2024 13:42:30 -0400 Subject: [PATCH 2/2] pkg/sql/sqlstats: fix typo in `sql.stats.activity.updates.failed` desc Fix a type in the measurement description for the `sql.stats.activity.updates.failed` metric. Release note: none --- docs/generated/metrics/metrics.html | 2 +- pkg/sql/sql_activity_update_job.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html index 7507941d10e2..534b50737af7 100644 --- a/docs/generated/metrics/metrics.html +++ b/docs/generated/metrics/metrics.html @@ -1474,7 +1474,7 @@ APPLICATIONsql.statements.activeNumber of currently active user SQL statementsActive StatementsGAUGECOUNTAVGNONE APPLICATIONsql.statements.active.internalNumber of currently active user SQL statements (internal queries)SQL Internal StatementsGAUGECOUNTAVGNONE APPLICATIONsql.stats.activity.update.latencyThe latency of updates made by the SQL activity updater job. Includes failed update attemptsNanosecondsHISTOGRAMNANOSECONDSAVGNONE -APPLICATIONsql.stats.activity.updates.failedNumber of update attempts made by the SQL activity updater job that failed with errorsfailed updatesgiCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE +APPLICATIONsql.stats.activity.updates.failedNumber of update attempts made by the SQL activity updater job that failed with errorsfailed updatesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.stats.activity.updates.successfulNumber of successful updates made by the SQL activity updater jobsuccessful updatesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.stats.cleanup.rows_removedNumber of stale statistics rows that are removedSQL Stats CleanupCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.stats.discarded.currentNumber of fingerprint statistics being discardedDiscarded SQL StatsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE diff --git a/pkg/sql/sql_activity_update_job.go b/pkg/sql/sql_activity_update_job.go index 35f899275470..a3307c3541dc 100644 --- a/pkg/sql/sql_activity_update_job.go +++ b/pkg/sql/sql_activity_update_job.go @@ -137,7 +137,7 @@ func newActivityUpdaterMetrics() metric.Struct { NumFailedUpdates: metric.NewCounter(metric.Metadata{ Name: "sql.stats.activity.updates.failed", Help: "Number of update attempts made by the SQL activity updater job that failed with errors", - Measurement: "failed updatesgi", + Measurement: "failed updates", Unit: metric.Unit_COUNT, MetricType: io_prometheus_client.MetricType_COUNTER, }),