Merge #120522
120522: pkg/sql: expand metrics used by the SQL stats activity update job r=koorosh a=abarganier

pkg/sql: expand metrics used by the SQL stats activity update job

Addresses: #119779

Epic: CRDB-24527

Currently, the SQL activity update job lacks observability. While we
have a metric for job failures, we've seen instances where the query
run by the job gets caught in a retry loop, meaning the metric is rarely
incremented.

Therefore, additional metrics, such as a count of successful runs and
the latency of each run, will be helpful to further inspect the state
of the job.

This patch adds metrics for both.
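
For illustration, the instrumentation pattern is roughly the following.
This is a minimal, self-contained Go sketch, not the patch itself: plain
atomic counters stand in for the `metric.Counter` and `metric.IHistogram`
types used in the actual code, and all names here are illustrative.

```go
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// jobMetrics mirrors the shape of the patch's ActivityUpdaterMetrics: a
// success counter and a failure counter, with latency measured per attempt.
type jobMetrics struct {
	successfulRuns atomic.Int64
	failedRuns     atomic.Int64
}

// runOnce wraps a single job execution, incrementing the appropriate counter
// and measuring the attempt's latency regardless of outcome (the patch
// likewise records latency for failed attempts).
func runOnce(m *jobMetrics, job func() error) time.Duration {
	start := time.Now()
	if err := job(); err != nil {
		m.failedRuns.Add(1)
	} else {
		m.successfulRuns.Add(1)
	}
	// In the actual patch this duration feeds a latency histogram via
	// RecordValue; here we simply return it.
	return time.Since(start)
}

func main() {
	var m jobMetrics
	latency := runOnce(&m, func() error { return nil })
	fmt.Printf("successful=%d failed=%d latency=%s\n",
		m.successfulRuns.Load(), m.failedRuns.Load(), latency)
}
```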

We've also had escalations recently involving the SQL activity update job
running for extended periods of time, such that the signal sent to the
job indicating a flush has completed was not received because there was
no listener.

While we've added a default case to prevent this from hanging the flush
job, along with some logging, a counter metric indicating when this
occurs would also be useful when debugging.

This patch adds such a counter.
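
The underlying mechanism is a non-blocking channel send, sketched below
in standalone form (simplified from the flush loop in
`pkg/sql/sqlstats/persistedsqlstats/provider.go`; the function and
variable names here are illustrative):

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// signalFlushDone performs a non-blocking send on the flush-done channel.
// If the activity update job is not currently waiting to receive (for
// example, its previous run is still in progress), the signal is dropped
// and counted rather than blocking the flush worker.
func signalFlushDone(done chan struct{}, ignored *atomic.Int64) {
	select {
	case done <- struct{}{}:
		// A listener received the flush-completion signal.
	default:
		// No listener: drop the signal and count the occurrence so the
		// condition shows up in metrics, not just in the logs.
		ignored.Add(1)
	}
}

func main() {
	done := make(chan struct{}) // unbuffered: a send succeeds only if a receiver is waiting
	var ignored atomic.Int64
	signalFlushDone(done, &ignored) // no receiver here, so the signal is counted as ignored
	fmt.Println("ignored flush-done signals:", ignored.Load())
}
```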

Finally, we rename the metric counting failures of the job from
`jobs.metrics.task_failed` to `sql.stats.activity.updates.failed`, as the
old metric name was not descriptive.

Release note (ops change): Two new metrics have been added to track the
status of the SQL activity update job, which is used to pre-aggregate
top K information within the SQL stats subsystem and write the results to
`system.statement_activity` and `system.transaction_activity`.

The new metrics are:
- `sql.stats.activity.updates.successful`: Number of successful updates made
  by the SQL activity updater job
- `sql.stats.activity.update.latency`: The latency of updates made by the
  SQL activity updater job (includes failed update attempts)

Release note (ops change): A new counter metric,
`sql.stats.flush.done_signals.ignored`, has been introduced. The metric
tracks the number of times the SQL Stats activity update job ignored
the signal sent to it indicating a flush has completed. This may
indicate that the SQL activity update job is taking longer than expected
to complete.

Release note (ops change): A new counter metric,
`sql.stats.activity.updates.failed`, has been introduced to measure the
number of update attempts made by the SQL activity updater job that failed
with errors. The SQL activity update job is used to pre-aggregate top K
information within the SQL stats subsystem and write the results to
`system.statement_activity` and `system.transaction_activity`.

Co-authored-by: Alex Barganier <[email protected]>
craig[bot] and abarganier committed Mar 18, 2024
2 parents 6681efb + 099d0b0 commit 6454abd
Showing 7 changed files with 81 additions and 23 deletions.
7 changes: 6 additions & 1 deletion docs/generated/metrics/metrics.html
@@ -1096,7 +1096,7 @@
<tr><td>APPLICATION</td><td>jobs.key_visualizer.resume_completed</td><td>Number of key_visualizer jobs which successfully resumed to completion</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.key_visualizer.resume_failed</td><td>Number of key_visualizer jobs which failed with a non-retriable error</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.key_visualizer.resume_retry_error</td><td>Number of key_visualizer jobs which failed with a retriable error</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.metrics.task_failed</td><td>Number of metrics sql activity updater tasks that failed</td><td>errors</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.metrics.task_failed</td><td>Number of metrics poller tasks that failed</td><td>errors</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.migration.currently_idle</td><td>Number of migration jobs currently considered Idle and can be freely shut down</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.migration.currently_paused</td><td>Number of migration jobs currently considered Paused</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.migration.currently_running</td><td>Number of migration jobs currently running in Resume or OnFailOrCancel state</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
@@ -1464,11 +1464,16 @@
<tr><td>APPLICATION</td><td>sql.service.latency.internal</td><td>Latency of SQL request execution (internal queries)</td><td>SQL Internal Statements</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.statements.active</td><td>Number of currently active user SQL statements</td><td>Active Statements</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.statements.active.internal</td><td>Number of currently active user SQL statements (internal queries)</td><td>SQL Internal Statements</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.update.latency</td><td>The latency of updates made by the SQL activity updater job. Includes failed update attempts</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.updates.failed</td><td>Number of update attempts made by the SQL activity updater job that failed with errors</td><td>failed updates</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.updates.successful</td><td>Number of successful updates made by the SQL activity updater job</td><td>successful updates</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.cleanup.rows_removed</td><td>Number of stale statistics rows that are removed</td><td>SQL Stats Cleanup</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.discarded.current</td><td>Number of fingerprint statistics being discarded</td><td>Discarded SQL Stats</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.count</td><td>Number of times SQL Stats are flushed to persistent storage</td><td>SQL Stats Flush</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.done_signals.ignored</td><td>Number of times the SQL Stats activity update job ignored the signal sent to it indicating a flush has completed</td><td>flush done signals ignored</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.duration</td><td>Time taken in nanoseconds to complete SQL Stats flush</td><td>SQL Stats Flush</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.error</td><td>Number of errors encountered when flushing SQL Stats</td><td>SQL Stats Flush</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.fingerprint.count</td><td>The number of unique statement and transaction fingerprints included in the SQL Stats flush</td><td>statement &amp; transaction fingerprints</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.mem.current</td><td>Current memory usage for fingerprint storage</td><td>Memory</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.mem.max</td><td>Memory usage for fingerprint storage</td><td>Memory</td><td>HISTOGRAM</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.reported.mem.current</td><td>Current memory usage for reported fingerprint storage</td><td>Memory</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
21 changes: 13 additions & 8 deletions pkg/sql/conn_executor.go
@@ -484,13 +484,15 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
DB: NewInternalDB(
s, MemoryMetrics{}, sqlStatsInternalExecutorMonitor,
),
ClusterID: s.cfg.NodeInfo.LogicalClusterID,
SQLIDContainer: cfg.NodeInfo.NodeID,
JobRegistry: s.cfg.JobRegistry,
Knobs: cfg.SQLStatsTestingKnobs,
FlushCounter: serverMetrics.StatsMetrics.SQLStatsFlushStarted,
FailureCounter: serverMetrics.StatsMetrics.SQLStatsFlushFailure,
FlushDuration: serverMetrics.StatsMetrics.SQLStatsFlushDuration,
ClusterID: s.cfg.NodeInfo.LogicalClusterID,
SQLIDContainer: cfg.NodeInfo.NodeID,
JobRegistry: s.cfg.JobRegistry,
Knobs: cfg.SQLStatsTestingKnobs,
FlushCounter: serverMetrics.StatsMetrics.SQLStatsFlushStarted,
FlushDoneSignalsIgnored: serverMetrics.StatsMetrics.SQLStatsFlushDoneSignalsIgnored,
FlushedFingerprintCount: serverMetrics.StatsMetrics.SQLStatsFlushFingerprintCount,
FailureCounter: serverMetrics.StatsMetrics.SQLStatsFlushFailure,
FlushDuration: serverMetrics.StatsMetrics.SQLStatsFlushDuration,
}, memSQLStats)

s.sqlStats = persistedSQLStats
@@ -588,7 +590,10 @@ func makeServerMetrics(cfg *ExecutorConfig) ServerMetrics {
ReportedSQLStatsMemoryCurBytesCount: metric.NewGauge(MetaReportedSQLStatsMemCurBytes),
DiscardedStatsCount: metric.NewCounter(MetaDiscardedSQLStats),
SQLStatsFlushStarted: metric.NewCounter(MetaSQLStatsFlushStarted),
SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure),
SQLStatsFlushDoneSignalsIgnored: metric.NewCounter(MetaSQLStatsFlushDoneSignalsIgnored),
SQLStatsFlushFingerprintCount: metric.NewCounter(MetaSQLStatsFlushFingerprintCount),

SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure),
SQLStatsFlushDuration: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: MetaSQLStatsFlushDuration,
15 changes: 15 additions & 0 deletions pkg/sql/exec_util.go
@@ -125,6 +125,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
io_prometheus_client "github.com/prometheus/client_model/go"
)

func init() {
@@ -1093,6 +1094,20 @@ var (
Measurement: "SQL Stats Flush",
Unit: metric.Unit_COUNT,
}
MetaSQLStatsFlushFingerprintCount = metric.Metadata{
Name: "sql.stats.flush.fingerprint.count",
Help: "The number of unique statement and transaction fingerprints included in the SQL Stats flush",
Measurement: "statement & transaction fingerprints",
Unit: metric.Unit_COUNT,
}
MetaSQLStatsFlushDoneSignalsIgnored = metric.Metadata{
Name: "sql.stats.flush.done_signals.ignored",
Help: "Number of times the SQL Stats activity update job ignored the signal sent to it indicating " +
"a flush has completed",
Measurement: "flush done signals ignored",
Unit: metric.Unit_COUNT,
MetricType: io_prometheus_client.MetricType_COUNTER,
}
MetaSQLStatsFlushFailure = metric.Metadata{
Name: "sql.stats.flush.error",
Help: "Number of errors encountered when flushing SQL Stats",
10 changes: 6 additions & 4 deletions pkg/sql/executor_statement_metrics.go
@@ -75,10 +75,12 @@ type StatsMetrics struct {

DiscardedStatsCount *metric.Counter

SQLStatsFlushStarted *metric.Counter
SQLStatsFlushFailure *metric.Counter
SQLStatsFlushDuration metric.IHistogram
SQLStatsRemovedRows *metric.Counter
SQLStatsFlushStarted *metric.Counter
SQLStatsFlushDoneSignalsIgnored *metric.Counter
SQLStatsFlushFingerprintCount *metric.Counter
SQLStatsFlushFailure *metric.Counter
SQLStatsFlushDuration metric.IHistogram
SQLStatsRemovedRows *metric.Counter

SQLTxnStatsCollectionOverhead metric.IHistogram
}
38 changes: 32 additions & 6 deletions pkg/sql/sql_activity_update_job.go
@@ -15,6 +15,7 @@ import (
"fmt"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/settings"
@@ -103,11 +104,15 @@ func (j *sqlActivityUpdateJob) Resume(ctx context.Context, execCtxI interface{})
case <-flushDoneSignal:
// A flush was done. Set the timer and wait for it to complete.
if sqlStatsActivityFlushEnabled.Get(&settings.SV) {
startTime := timeutil.Now().UnixNano()
updater := newSqlActivityUpdater(settings, execCtx.ExecCfg().InternalDB, nil)
if err := updater.TransferStatsToActivity(ctx); err != nil {
log.Warningf(ctx, "error running sql activity updater job: %v", err)
metrics.NumErrors.Inc(1)
metrics.NumFailedUpdates.Inc(1)
} else {
metrics.NumSuccessfulUpdates.Inc(1)
}
metrics.UpdateLatency.RecordValue(timeutil.Now().UnixNano() - startTime)
}
case <-ctx.Done():
return nil
@@ -120,20 +125,41 @@ func (j *sqlActivityUpdateJob) Resume(ctx context.Context, execCtxI interface{})
// ActivityUpdaterMetrics must be public for metrics to get
// registered
type ActivityUpdaterMetrics struct {
NumErrors *metric.Counter
NumFailedUpdates *metric.Counter
NumSuccessfulUpdates *metric.Counter
UpdateLatency metric.IHistogram
}

func (m ActivityUpdaterMetrics) MetricStruct() {}

func newActivityUpdaterMetrics() metric.Struct {
return ActivityUpdaterMetrics{
NumErrors: metric.NewCounter(metric.Metadata{
Name: "jobs.metrics.task_failed",
Help: "Number of metrics sql activity updater tasks that failed",
Measurement: "errors",
NumFailedUpdates: metric.NewCounter(metric.Metadata{
Name: "sql.stats.activity.updates.failed",
Help: "Number of update attempts made by the SQL activity updater job that failed with errors",
Measurement: "failed updatesgi",
Unit: metric.Unit_COUNT,
MetricType: io_prometheus_client.MetricType_COUNTER,
}),
NumSuccessfulUpdates: metric.NewCounter(metric.Metadata{
Name: "sql.stats.activity.updates.successful",
Help: "Number of successful updates made by the SQL activity updater job",
Measurement: "successful updates",
Unit: metric.Unit_COUNT,
MetricType: io_prometheus_client.MetricType_COUNTER,
}),
UpdateLatency: metric.NewHistogram(metric.HistogramOptions{
Metadata: metric.Metadata{
Name: "sql.stats.activity.update.latency",
Help: "The latency of updates made by the SQL activity updater job. Includes failed update attempts",
Measurement: "Nanoseconds",
Unit: metric.Unit_NANOSECONDS,
MetricType: io_prometheus_client.MetricType_HISTOGRAM,
},
Duration: base.DefaultHistogramWindowInterval(),
BucketConfig: metric.LongRunning60mLatencyBuckets,
Mode: metric.HistogramModePrometheus,
}),
}
}

4 changes: 3 additions & 1 deletion pkg/sql/sqlstats/persistedsqlstats/flush.go
@@ -62,9 +62,11 @@ func (s *PersistedSQLStats) Flush(ctx context.Context, stopper *stop.Stopper) {
return
}

fingerprintCount := s.SQLStats.GetTotalFingerprintCount()
s.cfg.FlushedFingerprintCount.Inc(fingerprintCount)
if log.V(1) {
log.Infof(ctx, "flushing %d stmt/txn fingerprints (%d bytes) after %s",
s.SQLStats.GetTotalFingerprintCount(), s.SQLStats.GetTotalFingerprintBytes(), timeutil.Since(s.lastFlushStarted))
fingerprintCount, s.SQLStats.GetTotalFingerprintBytes(), timeutil.Since(s.lastFlushStarted))
}
s.lastFlushStarted = now

9 changes: 6 additions & 3 deletions pkg/sql/sqlstats/persistedsqlstats/provider.go
@@ -46,9 +46,11 @@ type Config struct {
JobRegistry *jobs.Registry

// Metrics.
FlushCounter *metric.Counter
FlushDuration metric.IHistogram
FailureCounter *metric.Counter
FlushCounter *metric.Counter
FlushDuration metric.IHistogram
FlushDoneSignalsIgnored *metric.Counter
FailureCounter *metric.Counter
FlushedFingerprintCount *metric.Counter

// Testing knobs.
Knobs *sqlstats.TestingKnobs
@@ -211,6 +213,7 @@ func (s *PersistedSQLStats) startSQLStatsFlushLoop(ctx context.Context, stopper
// Don't block the flush loop if the sql activity update job is not
// ready to receive. We should at least continue to collect and flush
// stats for this node.
s.cfg.FlushDoneSignalsIgnored.Inc(1)
log.Warning(ctx, "sql-stats-worker: unable to signal flush completion")
}
}
