pkg/sql: expand metrics used by the SQL stats activity update job
Addresses: cockroachdb#119779

Currently, the SQL activity update job is lacking observability. While
we have a metric for job failures, we've seen instances where the query
run by the job gets caught in a retry loop, meaning the metric is rarely
incremented.

Therefore, additional metrics, such as a count of successful runs and
the latency of the job's runs, will be helpful for further inspecting
the state of the job.

This patch adds metrics for both.

Release note (ops change): Two new metrics have been added to track the
status of the SQL activity update job, which is used to pre-aggregate
top K information within the SQL stats subsystem and write the results to
`system.statement_activity` and `system.transaction_activity`.

The new metrics are:
- `sql.stats.activity.updates.successful`: Number of successful updates made
   by the SQL activity updater job.
- `sql.stats.activity.update.latency`: The latency of updates made by
  the SQL activity updater job. Includes failed update attempts.
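
To make the intended use of the two metrics concrete, here is a minimal sketch of the same success-counter-plus-latency-histogram pattern, written against the prometheus client library rather than CockroachDB's internal pkg/util/metric package; the metric names, bucket boundaries, and the runUpdate helper are illustrative assumptions, not part of this patch.

package main

import (
	"errors"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// Illustrative stand-ins for the two new metrics (names and buckets are
// assumptions for this sketch, not CockroachDB's exported names).
var (
	successfulUpdates = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "sql_stats_activity_updates_successful",
		Help: "Number of successful updates made by the SQL activity updater job",
	})
	updateLatency = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "sql_stats_activity_update_latency_seconds",
		Help:    "Latency of updates made by the SQL activity updater job, including failed attempts",
		Buckets: prometheus.ExponentialBuckets(0.001, 2, 16),
	})
)

// runUpdate is a hypothetical stand-in for one run of the activity update job.
func runUpdate() error { return errors.New("simulated failure") }

func main() {
	prometheus.MustRegister(successfulUpdates, updateLatency)

	start := time.Now()
	if err := runUpdate(); err == nil {
		successfulUpdates.Inc()
	} else {
		fmt.Println("update failed:", err)
	}
	// The latency is observed whether or not the run succeeded, so a job
	// stuck retrying still shows up in the histogram.
	updateLatency.Observe(time.Since(start).Seconds())
}

The counter shows whether the job is completing at all, while the histogram, which also records failed attempts, shows how long each run takes, so a job stuck in a retry loop remains visible even when the failure counter barely moves.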
abarganier committed May 10, 2024
1 parent a2d47a1 commit 66be2ae
Showing 2 changed files with 29 additions and 1 deletion.
2 changes: 2 additions & 0 deletions docs/generated/metrics/metrics.html
@@ -1443,6 +1443,8 @@
<tr><td>APPLICATION</td><td>sql.service.latency.internal</td><td>Latency of SQL request execution (internal queries)</td><td>SQL Internal Statements</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.statements.active</td><td>Number of currently active user SQL statements</td><td>Active Statements</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.statements.active.internal</td><td>Number of currently active user SQL statements (internal queries)</td><td>SQL Internal Statements</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.update.latency</td><td>The latency of updates made by the SQL activity updater job. Includes failed update attempts</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.updates.successful</td><td>Number of successful updates made by the SQL activity updater job</td><td>successful updates</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.cleanup.rows_removed</td><td>Number of stale statistics rows that are removed</td><td>SQL Stats Cleanup</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.discarded.current</td><td>Number of fingerprint statistics being discarded</td><td>Discarded SQL Stats</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.count</td><td>Number of times SQL Stats are flushed to persistent storage</td><td>SQL Stats Flush</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
28 changes: 27 additions & 1 deletion pkg/sql/sql_activity_update_job.go
@@ -15,6 +15,7 @@ import (
"fmt"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/settings"
@@ -103,11 +104,15 @@ func (j *sqlActivityUpdateJob) Resume(ctx context.Context, execCtxI interface{})
		case <-flushDoneSignal:
			// A flush was done. Set the timer and wait for it to complete.
			if sqlStatsActivityFlushEnabled.Get(&settings.SV) {
				startTime := timeutil.Now().UnixNano()
				updater := newSqlActivityUpdater(settings, execCtx.ExecCfg().InternalDB, nil)
				if err := updater.TransferStatsToActivity(ctx); err != nil {
					log.Warningf(ctx, "error running sql activity updater job: %v", err)
					metrics.NumErrors.Inc(1)
				} else {
					metrics.NumSuccessfulUpdates.Inc(1)
				}
				metrics.UpdateLatency.RecordValue(timeutil.Now().UnixNano() - startTime)
			}
		case <-ctx.Done():
			return nil
@@ -120,7 +125,9 @@ func (j *sqlActivityUpdateJob) Resume(ctx context.Context, execCtxI interface{})
// ActivityUpdaterMetrics must be public for metrics to get
// registered
type ActivityUpdaterMetrics struct {
	NumErrors *metric.Counter
	NumErrors            *metric.Counter
	NumSuccessfulUpdates *metric.Counter
	UpdateLatency        metric.IHistogram
}

func (m ActivityUpdaterMetrics) MetricStruct() {}
@@ -134,6 +141,25 @@ func newActivityUpdaterMetrics() metric.Struct {
			Unit:        metric.Unit_COUNT,
			MetricType:  io_prometheus_client.MetricType_COUNTER,
		}),
		NumSuccessfulUpdates: metric.NewCounter(metric.Metadata{
			Name:        "sql.stats.activity.updates.successful",
			Help:        "Number of successful updates made by the SQL activity updater job",
			Measurement: "successful updates",
			Unit:        metric.Unit_COUNT,
			MetricType:  io_prometheus_client.MetricType_COUNTER,
		}),
		UpdateLatency: metric.NewHistogram(metric.HistogramOptions{
			Metadata: metric.Metadata{
				Name:        "sql.stats.activity.update.latency",
				Help:        "The latency of updates made by the SQL activity updater job. Includes failed update attempts",
				Measurement: "Nanoseconds",
				Unit:        metric.Unit_NANOSECONDS,
				MetricType:  io_prometheus_client.MetricType_HISTOGRAM,
			},
			Duration:     base.DefaultHistogramWindowInterval(),
			BucketConfig: metric.LongRunning60mLatencyBuckets,
			Mode:         metric.HistogramModePrometheus,
		}),
	}
}
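
For reference, a minimal standard-library sketch of the timing arithmetic used in Resume above (the transferStatsToActivity stub and the plain int64 counters are stand-ins, not CockroachDB's timeutil or metric packages): the elapsed time is a difference of UnixNano timestamps, so the recorded value is in nanoseconds, matching the metric's NANOSECONDS unit, and it is recorded outside the success/error branch so failed attempts are included.

package main

import (
	"errors"
	"fmt"
	"time"
)

// transferStatsToActivity is a hypothetical stand-in for the updater call.
func transferStatsToActivity() error { return errors.New("simulated failure") }

func main() {
	startTime := time.Now().UnixNano()

	var numErrors, numSuccessfulUpdates int64
	if err := transferStatsToActivity(); err != nil {
		numErrors++
	} else {
		numSuccessfulUpdates++
	}

	// Recorded unconditionally, mirroring the patch, so slow or failing
	// runs still contribute to the latency metric.
	elapsedNanos := time.Now().UnixNano() - startTime
	fmt.Printf("errors=%d successes=%d latency=%dns\n",
		numErrors, numSuccessfulUpdates, elapsedNanos)
}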

