Skip to content

Commit

Permalink
pkg/sql: rename metric tracking SQL activity update job failures
Browse files Browse the repository at this point in the history
The metric used to track failures of the SQL Activity update job
didn't have a descriptive name, and the help text was grammatically
incorrect. Furthermore, the metric name is the same as a metric used
within the job system, meaning one of these metrics is probably
clobbering the other when writing to TSDB or outputting to
`/_status/vars`.

This patch simply updates the metric name to better describe what it
measures, and fixes the help text description.

Release note (ops change): A new counter metric,
`sql.stats.activity.updates.failed`, has been introduced to measure the
number of update attempts made by the SQL activity updater job that failed with
errors. The SQL activity update job is used to pre-aggregate top K
information within the SQL stats subsystem and write the results to
`system.statement_activity` and `system.transaction_activity`.
  • Loading branch information
abarganier committed May 10, 2024
1 parent 455cf39 commit 4d41e76
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 7 deletions.
3 changes: 2 additions & 1 deletion docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -1079,7 +1079,7 @@
<tr><td>APPLICATION</td><td>jobs.key_visualizer.resume_completed</td><td>Number of key_visualizer jobs which successfully resumed to completion</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.key_visualizer.resume_failed</td><td>Number of key_visualizer jobs which failed with a non-retriable error</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.key_visualizer.resume_retry_error</td><td>Number of key_visualizer jobs which failed with a retriable error</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.metrics.task_failed</td><td>Number of metrics sql activity updater tasks that failed</td><td>errors</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.metrics.task_failed</td><td>Number of metrics poller tasks that failed</td><td>errors</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.migration.currently_idle</td><td>Number of migration jobs currently considered Idle and can be freely shut down</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.migration.currently_paused</td><td>Number of migration jobs currently considered Paused</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.migration.currently_running</td><td>Number of migration jobs currently running in Resume or OnFailOrCancel state</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
Expand Down Expand Up @@ -1444,6 +1444,7 @@
<tr><td>APPLICATION</td><td>sql.statements.active</td><td>Number of currently active user SQL statements</td><td>Active Statements</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.statements.active.internal</td><td>Number of currently active user SQL statements (internal queries)</td><td>SQL Internal Statements</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.update.latency</td><td>The latency of updates made by the SQL activity updater job. Includes failed update attempts</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.updates.failed</td><td>Number of update attempts made by the SQL activity updater job that failed with errors</td><td>failed updates</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.updates.successful</td><td>Number of successful updates made by the SQL activity updater job</td><td>successful updates</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.cleanup.rows_removed</td><td>Number of stale statistics rows that are removed</td><td>SQL Stats Cleanup</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.discarded.current</td><td>Number of fingerprint statistics being discarded</td><td>Discarded SQL Stats</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
Expand Down
12 changes: 6 additions & 6 deletions pkg/sql/sql_activity_update_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func (j *sqlActivityUpdateJob) Resume(ctx context.Context, execCtxI interface{})
updater := newSqlActivityUpdater(settings, execCtx.ExecCfg().InternalDB, nil)
if err := updater.TransferStatsToActivity(ctx); err != nil {
log.Warningf(ctx, "error running sql activity updater job: %v", err)
metrics.NumErrors.Inc(1)
metrics.NumFailedUpdates.Inc(1)
} else {
metrics.NumSuccessfulUpdates.Inc(1)
}
Expand All @@ -125,7 +125,7 @@ func (j *sqlActivityUpdateJob) Resume(ctx context.Context, execCtxI interface{})
// ActivityUpdaterMetrics must be public for metrics to get
// registered
type ActivityUpdaterMetrics struct {
NumErrors *metric.Counter
NumFailedUpdates *metric.Counter
NumSuccessfulUpdates *metric.Counter
UpdateLatency metric.IHistogram
}
Expand All @@ -134,10 +134,10 @@ func (m ActivityUpdaterMetrics) MetricStruct() {}

func newActivityUpdaterMetrics() metric.Struct {
return ActivityUpdaterMetrics{
NumErrors: metric.NewCounter(metric.Metadata{
Name: "jobs.metrics.task_failed",
Help: "Number of metrics sql activity updater tasks that failed",
Measurement: "errors",
NumFailedUpdates: metric.NewCounter(metric.Metadata{
Name: "sql.stats.activity.updates.failed",
Help: "Number of update attempts made by the SQL activity updater job that failed with errors",
Measurement: "failed updates",
Unit: metric.Unit_COUNT,
MetricType: io_prometheus_client.MetricType_COUNTER,
}),
Expand Down

0 comments on commit 4d41e76

Please sign in to comment.