pkg/sql: expand metrics used by the SQL stats activity update job #120522

Merged · 4 commits · Mar 18, 2024
7 changes: 6 additions & 1 deletion docs/generated/metrics/metrics.html
@@ -1096,7 +1096,7 @@
<tr><td>APPLICATION</td><td>jobs.key_visualizer.resume_completed</td><td>Number of key_visualizer jobs which successfully resumed to completion</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.key_visualizer.resume_failed</td><td>Number of key_visualizer jobs which failed with a non-retriable error</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.key_visualizer.resume_retry_error</td><td>Number of key_visualizer jobs which failed with a retriable error</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.metrics.task_failed</td><td>Number of metrics sql activity updater tasks that failed</td><td>errors</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.metrics.task_failed</td><td>Number of metrics poller tasks that failed</td><td>errors</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.migration.currently_idle</td><td>Number of migration jobs currently considered Idle and can be freely shut down</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.migration.currently_paused</td><td>Number of migration jobs currently considered Paused</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.migration.currently_running</td><td>Number of migration jobs currently running in Resume or OnFailOrCancel state</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
@@ -1464,11 +1464,16 @@
<tr><td>APPLICATION</td><td>sql.service.latency.internal</td><td>Latency of SQL request execution (internal queries)</td><td>SQL Internal Statements</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.statements.active</td><td>Number of currently active user SQL statements</td><td>Active Statements</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.statements.active.internal</td><td>Number of currently active user SQL statements (internal queries)</td><td>SQL Internal Statements</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.update.latency</td><td>The latency of updates made by the SQL activity updater job. Includes failed update attempts</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.updates.failed</td><td>Number of update attempts made by the SQL activity updater job that failed with errors</td><td>failed updates</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.updates.successful</td><td>Number of successful updates made by the SQL activity updater job</td><td>successful updates</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.cleanup.rows_removed</td><td>Number of stale statistics rows that are removed</td><td>SQL Stats Cleanup</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.discarded.current</td><td>Number of fingerprint statistics being discarded</td><td>Discarded SQL Stats</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.count</td><td>Number of times SQL Stats are flushed to persistent storage</td><td>SQL Stats Flush</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.done_signals.ignored</td><td>Number of times the SQL Stats activity update job ignored the signal sent to it indicating a flush has completed</td><td>flush done signals ignored</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.duration</td><td>Time taken in nanoseconds to complete SQL Stats flush</td><td>SQL Stats Flush</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.error</td><td>Number of errors encountered when flushing SQL Stats</td><td>SQL Stats Flush</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.fingerprint.count</td><td>The number of unique statement and transaction fingerprints included in the SQL Stats flush</td><td>statement &amp; transaction fingerprints</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.mem.current</td><td>Current memory usage for fingerprint storage</td><td>Memory</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.mem.max</td><td>Memory usage for fingerprint storage</td><td>Memory</td><td>HISTOGRAM</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.reported.mem.current</td><td>Current memory usage for reported fingerprint storage</td><td>Memory</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
21 changes: 13 additions & 8 deletions pkg/sql/conn_executor.go
@@ -484,13 +484,15 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
DB: NewInternalDB(
s, MemoryMetrics{}, sqlStatsInternalExecutorMonitor,
),
ClusterID: s.cfg.NodeInfo.LogicalClusterID,
SQLIDContainer: cfg.NodeInfo.NodeID,
JobRegistry: s.cfg.JobRegistry,
Knobs: cfg.SQLStatsTestingKnobs,
FlushCounter: serverMetrics.StatsMetrics.SQLStatsFlushStarted,
FailureCounter: serverMetrics.StatsMetrics.SQLStatsFlushFailure,
FlushDuration: serverMetrics.StatsMetrics.SQLStatsFlushDuration,
ClusterID: s.cfg.NodeInfo.LogicalClusterID,
SQLIDContainer: cfg.NodeInfo.NodeID,
JobRegistry: s.cfg.JobRegistry,
Knobs: cfg.SQLStatsTestingKnobs,
FlushCounter: serverMetrics.StatsMetrics.SQLStatsFlushStarted,
FlushDoneSignalsIgnored: serverMetrics.StatsMetrics.SQLStatsFlushDoneSignalsIgnored,
FlushedFingerprintCount: serverMetrics.StatsMetrics.SQLStatsFlushFingerprintCount,
FailureCounter: serverMetrics.StatsMetrics.SQLStatsFlushFailure,
FlushDuration: serverMetrics.StatsMetrics.SQLStatsFlushDuration,
}, memSQLStats)

s.sqlStats = persistedSQLStats
@@ -588,7 +590,10 @@ func makeServerMetrics(cfg *ExecutorConfig) ServerMetrics {
ReportedSQLStatsMemoryCurBytesCount: metric.NewGauge(MetaReportedSQLStatsMemCurBytes),
DiscardedStatsCount: metric.NewCounter(MetaDiscardedSQLStats),
SQLStatsFlushStarted: metric.NewCounter(MetaSQLStatsFlushStarted),
SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure),
SQLStatsFlushDoneSignalsIgnored: metric.NewCounter(MetaSQLStatsFlushDoneSignalsIgnored),
SQLStatsFlushFingerprintCount: metric.NewCounter(MetaSQLStatsFlushFingerprintCount),

SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure),
SQLStatsFlushDuration: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: MetaSQLStatsFlushDuration,
15 changes: 15 additions & 0 deletions pkg/sql/exec_util.go
@@ -125,6 +125,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
io_prometheus_client "github.com/prometheus/client_model/go"
)

func init() {
@@ -1093,6 +1094,20 @@ var (
Measurement: "SQL Stats Flush",
Unit: metric.Unit_COUNT,
}
MetaSQLStatsFlushFingerprintCount = metric.Metadata{
Name: "sql.stats.flush.fingerprint.count",
Help: "The number of unique statement and transaction fingerprints included in the SQL Stats flush",
Measurement: "statement & transaction fingerprints",
Unit: metric.Unit_COUNT,
}
MetaSQLStatsFlushDoneSignalsIgnored = metric.Metadata{
Name: "sql.stats.flush.done_signals.ignored",
Help: "Number of times the SQL Stats activity update job ignored the signal sent to it indicating " +
"a flush has completed",
Measurement: "flush done signals ignored",
Unit: metric.Unit_COUNT,
MetricType: io_prometheus_client.MetricType_COUNTER,
}
MetaSQLStatsFlushFailure = metric.Metadata{
Name: "sql.stats.flush.error",
Help: "Number of errors encountered when flushing SQL Stats",
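Note (illustrative sketch, not part of this PR): a metric.Metadata declaration like the ones added above becomes a live counter via metric.NewCounter, and the MetricType field used here carries the Prometheus client-model type alongside the name, help, measurement, and unit. A minimal, self-contained sketch assuming only the util/metric calls already visible in this diff; the metric name and values are illustrative:

```go
package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/metric"
	io_prometheus_client "github.com/prometheus/client_model/go"
)

func main() {
	// Metadata shaped like MetaSQLStatsFlushDoneSignalsIgnored above; the
	// name and help text here are illustrative, not real CockroachDB metrics.
	meta := metric.Metadata{
		Name:        "example.signals.ignored",
		Help:        "Illustrative counter for this sketch only",
		Measurement: "ignored signals",
		Unit:        metric.Unit_COUNT,
		MetricType:  io_prometheus_client.MetricType_COUNTER,
	}
	c := metric.NewCounter(meta) // build the counter from its metadata
	c.Inc(1)                     // bumped wherever the counted event occurs
	fmt.Println(c.Count())       // prints 1
}
```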
10 changes: 6 additions & 4 deletions pkg/sql/executor_statement_metrics.go
@@ -75,10 +75,12 @@ type StatsMetrics struct {

DiscardedStatsCount *metric.Counter

SQLStatsFlushStarted *metric.Counter
SQLStatsFlushFailure *metric.Counter
SQLStatsFlushDuration metric.IHistogram
SQLStatsRemovedRows *metric.Counter
SQLStatsFlushStarted *metric.Counter
SQLStatsFlushDoneSignalsIgnored *metric.Counter
SQLStatsFlushFingerprintCount *metric.Counter
SQLStatsFlushFailure *metric.Counter
SQLStatsFlushDuration metric.IHistogram
SQLStatsRemovedRows *metric.Counter

SQLTxnStatsCollectionOverhead metric.IHistogram
}
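Note (sketch under assumptions, not from this PR): StatsMetrics, like ActivityUpdaterMetrics later in this diff, follows the metric-struct convention of exported metric fields plus a MetricStruct() marker method, so the type satisfies metric.Struct and a registry can pick up every field by reflection. The AddMetricStruct call below is assumed from the util/metric package and is not shown in this diff:

```go
package main

import "github.com/cockroachdb/cockroach/pkg/util/metric"

// exampleMetrics mirrors the shape of StatsMetrics: exported fields holding
// metric objects, plus the marker method that makes it a metric.Struct.
type exampleMetrics struct {
	FlushStarted *metric.Counter
}

func (exampleMetrics) MetricStruct() {}

func main() {
	m := exampleMetrics{
		FlushStarted: metric.NewCounter(metric.Metadata{
			Name:        "example.flush.started",
			Help:        "Illustrative only, not a real CockroachDB metric",
			Measurement: "flushes",
			Unit:        metric.Unit_COUNT,
		}),
	}
	reg := metric.NewRegistry()
	reg.AddMetricStruct(m) // registers every exported metric field on the struct
	m.FlushStarted.Inc(1)
}
```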
38 changes: 32 additions & 6 deletions pkg/sql/sql_activity_update_job.go
@@ -15,6 +15,7 @@ import (
"fmt"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/settings"
@@ -103,11 +104,15 @@ func (j *sqlActivityUpdateJob) Resume(ctx context.Context, execCtxI interface{})
case <-flushDoneSignal:
// A flush was done. Set the timer and wait for it to complete.
if sqlStatsActivityFlushEnabled.Get(&settings.SV) {
startTime := timeutil.Now().UnixNano()
updater := newSqlActivityUpdater(settings, execCtx.ExecCfg().InternalDB, nil)
if err := updater.TransferStatsToActivity(ctx); err != nil {
log.Warningf(ctx, "error running sql activity updater job: %v", err)
metrics.NumErrors.Inc(1)
metrics.NumFailedUpdates.Inc(1)
} else {
metrics.NumSuccessfulUpdates.Inc(1)
}
metrics.UpdateLatency.RecordValue(timeutil.Now().UnixNano() - startTime)
}
case <-ctx.Done():
return nil
@@ -120,20 +125,41 @@ func (j *sqlActivityUpdateJob) Resume(ctx context.Context, execCtxI interface{})
// ActivityUpdaterMetrics must be public for metrics to get
// registered
type ActivityUpdaterMetrics struct {
NumErrors *metric.Counter
NumFailedUpdates *metric.Counter
NumSuccessfulUpdates *metric.Counter
UpdateLatency metric.IHistogram
}

func (m ActivityUpdaterMetrics) MetricStruct() {}

func newActivityUpdaterMetrics() metric.Struct {
return ActivityUpdaterMetrics{
NumErrors: metric.NewCounter(metric.Metadata{
Name: "jobs.metrics.task_failed",
Help: "Number of metrics sql activity updater tasks that failed",
Measurement: "errors",
NumFailedUpdates: metric.NewCounter(metric.Metadata{
Name: "sql.stats.activity.updates.failed",
Help: "Number of update attempts made by the SQL activity updater job that failed with errors",
Measurement: "failed updates",
Unit: metric.Unit_COUNT,
MetricType: io_prometheus_client.MetricType_COUNTER,
}),
NumSuccessfulUpdates: metric.NewCounter(metric.Metadata{
Name: "sql.stats.activity.updates.successful",
Help: "Number of successful updates made by the SQL activity updater job",
Measurement: "successful updates",
Unit: metric.Unit_COUNT,
MetricType: io_prometheus_client.MetricType_COUNTER,
}),
UpdateLatency: metric.NewHistogram(metric.HistogramOptions{
Metadata: metric.Metadata{
Name: "sql.stats.activity.update.latency",
Help: "The latency of updates made by the SQL activity updater job. Includes failed update attempts",
Measurement: "Nanoseconds",
Unit: metric.Unit_NANOSECONDS,
MetricType: io_prometheus_client.MetricType_HISTOGRAM,
},
Duration: base.DefaultHistogramWindowInterval(),
BucketConfig: metric.LongRunning60mLatencyBuckets,
Mode: metric.HistogramModePrometheus,
}),
}
}

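Note (generic sketch, not the PR's code): the Resume hunk above increments NumSuccessfulUpdates or NumFailedUpdates depending on the outcome but records UpdateLatency for every attempt, which is why the histogram's help text says it includes failed update attempts. The standalone sketch below reproduces that pattern with plain Go values standing in for metric.Counter and metric.IHistogram:

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// recordUpdate counts success and failure separately but appends a latency
// sample for every attempt, failed ones included.
func recordUpdate(run func() error, failed, succeeded *int, latencyNanos *[]int64) {
	start := time.Now()
	if err := run(); err != nil {
		*failed++
	} else {
		*succeeded++
	}
	*latencyNanos = append(*latencyNanos, time.Since(start).Nanoseconds())
}

func main() {
	var failed, succeeded int
	var lat []int64
	recordUpdate(func() error { return nil }, &failed, &succeeded, &lat)
	recordUpdate(func() error { return errors.New("update failed") }, &failed, &succeeded, &lat)
	fmt.Println(succeeded, failed, len(lat)) // 1 1 2: latency recorded for both attempts
}
```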
4 changes: 3 additions & 1 deletion pkg/sql/sqlstats/persistedsqlstats/flush.go
@@ -62,9 +62,11 @@ func (s *PersistedSQLStats) Flush(ctx context.Context, stopper *stop.Stopper) {
return
}

fingerprintCount := s.SQLStats.GetTotalFingerprintCount()
s.cfg.FlushedFingerprintCount.Inc(fingerprintCount)
if log.V(1) {
log.Infof(ctx, "flushing %d stmt/txn fingerprints (%d bytes) after %s",
s.SQLStats.GetTotalFingerprintCount(), s.SQLStats.GetTotalFingerprintBytes(), timeutil.Since(s.lastFlushStarted))
fingerprintCount, s.SQLStats.GetTotalFingerprintBytes(), timeutil.Since(s.lastFlushStarted))
}
s.lastFlushStarted = now

9 changes: 6 additions & 3 deletions pkg/sql/sqlstats/persistedsqlstats/provider.go
@@ -46,9 +46,11 @@ type Config struct {
JobRegistry *jobs.Registry

// Metrics.
FlushCounter *metric.Counter
FlushDuration metric.IHistogram
FailureCounter *metric.Counter
FlushCounter *metric.Counter
FlushDuration metric.IHistogram
FlushDoneSignalsIgnored *metric.Counter
FailureCounter *metric.Counter
FlushedFingerprintCount *metric.Counter

// Testing knobs.
Knobs *sqlstats.TestingKnobs
@@ -211,6 +213,7 @@ func (s *PersistedSQLStats) startSQLStatsFlushLoop(ctx context.Context, stopper
// Don't block the flush loop if the sql activity update job is not
// ready to receive. We should at least continue to collect and flush
// stats for this node.
s.cfg.FlushDoneSignalsIgnored.Inc(1)
log.Warning(ctx, "sql-stats-worker: unable to signal flush completion")
}
}
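Note (minimal sketch, assuming channel mechanics not shown in this hunk): FlushDoneSignalsIgnored counts the case where the flush-done signal is dropped because the activity update job is not currently waiting to receive it, i.e. the default branch of a non-blocking send:

```go
package main

import "fmt"

// notifyFlushDone sketches the non-blocking send that the new counter
// instruments: when no receiver is ready, fall through to default, count the
// ignored signal, and keep flushing stats for this node.
func notifyFlushDone(flushDone chan struct{}, ignoredSignals *int64) {
	select {
	case flushDone <- struct{}{}:
		// The activity update job was waiting and will now run an update.
	default:
		*ignoredSignals++ // stands in for FlushDoneSignalsIgnored.Inc(1)
	}
}

func main() {
	flushDone := make(chan struct{}) // unbuffered handshake channel
	var ignored int64
	notifyFlushDone(flushDone, &ignored) // no receiver ready -> signal dropped
	fmt.Println("ignored flush-done signals:", ignored)
}
```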