Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pkg/sql/sqlstats: rework SQL stats flush metrics #120709

Merged
merged 2 commits into from
Mar 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -1474,15 +1474,15 @@
<tr><td>APPLICATION</td><td>sql.statements.active</td><td>Number of currently active user SQL statements</td><td>Active Statements</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.statements.active.internal</td><td>Number of currently active user SQL statements (internal queries)</td><td>SQL Internal Statements</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.update.latency</td><td>The latency of updates made by the SQL activity updater job. Includes failed update attempts</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.updates.failed</td><td>Number of update attempts made by the SQL activity updater job that failed with errors</td><td>failed updatesgi</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.updates.failed</td><td>Number of update attempts made by the SQL activity updater job that failed with errors</td><td>failed updates</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.updates.successful</td><td>Number of successful updates made by the SQL activity updater job</td><td>successful updates</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.cleanup.rows_removed</td><td>Number of stale statistics rows that are removed</td><td>SQL Stats Cleanup</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.discarded.current</td><td>Number of fingerprint statistics being discarded</td><td>Discarded SQL Stats</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.count</td><td>Number of times SQL Stats are flushed to persistent storage</td><td>SQL Stats Flush</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.done_signals.ignored</td><td>Number of times the SQL Stats activity update job ignored the signal sent to it indicating a flush has completed</td><td>flush done signals ignored</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.duration</td><td>Time took to in nanoseconds to complete SQL Stats flush</td><td>SQL Stats Flush</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.error</td><td>Number of errors encountered when flushing SQL Stats</td><td>SQL Stats Flush</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.fingerprint.count</td><td>The number of unique statement and transaction fingerprints included in the SQL Stats flush</td><td>statement &amp; transaction fingerprints</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.latency</td><td>The latency of SQL Stats flushes to persistent storage. Includes failed flush attempts</td><td>nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flushes.failed</td><td>Number of attempted SQL Stats flushes that failed with errors</td><td>failed flushes</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flushes.successful</td><td>Number of times SQL Stats are flushed successfully to persistent storage</td><td>successful flushes</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.mem.current</td><td>Current memory usage for fingerprint storage</td><td>Memory</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.mem.max</td><td>Memory usage for fingerprint storage</td><td>Memory</td><td>HISTOGRAM</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.reported.mem.current</td><td>Current memory usage for reported fingerprint storage</td><td>Memory</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
Expand Down
14 changes: 7 additions & 7 deletions pkg/sql/conn_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -488,11 +488,11 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
SQLIDContainer: cfg.NodeInfo.NodeID,
JobRegistry: s.cfg.JobRegistry,
Knobs: cfg.SQLStatsTestingKnobs,
FlushCounter: serverMetrics.StatsMetrics.SQLStatsFlushStarted,
FlushesSuccessful: serverMetrics.StatsMetrics.SQLStatsFlushesSuccessful,
FlushDoneSignalsIgnored: serverMetrics.StatsMetrics.SQLStatsFlushDoneSignalsIgnored,
FlushedFingerprintCount: serverMetrics.StatsMetrics.SQLStatsFlushFingerprintCount,
FailureCounter: serverMetrics.StatsMetrics.SQLStatsFlushFailure,
FlushDuration: serverMetrics.StatsMetrics.SQLStatsFlushDuration,
FlushesFailed: serverMetrics.StatsMetrics.SQLStatsFlushesFailed,
FlushLatency: serverMetrics.StatsMetrics.SQLStatsFlushLatency,
}, memSQLStats)

s.sqlStats = persistedSQLStats
Expand Down Expand Up @@ -589,14 +589,14 @@ func makeServerMetrics(cfg *ExecutorConfig) ServerMetrics {
}),
ReportedSQLStatsMemoryCurBytesCount: metric.NewGauge(MetaReportedSQLStatsMemCurBytes),
DiscardedStatsCount: metric.NewCounter(MetaDiscardedSQLStats),
SQLStatsFlushStarted: metric.NewCounter(MetaSQLStatsFlushStarted),
SQLStatsFlushesSuccessful: metric.NewCounter(MetaSQLStatsFlushesSuccessful),
SQLStatsFlushDoneSignalsIgnored: metric.NewCounter(MetaSQLStatsFlushDoneSignalsIgnored),
SQLStatsFlushFingerprintCount: metric.NewCounter(MetaSQLStatsFlushFingerprintCount),

SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure),
SQLStatsFlushDuration: metric.NewHistogram(metric.HistogramOptions{
SQLStatsFlushesFailed: metric.NewCounter(MetaSQLStatsFlushesFailed),
SQLStatsFlushLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Metadata: MetaSQLStatsFlushDuration,
Metadata: MetaSQLStatsFlushLatency,
Duration: 6 * metricsSampleInterval,
BucketConfig: metric.IOLatencyBuckets,
}),
Expand Down
24 changes: 12 additions & 12 deletions pkg/sql/exec_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -1088,10 +1088,10 @@ var (
Measurement: "Discarded SQL Stats",
Unit: metric.Unit_COUNT,
}
MetaSQLStatsFlushStarted = metric.Metadata{
Name: "sql.stats.flush.count",
Help: "Number of times SQL Stats are flushed to persistent storage",
Measurement: "SQL Stats Flush",
MetaSQLStatsFlushesSuccessful = metric.Metadata{
Name: "sql.stats.flushes.successful",
Help: "Number of times SQL Stats are flushed successfully to persistent storage",
Measurement: "successful flushes",
Unit: metric.Unit_COUNT,
}
MetaSQLStatsFlushFingerprintCount = metric.Metadata{
Expand All @@ -1108,16 +1108,16 @@ var (
Unit: metric.Unit_COUNT,
MetricType: io_prometheus_client.MetricType_COUNTER,
}
MetaSQLStatsFlushFailure = metric.Metadata{
Name: "sql.stats.flush.error",
Help: "Number of errors encountered when flushing SQL Stats",
Measurement: "SQL Stats Flush",
MetaSQLStatsFlushesFailed = metric.Metadata{
Name: "sql.stats.flushes.failed",
Help: "Number of attempted SQL Stats flushes that failed with errors",
Measurement: "failed flushes",
Unit: metric.Unit_COUNT,
}
MetaSQLStatsFlushDuration = metric.Metadata{
Name: "sql.stats.flush.duration",
Help: "Time took to in nanoseconds to complete SQL Stats flush",
Measurement: "SQL Stats Flush",
MetaSQLStatsFlushLatency = metric.Metadata{
Name: "sql.stats.flush.latency",
Help: "The latency of SQL Stats flushes to persistent storage. Includes failed flush attempts",
Measurement: "nanoseconds",
Unit: metric.Unit_NANOSECONDS,
}
MetaSQLStatsRemovedRows = metric.Metadata{
Expand Down
6 changes: 3 additions & 3 deletions pkg/sql/executor_statement_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,11 @@ type StatsMetrics struct {

DiscardedStatsCount *metric.Counter

SQLStatsFlushStarted *metric.Counter
SQLStatsFlushesSuccessful *metric.Counter
SQLStatsFlushDoneSignalsIgnored *metric.Counter
SQLStatsFlushFingerprintCount *metric.Counter
SQLStatsFlushFailure *metric.Counter
SQLStatsFlushDuration metric.IHistogram
SQLStatsFlushesFailed *metric.Counter
SQLStatsFlushLatency metric.IHistogram
SQLStatsRemovedRows *metric.Counter

SQLTxnStatsCollectionOverhead metric.IHistogram
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/sql_activity_update_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ func newActivityUpdaterMetrics() metric.Struct {
NumFailedUpdates: metric.NewCounter(metric.Metadata{
Name: "sql.stats.activity.updates.failed",
Help: "Number of update attempts made by the SQL activity updater job that failed with errors",
Measurement: "failed updatesgi",
Measurement: "failed updates",
Unit: metric.Unit_COUNT,
MetricType: io_prometheus_client.MetricType_COUNTER,
}),
Expand Down
7 changes: 4 additions & 3 deletions pkg/sql/sqlstats/persistedsqlstats/flush.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,12 +168,13 @@ func (s *PersistedSQLStats) doFlush(ctx context.Context, workFn func() error, er

defer func() {
if err != nil {
s.cfg.FailureCounter.Inc(1)
s.cfg.FlushesFailed.Inc(1)
log.Warningf(ctx, "%s: %s", errMsg, err)
} else {
s.cfg.FlushesSuccessful.Inc(1)
}
flushDuration := s.getTimeNow().Sub(flushBegin)
s.cfg.FlushDuration.RecordValue(flushDuration.Nanoseconds())
s.cfg.FlushCounter.Inc(1)
s.cfg.FlushLatency.RecordValue(flushDuration.Nanoseconds())
}()

err = workFn()
Expand Down
6 changes: 3 additions & 3 deletions pkg/sql/sqlstats/persistedsqlstats/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,10 @@ type Config struct {
JobRegistry *jobs.Registry

// Metrics.
FlushCounter *metric.Counter
FlushDuration metric.IHistogram
FlushesSuccessful *metric.Counter
FlushLatency metric.IHistogram
FlushDoneSignalsIgnored *metric.Counter
FailureCounter *metric.Counter
FlushesFailed *metric.Counter
FlushedFingerprintCount *metric.Counter

// Testing knobs.
Expand Down
Loading