Skip to content

Commit

Permalink
pkg/sql/sqlstats: counter metric for flushed fingerprint counts
Browse files Browse the repository at this point in the history
Addresses: cockroachdb#119779

The count of unique fingeprints flushed to `system.statement_statistics`
and `system.transaction_statistics` is the core component that
determines data cardinality within the SQL stats subsystem. Today, we
don't have good metrics around this source of cardinality. As we aim to
reduce cardinality by improving our fingerprinting algorithms, creating
a metric to count the number of unique statement and transaction
fingerprints included in each flush of the in-memory SQL stats will be a
helpful measurement to benchmark cardinality reduction.

This patch adds a new metric to track the # of unique fingerprints (stmt
and txn) included in each flush.

Release note (ops change): A new counter metric,
`sql.stats.flush.fingerprint.count`, has been introduced. The metric
tracks the number of unique statement and transaction fingerprints
included in the SQL Stats flush.
  • Loading branch information
abarganier committed May 13, 2024
1 parent 4d41e76 commit c52aa2b
Show file tree
Hide file tree
Showing 7 changed files with 16 additions and 3 deletions.
3 changes: 2 additions & 1 deletion docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -1444,14 +1444,15 @@
<tr><td>APPLICATION</td><td>sql.statements.active</td><td>Number of currently active user SQL statements</td><td>Active Statements</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.statements.active.internal</td><td>Number of currently active user SQL statements (internal queries)</td><td>SQL Internal Statements</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.update.latency</td><td>The latency of updates made by the SQL activity updater job. Includes failed update attempts</td><td>Nanoseconds</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.updates.failed</td><td>Number of update attempts made by the SQL activity updater job that failed with errors</td><td>failed updatesgi</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.updates.failed</td><td>Number of update attempts made by the SQL activity updater job that failed with errors</td><td>failed updates</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.activity.updates.successful</td><td>Number of successful updates made by the SQL activity updater job</td><td>successful updates</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.cleanup.rows_removed</td><td>Number of stale statistics rows that are removed</td><td>SQL Stats Cleanup</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.discarded.current</td><td>Number of fingerprint statistics being discarded</td><td>Discarded SQL Stats</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.count</td><td>Number of times SQL Stats are flushed to persistent storage</td><td>SQL Stats Flush</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.done_signals.ignored</td><td>Number of times the SQL Stats activity update job ignored the signal sent to it indicating a flush has completed</td><td>flush done signals ignored</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.duration</td><td>Time took to in nanoseconds to complete SQL Stats flush</td><td>SQL Stats Flush</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.error</td><td>Number of errors encountered when flushing SQL Stats</td><td>SQL Stats Flush</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.flush.fingerprint.count</td><td>The number of unique statement and transaction fingerprints included in the SQL Stats flush</td><td>statement &amp; transaction fingerprints</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.mem.current</td><td>Current memory usage for fingerprint storage</td><td>Memory</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.mem.max</td><td>Memory usage for fingerprint storage</td><td>Memory</td><td>HISTOGRAM</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.stats.reported.mem.current</td><td>Current memory usage for reported fingerprint storage</td><td>Memory</td><td>GAUGE</td><td>BYTES</td><td>AVG</td><td>NONE</td></tr>
Expand Down
2 changes: 2 additions & 0 deletions pkg/sql/conn_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,7 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
Knobs: cfg.SQLStatsTestingKnobs,
FlushCounter: serverMetrics.StatsMetrics.SQLStatsFlushStarted,
FlushDoneSignalsIgnored: serverMetrics.StatsMetrics.SQLStatsFlushDoneSignalsIgnored,
FlushedFingerprintCount: serverMetrics.StatsMetrics.SQLStatsFlushFingerprintCount,
FailureCounter: serverMetrics.StatsMetrics.SQLStatsFlushFailure,
FlushDuration: serverMetrics.StatsMetrics.SQLStatsFlushDuration,
}, memSQLStats)
Expand Down Expand Up @@ -587,6 +588,7 @@ func makeServerMetrics(cfg *ExecutorConfig) ServerMetrics {
DiscardedStatsCount: metric.NewCounter(MetaDiscardedSQLStats),
SQLStatsFlushStarted: metric.NewCounter(MetaSQLStatsFlushStarted),
SQLStatsFlushDoneSignalsIgnored: metric.NewCounter(MetaSQLStatsFlushDoneSignalsIgnored),
SQLStatsFlushFingerprintCount: metric.NewCounter(MetaSQLStatsFlushFingerprintCount),

SQLStatsFlushFailure: metric.NewCounter(MetaSQLStatsFlushFailure),
SQLStatsFlushDuration: metric.NewHistogram(metric.HistogramOptions{
Expand Down
6 changes: 6 additions & 0 deletions pkg/sql/exec_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -1135,6 +1135,12 @@ var (
Measurement: "SQL Stats Flush",
Unit: metric.Unit_COUNT,
}
MetaSQLStatsFlushFingerprintCount = metric.Metadata{
Name: "sql.stats.flush.fingerprint.count",
Help: "The number of unique statement and transaction fingerprints included in the SQL Stats flush",
Measurement: "statement & transaction fingerprints",
Unit: metric.Unit_COUNT,
}
MetaSQLStatsFlushDoneSignalsIgnored = metric.Metadata{
Name: "sql.stats.flush.done_signals.ignored",
Help: "Number of times the SQL Stats activity update job ignored the signal sent to it indicating " +
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/executor_statement_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ type StatsMetrics struct {

SQLStatsFlushStarted *metric.Counter
SQLStatsFlushDoneSignalsIgnored *metric.Counter
SQLStatsFlushFingerprintCount *metric.Counter
SQLStatsFlushFailure *metric.Counter
SQLStatsFlushDuration metric.IHistogram
SQLStatsRemovedRows *metric.Counter
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/sql_activity_update_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ func newActivityUpdaterMetrics() metric.Struct {
NumFailedUpdates: metric.NewCounter(metric.Metadata{
Name: "sql.stats.activity.updates.failed",
Help: "Number of update attempts made by the SQL activity updater job that failed with errors",
Measurement: "failed updatesgi",
Measurement: "failed updates",
Unit: metric.Unit_COUNT,
MetricType: io_prometheus_client.MetricType_COUNTER,
}),
Expand Down
4 changes: 3 additions & 1 deletion pkg/sql/sqlstats/persistedsqlstats/flush.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,10 @@ func (s *PersistedSQLStats) Flush(ctx context.Context) {
return
}

fingerprintCount := s.SQLStats.GetTotalFingerprintCount()
s.cfg.FlushedFingerprintCount.Inc(fingerprintCount)
log.Infof(ctx, "flushing %d stmt/txn fingerprints (%d bytes) after %s",
s.SQLStats.GetTotalFingerprintCount(), s.SQLStats.GetTotalFingerprintBytes(), timeutil.Since(s.lastFlushStarted))
fingerprintCount, s.SQLStats.GetTotalFingerprintBytes(), timeutil.Since(s.lastFlushStarted))
s.lastFlushStarted = now

aggregatedTs := s.ComputeAggregatedTs()
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/sqlstats/persistedsqlstats/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type Config struct {
FlushDuration metric.IHistogram
FlushDoneSignalsIgnored *metric.Counter
FailureCounter *metric.Counter
FlushedFingerprintCount *metric.Counter

// Testing knobs.
Knobs *sqlstats.TestingKnobs
Expand Down

0 comments on commit c52aa2b

Please sign in to comment.