Skip to content

Commit

Permalink
metrics: expose pebble fsync latency as prometheus metric
Browse files Browse the repository at this point in the history
Leverage the metric callback exposed in pebble to update the store’s
histogram. The callbacks must be set up in `pebble.Open`, but the metric
to update is defined in `kvserver/metrics.go`. To bridge the two, we store
the callbacks in the `onMetricEvent pebble.MetricEventListener` field of
the `Pebble` struct and hand pebble a listener that wraps each callback
with a `nil` check, because the callbacks are only set later, inside the
store's start method `Store.Start()` in `kvserver/store.go`. This allows
for deferring the setting of the callback handlers until we have access
to the metrics to update.

Diagram of the above description:

```
     type Pebble struct {
       onMetricEvent struct {
  +--->  SomeCallback func(duration time.Duration) <--------------+
  |    }                                                          |
  |  }                                                            |
  |                                                               |  Checks
  |  func NewPebble(...) {                                        |
  |    ...                                                        |
  |    cfg.Opts.MetricEventListener = pebble.MetricEventListener{ |
  |      SomeCallback: func(duration time.Duration) {             |
S |    	   if p.onMetricEvent.SomeCallback != nil { +-------------^
e |    	     p.onMetricEvent.SomeCallback(duration) +-------------v
t |    	   }                                                      |
s |    	 },                                                       |
  |    }                                                          |
  |                                                               |
  |    db, err := pebble.Open(cfg.StorageConfig.Dir, cfg.Opts)    |
  |    ...                                                        |  Calls
  |  }                                                            |
  |                                                               |
  |  func (s *Store) Start(...) {                                 |
  |    ...                                                        |
  ^--- s.engine.RegisterMetricEventListener(                      |
         pebble.MetricEventListener{                              |
           SomeCallback: func(duration time.Duration) {           |
             s.metrics.FsyncLatency.RecordValue(duration)<--------v
           }
         })
       ...
     }
```

Release note: None
  • Loading branch information
coolcom200 committed Oct 11, 2022
1 parent f70bb35 commit 347c019
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 8 deletions.
12 changes: 12 additions & 0 deletions monitoring/rules/aggregation.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,15 @@ groups:
expr: histogram_quantile(0.95, raft_process_commandcommit_latency_bucket:rate1m)
- record: raft_process_commandcommit_latency:rate1m:quantile_99
expr: histogram_quantile(0.99, raft_process_commandcommit_latency_bucket:rate1m)
- record: pebble_fsync_latency_bucket:rate1m
expr: rate(pebble_fsync_latency_bucket{job="cockroachdb"}[1m])
- record: pebble_fsync_latency:rate1m:quantile_50
expr: histogram_quantile(0.5, pebble_fsync_latency_bucket:rate1m)
- record: pebble_fsync_latency:rate1m:quantile_75
expr: histogram_quantile(0.75, pebble_fsync_latency_bucket:rate1m)
- record: pebble_fsync_latency:rate1m:quantile_90
expr: histogram_quantile(0.9, pebble_fsync_latency_bucket:rate1m)
- record: pebble_fsync_latency:rate1m:quantile_95
expr: histogram_quantile(0.95, pebble_fsync_latency_bucket:rate1m)
- record: pebble_fsync_latency:rate1m:quantile_99
expr: histogram_quantile(0.99, pebble_fsync_latency_bucket:rate1m)
9 changes: 9 additions & 0 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -1669,6 +1669,13 @@ Note that the measurement does not include the duration for replicating the eval
Measurement: "Flush Utilization",
Unit: metric.Unit_PERCENT,
}

metaPebbleFsyncLatency = metric.Metadata{
Name: "pebble.fsync.latency",
Help: "The pebble write ahead log writer fsync latency",
Measurement: "Fsync Latency",
Unit: metric.Unit_NANOSECONDS,
}
)

// StoreMetrics is the set of metrics for a given store.
Expand Down Expand Up @@ -1965,6 +1972,7 @@ type StoreMetrics struct {
ReplicaWriteBatchEvaluationLatency *metric.Histogram

FlushUtilization *metric.GaugeFloat64
FsyncLatency *metric.Histogram
}

type tenantMetricsRef struct {
Expand Down Expand Up @@ -2503,6 +2511,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
metaReplicaWriteBatchEvaluationLatency, histogramWindow, metric.IOLatencyBuckets,
),
FlushUtilization: metric.NewGaugeFloat64(metaPebbleFlushUtilization),
FsyncLatency: metric.NewHistogram(metaPebbleFsyncLatency, histogramWindow, metric.IOLatencyBuckets),
}

{
Expand Down
4 changes: 4 additions & 0 deletions pkg/kv/kvserver/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -1880,6 +1880,10 @@ func (s *Store) Start(ctx context.Context, stopper *stop.Stopper) error {
})
s.metrics.registry.AddMetricStruct(s.intentResolver.Metrics)

// Forward Pebble's WAL fsync latency events into the store's histogram.
// metaPebbleFsyncLatency is declared with metric.Unit_NANOSECONDS, so the
// duration is recorded in nanoseconds (not microseconds) to keep the
// recorded samples consistent with the advertised unit.
s.engine.RegisterMetricEventListener(pebble.MetricEventListener{
	WALFsyncLatency: func(duration time.Duration) {
		s.metrics.FsyncLatency.RecordValue(duration.Nanoseconds())
	},
})

// Create the raft log truncator and register the callback.
s.raftTruncator = makeRaftLogTruncator(s.cfg.AmbientCtx, (*storeForTruncatorImpl)(s), stopper)
{
Expand Down
5 changes: 5 additions & 0 deletions pkg/storage/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -929,6 +929,11 @@ type Engine interface {
// of the callback since it could cause a deadlock (since the callback may
// be invoked while holding mutexes).
RegisterFlushCompletedCallback(cb func())
// RegisterMetricEventListener registers the provided listener. The listener
// has a collection of callback functions. Calling this replaces any
// previously registered listener struct. Each callback in the
// pebble.MetricEventListener is invoked separately.
RegisterMetricEventListener(listener pebble.MetricEventListener)
// Filesystem functionality.
fs.FS
// CreateCheckpoint creates a checkpoint of the engine in the given directory,
Expand Down
16 changes: 8 additions & 8 deletions pkg/storage/pebble.go
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,7 @@ type Pebble struct {
diskSlowCount int64
diskStallCount int64

onMetricCallback pebble.MetricCallbacks
onMetricEvent pebble.MetricEventListener

// Relevant options copied over from pebble.Options.
fs vfs.FS
Expand Down Expand Up @@ -933,10 +933,10 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (p *Pebble, err error) {
return nil, err
}

cfg.Opts.OnMetrics = pebble.MetricCallbacks{
LogWriterFsyncLatency: func(duration int64) {
if p.onMetricCallback.LogWriterFsyncLatency != nil {
p.onMetricCallback.LogWriterFsyncLatency(duration)
cfg.Opts.MetricEventListener = pebble.MetricEventListener{
WALFsyncLatency: func(duration time.Duration) {
if p.onMetricEvent.WALFsyncLatency != nil {
p.onMetricEvent.WALFsyncLatency(duration)
}
},
}
Expand Down Expand Up @@ -1703,9 +1703,9 @@ func (p *Pebble) RegisterFlushCompletedCallback(cb func()) {
p.mu.Unlock()
}

// RegisterMetricCallbacks implements the Engine interface
func (p *Pebble) RegisterMetricCallbacks(callbacks pebble.MetricCallbacks) {
p.onMetricCallback = callbacks
// RegisterMetricEventListener implements the Engine interface.
//
// The supplied listener is stored on the Pebble instance, overwriting any
// listener registered earlier.
//
// NOTE(review): the field is assigned here without taking a lock; confirm
// that registration cannot race with callbacks reading p.onMetricEvent.
func (p *Pebble) RegisterMetricEventListener(listener pebble.MetricEventListener) {
	p.onMetricEvent = listener
}

// Remove implements the FS interface.
Expand Down

0 comments on commit 347c019

Please sign in to comment.