Skip to content

Commit

Permalink
metric: add prometheus-based histogram
Browse files Browse the repository at this point in the history
Our current histogram is based on `hdrhistogram`. This tends to create
lots of buckets and is inflexible w.r.t the bucket layout. In hindsight,
it was a poor choice of default implementation (I can say that since I
introduced it) and its cost is disincentivizing the introduction of
histograms that would be useful.

This commit introduces a histogram that is based on a completely vanilla
`prometheus.Histogram`. The only reason we need to wrap it is because we
want to export quantiles to CockraochDB's internal timeseries (it does
not support histograms) and this requires maintaining an internal windowed
histogram (on top of the cumulative histogram).

With this done, we can now introduce metrics with any kind of buckets we
want. Helpfully, we introduce two common kinds of buckets, suitable for
IO-type and RPC-type latencies. These are defined in a human-readable
format by explicitly listing out the buckets.

We can move existing metrics to HistogramV2 easily, assuming we are not
concerned with existing prometheus scrapers getting tripped up by the
changes in bucket boundaries. I assume this is not a big deal in
practice as long as it doesn't happen "all the time". In fact, I would
strongly suggest we move all metrics wholesale and remove the
hdrhistogram-based implementation. If this is not acceptable for some
reason, we ought to at least deprecated it.

We also slightly improve the existing `Histogram` code by unifying how
the windowed histograms are ticked and by making explicit where their
quantiles are recorded (this dependency was previously hidden via a
local interface assertion).

Resolves #10015.
Resolves #64962.
Alternative to dhartunian@eac3d06

TODO
- export quantiles (how to do this? Not clear, don't see the code
  laying around in prometheus, might have to hand-roll it but should
  be easy enough)

Release note: None
  • Loading branch information
tbg authored and aadityasondhi committed Aug 26, 2022
1 parent aa4f001 commit eb82a43
Show file tree
Hide file tree
Showing 10 changed files with 291 additions and 83 deletions.
1 change: 1 addition & 0 deletions pkg/kv/kvserver/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ go_library(
"@com_github_gogo_protobuf//proto",
"@com_github_google_btree//:btree",
"@com_github_kr_pretty//:pretty",
"@com_github_prometheus_client_golang//prometheus",
"@io_etcd_go_etcd_raft_v3//:raft",
"@io_etcd_go_etcd_raft_v3//raftpb",
"@io_etcd_go_etcd_raft_v3//tracker",
Expand Down
9 changes: 6 additions & 3 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/slidingwindow"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/prometheus/client_golang/prometheus"
"go.etcd.io/etcd/raft/v3/raftpb"
)

Expand Down Expand Up @@ -1752,7 +1753,7 @@ type StoreMetrics struct {
RaftCommandCommitLatency *metric.Histogram
RaftHandleReadyLatency *metric.Histogram
RaftApplyCommittedLatency *metric.Histogram
RaftSchedulerLatency *metric.Histogram
RaftSchedulerLatency *metric.HistogramV2
RaftTimeoutCampaign *metric.Counter

// Raft message metrics.
Expand Down Expand Up @@ -2255,8 +2256,10 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
RaftCommandCommitLatency: metric.NewLatency(metaRaftCommandCommitLatency, histogramWindow),
RaftHandleReadyLatency: metric.NewLatency(metaRaftHandleReadyLatency, histogramWindow),
RaftApplyCommittedLatency: metric.NewLatency(metaRaftApplyCommittedLatency, histogramWindow),
RaftSchedulerLatency: metric.NewLatency(metaRaftSchedulerLatency, histogramWindow),
RaftTimeoutCampaign: metric.NewCounter(metaRaftTimeoutCampaign),
RaftSchedulerLatency: metric.NewHistogramV2(metaRaftSchedulerLatency, histogramWindow, prometheus.HistogramOpts{
Buckets: metric.IOLatencyBuckets,
}),
RaftTimeoutCampaign: metric.NewCounter(metaRaftTimeoutCampaign),

// Raft message metrics.
RaftRcvdMessages: [...]*metric.Counter{
Expand Down
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ type raftScheduleState struct {
type raftScheduler struct {
ambientContext log.AmbientContext
processor raftProcessor
latency *metric.Histogram
latency *metric.HistogramV2
numWorkers int

mu struct {
Expand Down
2 changes: 2 additions & 0 deletions pkg/server/status/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ go_library(
"@com_github_codahale_hdrhistogram//:hdrhistogram",
"@com_github_dustin_go_humanize//:go-humanize",
"@com_github_elastic_gosigar//:gosigar",
"@com_github_prometheus_client_golang//prometheus",
"@com_github_prometheus_client_model//go",
"@com_github_shirou_gopsutil_v3//net",
] + select({
"@io_bazel_rules_go//go/platform:aix": [
Expand Down
31 changes: 22 additions & 9 deletions pkg/server/status/recorder.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/system"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
"github.com/codahale/hdrhistogram"
humanize "github.com/dustin/go-humanize"
"github.com/dustin/go-humanize"
"github.com/elastic/gosigar"
prometheusgo "github.com/prometheus/client_model/go"
)

const (
Expand Down Expand Up @@ -531,12 +531,10 @@ func extractValue(name string, mtr interface{}, fn func(string, float64)) error
// TODO(tschottdorf,ajwerner): consider moving this switch to a single
// interface implemented by the individual metric types.
type (
float64Valuer interface{ Value() float64 }
int64Valuer interface{ Value() int64 }
int64Counter interface{ Count() int64 }
histogramValuer interface {
Windowed() (*hdrhistogram.Histogram, time.Duration)
}
float64Valuer interface{ Value() float64 }
int64Valuer interface{ Value() int64 }
int64Counter interface{ Count() int64 }
prometheusMetricValuer interface{ ToPrometheusMetric() *prometheusgo.Metric }
)
switch mtr := mtr.(type) {
case float64:
Expand All @@ -547,7 +545,7 @@ func extractValue(name string, mtr interface{}, fn func(string, float64)) error
fn(name, float64(mtr.Value()))
case int64Counter:
fn(name, float64(mtr.Count()))
case histogramValuer:
case *metric.Histogram:
// TODO(mrtracy): Where should this comment go for better
// visibility?
//
Expand All @@ -567,6 +565,21 @@ func extractValue(name string, mtr interface{}, fn func(string, float64)) error
fn(name+pt.suffix, float64(curr.ValueAtQuantile(pt.quantile)))
}
fn(name+"-count", float64(curr.TotalCount()))
case *metric.HistogramV2:
// NB: this branch is intentionally at the bottom since all metrics implement it.
cur := mtr.Windowed()
var m prometheusgo.Metric
_ = cur.Write(&m)
hist := m.Histogram
n := float64(*hist.SampleCount)
fn(name+"-count", n)
fn(name+"-avg", *hist.SampleSum/n)
// TODO(obs-inf): add quantiles like for the hdrhistogram.
case prometheusMetricValuer:
// TODO we should be able to handle all non-histogram branches using this, i.e.
// can delete the float, int, etc, cases above.
_ = mtr.ToPrometheusMetric()

default:
return errors.Errorf("cannot extract value for type %T", mtr)
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/util/metric/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ go_library(
"registry.go",
"rule.go",
"rule_registry.go",
"sliding_histogram.go",
"test_helpers.go",
"tick_helper.go",
],
embed = [":metric_go_proto"],
importpath = "github.com/cockroachdb/cockroach/pkg/util/metric",
Expand Down
176 changes: 168 additions & 8 deletions pkg/util/metric/metric.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/codahale/hdrhistogram"
"github.com/gogo/protobuf/proto"
"github.com/prometheus/client_golang/prometheus"
prometheusgo "github.com/prometheus/client_model/go"
metrics "github.com/rcrowley/go-metrics"
)
Expand Down Expand Up @@ -185,7 +186,8 @@ type Histogram struct {
mu struct {
syncutil.Mutex
cumulative *hdrhistogram.Histogram
sliding *slidingHistogram
*tickHelper
sliding *hdrhistogram.WindowedHistogram
}
}

Expand All @@ -194,13 +196,20 @@ type Histogram struct {
// track nonnegative values up to 'maxVal' with 'sigFigs' decimal points of
// precision.
func NewHistogram(metadata Metadata, duration time.Duration, maxVal int64, sigFigs int) *Histogram {
dHist := newSlidingHistogram(duration, maxVal, sigFigs)
h := &Histogram{
Metadata: metadata,
maxVal: maxVal,
}
wHist := hdrhistogram.NewWindowed(histWrapNum, 0, maxVal, sigFigs)
h.mu.cumulative = hdrhistogram.New(0, maxVal, sigFigs)
h.mu.sliding = dHist
h.mu.sliding = wHist
h.mu.tickHelper = &tickHelper{
nextT: now(),
tickInterval: duration / histWrapNum,
onTick: func() {
wHist.Rotate()
},
}
return h
}

Expand All @@ -222,7 +231,9 @@ func NewLatency(metadata Metadata, histogramWindow time.Duration) *Histogram {
func (h *Histogram) Windowed() (*hdrhistogram.Histogram, time.Duration) {
h.mu.Lock()
defer h.mu.Unlock()
return cloneHistogram(h.mu.sliding.Current()), h.mu.sliding.duration
// TODO(obs-inf): not sure we should multiply by histWrapNum here, but it
// has been the behavior for a long time.
return cloneHistogram(h.mu.sliding.Merge()), histWrapNum * h.mu.tickInterval
}

// Snapshot returns a copy of the cumulative (i.e. all-time samples) histogram
Expand All @@ -240,8 +251,8 @@ func (h *Histogram) RecordValue(v int64) {
h.mu.Lock()
defer h.mu.Unlock()

if h.mu.sliding.RecordValue(v) != nil {
_ = h.mu.sliding.RecordValue(h.maxVal)
if h.mu.sliding.Current.RecordValue(v) != nil {
_ = h.mu.sliding.Current.RecordValue(h.maxVal)
}
if h.mu.cumulative.RecordValue(v) != nil {
_ = h.mu.cumulative.RecordValue(h.maxVal)
Expand All @@ -265,7 +276,7 @@ func (h *Histogram) Min() int64 {
// Inspect calls the closure with the empty string and the receiver.
func (h *Histogram) Inspect(f func(interface{})) {
h.mu.Lock()
maybeTick(h.mu.sliding)
maybeTick(h.mu.tickHelper)
h.mu.Unlock()
f(h)
}
Expand All @@ -280,7 +291,7 @@ func (h *Histogram) ToPrometheusMetric() *prometheusgo.Metric {
hist := &prometheusgo.Histogram{}

h.mu.Lock()
maybeTick(h.mu.sliding)
maybeTick(h.mu.tickHelper)
bars := h.mu.cumulative.Distribution()
hist.Bucket = make([]*prometheusgo.Bucket, 0, len(bars))

Expand Down Expand Up @@ -319,6 +330,155 @@ func (h *Histogram) GetMetadata() Metadata {
return baseMetadata
}

// IOLatencyBuckets are prometheus histogram buckets suitable for a histogram
// that records a quantity (nanosecond-denominated) in which most measurements
// resemble those of typical disk latencies, i.e. which are in the micro- and
// millisecond range during normal operation.
var IOLatencyBuckets = []float64{
// Generated via TestHistogramBuckets/IOLatencyBuckets.
10000.000000, // 10µs
26826.957953, // 26.826µs
71968.567300, // 71.968µs
193069.772888, // 193.069µs
517947.467923, // 517.947µs
1389495.494373, // 1.389495ms
3727593.720315, // 3.727593ms
10000000.000000, // 9.999999ms
26826957.952797, // 26.826957ms
71968567.300115, // 71.968567ms
193069772.888325, // 193.069772ms
517947467.923120, // 517.947467ms
1389495494.373135, // 1.389495494s
3727593720.314933, // 3.72759372s
9999999999.999981, // 9.999999999s
}

// NetworkLatencyBuckets are prometheus histogram buckets suitable for a histogram
// that records a quantity (nanosecond-denominated) in which most measurements
// behave like network latencies, i.e. most measurements are in the ms to sub-second
// range during normal operation.
var NetworkLatencyBuckets = []float64{
// Generated via TestHistogramBuckets/NetworkLatencyBuckets.
500000.000000, // 500µs
860513.842995, // 860.513µs
1480968.147973, // 1.480968ms
2548787.184731, // 2.548787ms
4386533.310619, // 4.386533ms
7549345.273094, // 7.549345ms
12992632.226094, // 12.992632ms
22360679.774998, // 22.360679ms
38483348.970335, // 38.483348ms
66230909.027573, // 66.230909ms
113985228.104760, // 113.985228ms
196171733.362212, // 196.171733ms
337616984.325077, // 337.616984ms
581048177.284016, // 581.048177ms
999999999.999999, // 999.999999ms,
}

// NewHistogramV2 is a prometheus-backed histogram. Depending on the value of
// opts.Buckets, this is suitable for recording any kind of quantity. Common
// sensible choices are {IO,Network}LatencyBuckets.
func NewHistogramV2(
meta Metadata, windowDuration time.Duration, opts prometheus.HistogramOpts,
) *HistogramV2 {
// TODO(obs-inf): prometheus supports labeled histograms but they require more
// plumbing and don't fit into the PrometheusObservable interface any more.
cum := prometheus.NewHistogram(opts)
h := &HistogramV2{
Metadata: meta,
cum: cum,
}
h.windowed.tickHelper = &tickHelper{
nextT: now(),
tickInterval: windowDuration,
onTick: func() {
h.windowed.prev = h.windowed.cur
h.windowed.cur = prometheus.NewHistogram(opts)
},
}
h.windowed.tickHelper.onTick()
return h
}

var _ periodic = (*HistogramV2)(nil)
var _ PrometheusExportable = (*HistogramV2)(nil)

type HistogramV2 struct {
Metadata
cum prometheus.Histogram

// TODO(obs-inf): the way we implement windowed histograms is not great. If
// the windowed histogram is pulled right after a tick, it will be mostly
// empty. We could add a third bucket and represent the merged view of the two
// most recent buckets to avoid that. Or we could "just" double the rotation
// interval (so that the histogram really collects for 20s when we expect to
// persist the contents every 10s). Really it would make more sense to
// explicitly rotate the histogram atomically with collecting its contents,
// but that is now how we have set it up right now. It should be doable
// though, since there is only one consumer of windowed histograms - our
// internal timeseries system.
windowed struct {
// prometheus.Histogram is thread safe, so we only
// need an RLock to record into it. But write lock
// is held while rotating.
syncutil.RWMutex
*tickHelper
prev, cur prometheus.Histogram
}
}

func (h *HistogramV2) nextTick() time.Time {
h.windowed.RLock()
defer h.windowed.RUnlock()
return h.windowed.nextTick()
}

func (h *HistogramV2) tick() {
h.windowed.Lock()
defer h.windowed.Unlock()
h.windowed.tick()
}

func (h *HistogramV2) Windowed() prometheus.Histogram {
h.windowed.RLock()
defer h.windowed.RUnlock()
return h.windowed.cur
}

func (h *HistogramV2) RecordValue(n int64) {
v := float64(n)
h.cum.Observe(v)

h.windowed.RLock()
defer h.windowed.RUnlock()
h.windowed.cur.Observe(v)

}

func (h *HistogramV2) GetType() *prometheusgo.MetricType {
return prometheusgo.MetricType_HISTOGRAM.Enum()
}

func (h *HistogramV2) ToPrometheusMetric() *prometheusgo.Metric {
m := &prometheusgo.Metric{}
if err := h.cum.Write(m); err != nil {
panic(err) // TODD
}
return m
}

func (h *HistogramV2) GetMetadata() Metadata {
return h.Metadata
}

func (h *HistogramV2) Inspect(f func(interface{})) {
h.windowed.Lock()
maybeTick(&h.windowed)
h.windowed.Unlock()
f(h)
}

// A Counter holds a single mutable atomic value.
type Counter struct {
Metadata
Expand Down
Loading

0 comments on commit eb82a43

Please sign in to comment.