kvprober: add metrics to the registry & catalog
This commit adds the kvprober metrics to the registry so that they are
exported at the Prometheus endpoint and tracked in CRDB's time-series
DB. It also adds the kvprober metrics to the chart catalog, since a
unit test requires every registered metric to appear there.

Release justification: Auxiliary system that is off by default.
Release note: None.
joshimhoff committed Mar 5, 2021
1 parent 46dd319 commit 0dd1e55
Showing 5 changed files with 62 additions and 37 deletions.
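The registry half of this change relies on CRDB's metric.Registry.AddMetricStruct, which walks the exported fields of a struct and registers every metric it finds, making it visible at the Prometheus endpoint and in the time-series DB. A minimal sketch of that pattern, using an illustrative metric name and help text (the prober's real metadata is defined in pkg/kv/kvprober, not here):

package example

import "github.com/cockroachdb/cockroach/pkg/util/metric"

// Illustrative metadata; only the structure mirrors what kvprober does.
var metaExampleAttempts = metric.Metadata{
  Name:        "example.prober.attempts", // hypothetical name
  Help:        "Number of probe attempts",
  Measurement: "Probes",
  Unit:        metric.Unit_COUNT,
}

// ExampleMetrics has the shape AddMetricStruct expects: exported fields
// that hold metric objects.
type ExampleMetrics struct {
  Attempts *metric.Counter
}

func register(registry *metric.Registry) ExampleMetrics {
  m := ExampleMetrics{Attempts: metric.NewCounter(metaExampleAttempts)}
  // AddMetricStruct reflects over m's fields and registers each metric.
  registry.AddMetricStruct(m)
  return m
}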
22 changes: 13 additions & 9 deletions pkg/kv/kvprober/kvprober.go
@@ -39,10 +39,9 @@ type Prober struct {
   settings *cluster.Settings
   // planner is an interface for selecting a range to probe.
   planner planner
-
- // Metrics wraps up the set of prometheus metrics that the prober sets. The
+ // metrics wraps up the set of prometheus metrics that the prober sets; the
   // goal of the prober IS to populate these metrics.
- Metrics Metrics
+ metrics Metrics
 }

 // Opts provides knobs to control kvprober.Prober.
@@ -113,7 +112,7 @@ func NewProber(opts Opts) *Prober {
     settings: opts.Settings,

     planner: newMeta2Planner(opts.DB, opts.Settings),
-    Metrics: Metrics{
+    metrics: Metrics{
       ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts),
       ReadProbeFailures: metric.NewCounter(metaReadProbeFailures),
       ReadProbeLatency:  metric.NewLatency(metaReadProbeLatency, opts.HistogramWindowInterval),
@@ -123,6 +122,11 @@ func NewProber(opts Opts) *Prober {
   }
 }

+// Metrics returns a struct which contains the kvprober metrics.
+func (p *Prober) Metrics() Metrics {
+  return p.metrics
+}
+
 // Start causes kvprober to start probing KV. Start returns immediately. Start
 // returns an error only if stopper.RunAsyncTask returns an error.
 func (p *Prober) Start(ctx context.Context, stopper *stop.Stopper) error {
@@ -170,12 +174,12 @@ func (p *Prober) probe(ctx context.Context, db dbGet) {
     return
   }

- p.Metrics.ProbePlanAttempts.Inc(1)
+ p.metrics.ProbePlanAttempts.Inc(1)

   step, err := p.planner.next(ctx)
   if err != nil {
     log.Health.Errorf(ctx, "can't make a plan: %v", err)
-    p.Metrics.ProbePlanFailures.Inc(1)
+    p.metrics.ProbePlanFailures.Inc(1)
     return
   }

@@ -186,7 +190,7 @@ func (p *Prober) probe(ctx context.Context, db dbGet) {
   // ProbePlanFailures. This would probably be a ticket alerting as
   // the impact is more low visibility into possible failures than a high
   // impact production issue.
- p.Metrics.ReadProbeAttempts.Inc(1)
+ p.metrics.ReadProbeAttempts.Inc(1)

   start := timeutil.Now()

@@ -204,15 +208,15 @@ func (p *Prober) probe(ctx context.Context, db dbGet) {
   if err != nil {
     // TODO(josh): Write structured events with log.Structured.
     log.Health.Errorf(ctx, "kv.Get(%s), r=%v failed with: %v", step.StartKey, step.RangeID, err)
-    p.Metrics.ReadProbeFailures.Inc(1)
+    p.metrics.ReadProbeFailures.Inc(1)
     return
   }

   d := timeutil.Since(start)
   log.Health.Infof(ctx, "kv.Get(%s), r=%v returned success in %v", step.StartKey, step.RangeID, d)

   // Latency of failures is not recorded. They are counted as failures tho.
- p.Metrics.ReadProbeLatency.RecordValue(d.Nanoseconds())
+ p.metrics.ReadProbeLatency.RecordValue(d.Nanoseconds())
 }

 // Returns a random duration pulled from the uniform distribution given below:
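A note on the Metrics struct referenced above: this diff shows only its fields (via the struct literal in NewProber) and the new Metrics() accessor, not the type definition itself, which lives elsewhere in pkg/kv/kvprober. A rough sketch of what it presumably looks like, with field types assumed from the constructors used above (metric.NewCounter, metric.NewLatency); unexporting the field and adding the accessor also keeps other packages from replacing the prober's metrics while still letting server.go hand them to the registry:

// Sketch only; field types are assumed from metric.NewCounter and
// metric.NewLatency, and the real definition is not part of this diff.
type Metrics struct {
  ReadProbeAttempts *metric.Counter
  ReadProbeFailures *metric.Counter
  ReadProbeLatency  *metric.Histogram
  ProbePlanAttempts *metric.Counter
  ProbePlanFailures *metric.Counter
}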
24 changes: 12 additions & 12 deletions pkg/kv/kvprober/kvprober_integration_test.go
@@ -53,8 +53,8 @@ func TestProberDoesReads(t *testing.T) {

     time.Sleep(100 * time.Millisecond)

-    require.Zero(t, p.Metrics.ProbePlanAttempts.Count())
-    require.Zero(t, p.Metrics.ReadProbeAttempts.Count())
+    require.Zero(t, p.Metrics().ProbePlanAttempts.Count())
+    require.Zero(t, p.Metrics().ReadProbeAttempts.Count())
   })

   t.Run("happy path", func(t *testing.T) {
@@ -67,13 +67,13 @@ func TestProberDoesReads(t *testing.T) {
     require.NoError(t, p.Start(ctx, s.Stopper()))

     testutils.SucceedsSoon(t, func() error {
-      if p.Metrics.ReadProbeAttempts.Count() < int64(50) {
-        return errors.Newf("probe count too low: %v", p.Metrics.ReadProbeAttempts.Count())
+      if p.Metrics().ReadProbeAttempts.Count() < int64(50) {
+        return errors.Newf("probe count too low: %v", p.Metrics().ReadProbeAttempts.Count())
       }
       return nil
     })
-    require.Zero(t, p.Metrics.ReadProbeFailures.Count())
-    require.Zero(t, p.Metrics.ProbePlanFailures.Count())
+    require.Zero(t, p.Metrics().ReadProbeFailures.Count())
+    require.Zero(t, p.Metrics().ProbePlanFailures.Count())
   })

   t.Run("a single range is unavailable", func(t *testing.T) {
@@ -101,12 +101,12 @@ func TestProberDoesReads(t *testing.T) {
     // TODO(josh): Once structured logging is in, can check that failures
     // involved only the time-series range.
     testutils.SucceedsSoon(t, func() error {
-      if p.Metrics.ReadProbeFailures.Count() < int64(2) {
-        return errors.Newf("error count too low: %v", p.Metrics.ReadProbeFailures.Count())
+      if p.Metrics().ReadProbeFailures.Count() < int64(2) {
+        return errors.Newf("error count too low: %v", p.Metrics().ReadProbeFailures.Count())
       }
       return nil
     })
-    require.Zero(t, p.Metrics.ProbePlanFailures.Count())
+    require.Zero(t, p.Metrics().ProbePlanFailures.Count())
   })

   t.Run("all ranges are unavailable for Gets", func(t *testing.T) {
@@ -148,9 +148,9 @@ func TestProberDoesReads(t *testing.T) {
     }

     // Expect all probes to fail but planning to succeed.
-    require.Equal(t, int64(10), p.Metrics.ReadProbeAttempts.Count())
-    require.Equal(t, int64(10), p.Metrics.ReadProbeFailures.Count())
-    require.Zero(t, p.Metrics.ProbePlanFailures.Count())
+    require.Equal(t, int64(10), p.Metrics().ReadProbeAttempts.Count())
+    require.Equal(t, int64(10), p.Metrics().ReadProbeFailures.Count())
+    require.Zero(t, p.Metrics().ProbePlanFailures.Count())
   })
 }

32 changes: 16 additions & 16 deletions pkg/kv/kvprober/kvprober_test.go
@@ -40,10 +40,10 @@ func TestProbe(t *testing.T) {

     p.probe(ctx, m)

-    require.Zero(t, p.Metrics.ProbePlanAttempts.Count())
-    require.Zero(t, p.Metrics.ReadProbeAttempts.Count())
-    require.Zero(t, p.Metrics.ProbePlanFailures.Count())
-    require.Zero(t, p.Metrics.ReadProbeFailures.Count())
+    require.Zero(t, p.Metrics().ProbePlanAttempts.Count())
+    require.Zero(t, p.Metrics().ReadProbeAttempts.Count())
+    require.Zero(t, p.Metrics().ProbePlanFailures.Count())
+    require.Zero(t, p.Metrics().ReadProbeFailures.Count())
   })

   t.Run("happy path", func(t *testing.T) {
@@ -53,10 +53,10 @@ func TestProbe(t *testing.T) {

     p.probe(ctx, m)

-    require.Equal(t, int64(1), p.Metrics.ProbePlanAttempts.Count())
-    require.Equal(t, int64(1), p.Metrics.ReadProbeAttempts.Count())
-    require.Zero(t, p.Metrics.ProbePlanFailures.Count())
-    require.Zero(t, p.Metrics.ReadProbeFailures.Count())
+    require.Equal(t, int64(1), p.Metrics().ProbePlanAttempts.Count())
+    require.Equal(t, int64(1), p.Metrics().ReadProbeAttempts.Count())
+    require.Zero(t, p.Metrics().ProbePlanFailures.Count())
+    require.Zero(t, p.Metrics().ReadProbeFailures.Count())
   })

   t.Run("planning fails", func(t *testing.T) {
@@ -70,10 +70,10 @@ func TestProbe(t *testing.T) {

     p.probe(ctx, m)

-    require.Equal(t, int64(1), p.Metrics.ProbePlanAttempts.Count())
-    require.Zero(t, p.Metrics.ReadProbeAttempts.Count())
-    require.Equal(t, int64(1), p.Metrics.ProbePlanFailures.Count())
-    require.Zero(t, p.Metrics.ReadProbeFailures.Count())
+    require.Equal(t, int64(1), p.Metrics().ProbePlanAttempts.Count())
+    require.Zero(t, p.Metrics().ReadProbeAttempts.Count())
+    require.Equal(t, int64(1), p.Metrics().ProbePlanFailures.Count())
+    require.Zero(t, p.Metrics().ReadProbeFailures.Count())
   })

   t.Run("get fails", func(t *testing.T) {
@@ -86,10 +86,10 @@ func TestProbe(t *testing.T) {

     p.probe(ctx, m)

-    require.Equal(t, int64(1), p.Metrics.ProbePlanAttempts.Count())
-    require.Equal(t, int64(1), p.Metrics.ReadProbeAttempts.Count())
-    require.Zero(t, p.Metrics.ProbePlanFailures.Count())
-    require.Equal(t, int64(1), p.Metrics.ReadProbeFailures.Count())
+    require.Equal(t, int64(1), p.Metrics().ProbePlanAttempts.Count())
+    require.Equal(t, int64(1), p.Metrics().ReadProbeAttempts.Count())
+    require.Zero(t, p.Metrics().ProbePlanFailures.Count())
+    require.Equal(t, int64(1), p.Metrics().ReadProbeFailures.Count())
   })
 }

1 change: 1 addition & 0 deletions pkg/server/server.go
@@ -652,6 +652,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
     Settings:                st,
     HistogramWindowInterval: cfg.HistogramWindowInterval(),
   })
+ registry.AddMetricStruct(kvProber.Metrics())

   sqlServer, err := newSQLServer(ctx, sqlServerArgs{
     sqlServerOptionalKVArgs: sqlServerOptionalKVArgs{
20 changes: 20 additions & 0 deletions pkg/ts/catalog/chart_catalog.go
@@ -575,6 +575,26 @@ var charts = []sectionDescription{
       },
     },
   },
+ {
+   Organization: [][]string{{KVTransactionLayer, "Prober"}}, Charts: []chartDescription{
+     {
+       Title: "Availability",
+       Metrics: []string{
+         "kv.prober.planning_attempts",
+         "kv.prober.planning_failures",
+         "kv.prober.read.attempts",
+         "kv.prober.read.failures",
+       },
+       AxisLabel: "Probes",
+     },
+     {
+       Title: "Latency",
+       Metrics: []string{
+         "kv.prober.read.latency",
+       },
+     },
+   },
+ },
 {
   Organization: [][]string{
     {KVTransactionLayer, "Clocks"},
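Each string in the catalog entries above must match the Name field of the metric.Metadata used to construct the corresponding metric in kvprober (the metaReadProbe* variables in the first diff). The pairing below is an inference from the naming, and the Help, Measurement, and Unit values are illustrative, not the prober's actual definitions:

// Assumed pairing: metaReadProbeAttempts (from the kvprober.go diff) with
// the catalog name "kv.prober.read.attempts"; non-Name fields are guesses.
var metaReadProbeAttempts = metric.Metadata{
  Name:        "kv.prober.read.attempts",
  Help:        "Number of point reads attempted by the kv prober",
  Measurement: "Probes",
  Unit:        metric.Unit_COUNT,
}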
