61524: kvprober: add metrics to the registry & catalog r=tbg a=joshimhoff

cockroachdb#61074

**kvprober: add metrics to the registry & catalog**

This commit adds the kvprober metrics to the metric registry so that
they are exported at the Prometheus endpoint and tracked in CRDB's
time-series DB. It also adds the kvprober metrics to the chart catalog,
since a unit test requires registered metrics to appear there.

Release justification: Auxiliary system that is off by default.
Release note: None.

Co-authored-by: Josh Imhoff <[email protected]>
craig[bot] and joshimhoff committed Mar 5, 2021
2 parents a77bf44 + 0dd1e55 commit 05a7bec
Showing 5 changed files with 62 additions and 37 deletions.
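For orientation before the diffs: the registry picks up metrics by walking the exported fields of a struct handed to AddMetricStruct (the server.go hunk below does exactly that with the prober's struct). A minimal sketch of what that struct plausibly looks like, pieced together from the constructors and metric names visible in this commit — the Metadata contents and field types are inferred, not copied from the source:

```go
// Sketch only: metadata values here are illustrative guesses; the metric
// name matches the entry added to chart_catalog.go below.
var metaReadProbeAttempts = metric.Metadata{
	Name:        "kv.prober.read.attempts",
	Help:        "Read probes attempted by kvprober", // assumed help text
	Measurement: "Probes",
	Unit:        metric.Unit_COUNT,
}

// Metrics groups the prober's Prometheus metrics. Exported fields are what
// registry.AddMetricStruct registers.
type Metrics struct {
	ReadProbeAttempts *metric.Counter
	ReadProbeFailures *metric.Counter
	ReadProbeLatency  *metric.Histogram // from metric.NewLatency
	ProbePlanAttempts *metric.Counter
	ProbePlanFailures *metric.Counter
}
```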
22 changes: 13 additions & 9 deletions pkg/kv/kvprober/kvprober.go
@@ -39,10 +39,9 @@ type Prober struct {
settings *cluster.Settings
// planner is an interface for selecting a range to probe.
planner planner

// Metrics wraps up the set of prometheus metrics that the prober sets. The
// metrics wraps up the set of prometheus metrics that the prober sets; the
// goal of the prober IS to populate these metrics.
Metrics Metrics
metrics Metrics
}

// Opts provides knobs to control kvprober.Prober.
@@ -113,7 +112,7 @@ func NewProber(opts Opts) *Prober {
settings: opts.Settings,

planner: newMeta2Planner(opts.DB, opts.Settings),
Metrics: Metrics{
metrics: Metrics{
ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts),
ReadProbeFailures: metric.NewCounter(metaReadProbeFailures),
ReadProbeLatency: metric.NewLatency(metaReadProbeLatency, opts.HistogramWindowInterval),
@@ -123,6 +122,11 @@ func NewProber(opts Opts) *Prober {
}
}

// Metrics returns a struct which contains the kvprober metrics.
func (p *Prober) Metrics() Metrics {
return p.metrics
}

// Start causes kvprober to start probing KV. Start returns immediately. Start
// returns an error only if stopper.RunAsyncTask returns an error.
func (p *Prober) Start(ctx context.Context, stopper *stop.Stopper) error {
@@ -170,12 +174,12 @@ func (p *Prober) probe(ctx context.Context, db dbGet) {
return
}

p.Metrics.ProbePlanAttempts.Inc(1)
p.metrics.ProbePlanAttempts.Inc(1)

step, err := p.planner.next(ctx)
if err != nil {
log.Health.Errorf(ctx, "can't make a plan: %v", err)
p.Metrics.ProbePlanFailures.Inc(1)
p.metrics.ProbePlanFailures.Inc(1)
return
}

@@ -186,7 +190,7 @@ func (p *Prober) probe(ctx context.Context, db dbGet) {
// ProbePlanFailures. This would probably be a ticket alerting as
// the impact is more low visibility into possible failures than a high
// impact production issue.
p.Metrics.ReadProbeAttempts.Inc(1)
p.metrics.ReadProbeAttempts.Inc(1)

start := timeutil.Now()

@@ -204,15 +208,15 @@ func (p *Prober) probe(ctx context.Context, db dbGet) {
if err != nil {
// TODO(josh): Write structured events with log.Structured.
log.Health.Errorf(ctx, "kv.Get(%s), r=%v failed with: %v", step.StartKey, step.RangeID, err)
p.Metrics.ReadProbeFailures.Inc(1)
p.metrics.ReadProbeFailures.Inc(1)
return
}

d := timeutil.Since(start)
log.Health.Infof(ctx, "kv.Get(%s), r=%v returned success in %v", step.StartKey, step.RangeID, d)

// Latency of failures is not recorded. They are counted as failures tho.
p.Metrics.ReadProbeLatency.RecordValue(d.Nanoseconds())
p.metrics.ReadProbeLatency.RecordValue(d.Nanoseconds())
}

// Returns a random duration pulled from the uniform distribution given below:
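One design note on the kvprober.go changes above: the metrics field is now unexported and Metrics() returns the struct by value, so other packages can read the prober's counters but can no longer reassign the field. A hypothetical read-side caller, in the same spirit as the test updates below — the fields are pointers, so increments made inside the prober stay visible through the returned copy:

```go
// Hypothetical caller outside the kvprober package.
m := p.Metrics()
// Same counters the prober bumps via p.metrics; reading through the copy
// works because the Counter/Histogram fields are pointers.
log.Health.Infof(ctx, "kvprober: read attempts=%d, plan failures=%d",
	m.ReadProbeAttempts.Count(), m.ProbePlanFailures.Count())
```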
24 changes: 12 additions & 12 deletions pkg/kv/kvprober/kvprober_integration_test.go
@@ -53,8 +53,8 @@ func TestProberDoesReads(t *testing.T) {

time.Sleep(100 * time.Millisecond)

require.Zero(t, p.Metrics.ProbePlanAttempts.Count())
require.Zero(t, p.Metrics.ReadProbeAttempts.Count())
require.Zero(t, p.Metrics().ProbePlanAttempts.Count())
require.Zero(t, p.Metrics().ReadProbeAttempts.Count())
})

t.Run("happy path", func(t *testing.T) {
@@ -67,13 +67,13 @@
require.NoError(t, p.Start(ctx, s.Stopper()))

testutils.SucceedsSoon(t, func() error {
if p.Metrics.ReadProbeAttempts.Count() < int64(50) {
return errors.Newf("probe count too low: %v", p.Metrics.ReadProbeAttempts.Count())
if p.Metrics().ReadProbeAttempts.Count() < int64(50) {
return errors.Newf("probe count too low: %v", p.Metrics().ReadProbeAttempts.Count())
}
return nil
})
require.Zero(t, p.Metrics.ReadProbeFailures.Count())
require.Zero(t, p.Metrics.ProbePlanFailures.Count())
require.Zero(t, p.Metrics().ReadProbeFailures.Count())
require.Zero(t, p.Metrics().ProbePlanFailures.Count())
})

t.Run("a single range is unavailable", func(t *testing.T) {
@@ -101,12 +101,12 @@ func TestProberDoesReads(t *testing.T) {
// TODO(josh): Once structured logging is in, can check that failures
// involved only the time-series range.
testutils.SucceedsSoon(t, func() error {
if p.Metrics.ReadProbeFailures.Count() < int64(2) {
return errors.Newf("error count too low: %v", p.Metrics.ReadProbeFailures.Count())
if p.Metrics().ReadProbeFailures.Count() < int64(2) {
return errors.Newf("error count too low: %v", p.Metrics().ReadProbeFailures.Count())
}
return nil
})
require.Zero(t, p.Metrics.ProbePlanFailures.Count())
require.Zero(t, p.Metrics().ProbePlanFailures.Count())
})

t.Run("all ranges are unavailable for Gets", func(t *testing.T) {
@@ -148,9 +148,9 @@ func TestProberDoesReads(t *testing.T) {
}

// Expect all probes to fail but planning to succeed.
require.Equal(t, int64(10), p.Metrics.ReadProbeAttempts.Count())
require.Equal(t, int64(10), p.Metrics.ReadProbeFailures.Count())
require.Zero(t, p.Metrics.ProbePlanFailures.Count())
require.Equal(t, int64(10), p.Metrics().ReadProbeAttempts.Count())
require.Equal(t, int64(10), p.Metrics().ReadProbeFailures.Count())
require.Zero(t, p.Metrics().ProbePlanFailures.Count())
})
}

32 changes: 16 additions & 16 deletions pkg/kv/kvprober/kvprober_test.go
@@ -40,10 +40,10 @@ func TestProbe(t *testing.T) {

p.probe(ctx, m)

require.Zero(t, p.Metrics.ProbePlanAttempts.Count())
require.Zero(t, p.Metrics.ReadProbeAttempts.Count())
require.Zero(t, p.Metrics.ProbePlanFailures.Count())
require.Zero(t, p.Metrics.ReadProbeFailures.Count())
require.Zero(t, p.Metrics().ProbePlanAttempts.Count())
require.Zero(t, p.Metrics().ReadProbeAttempts.Count())
require.Zero(t, p.Metrics().ProbePlanFailures.Count())
require.Zero(t, p.Metrics().ReadProbeFailures.Count())
})

t.Run("happy path", func(t *testing.T) {
@@ -53,10 +53,10 @@

p.probe(ctx, m)

require.Equal(t, int64(1), p.Metrics.ProbePlanAttempts.Count())
require.Equal(t, int64(1), p.Metrics.ReadProbeAttempts.Count())
require.Zero(t, p.Metrics.ProbePlanFailures.Count())
require.Zero(t, p.Metrics.ReadProbeFailures.Count())
require.Equal(t, int64(1), p.Metrics().ProbePlanAttempts.Count())
require.Equal(t, int64(1), p.Metrics().ReadProbeAttempts.Count())
require.Zero(t, p.Metrics().ProbePlanFailures.Count())
require.Zero(t, p.Metrics().ReadProbeFailures.Count())
})

t.Run("planning fails", func(t *testing.T) {
@@ -70,10 +70,10 @@

p.probe(ctx, m)

require.Equal(t, int64(1), p.Metrics.ProbePlanAttempts.Count())
require.Zero(t, p.Metrics.ReadProbeAttempts.Count())
require.Equal(t, int64(1), p.Metrics.ProbePlanFailures.Count())
require.Zero(t, p.Metrics.ReadProbeFailures.Count())
require.Equal(t, int64(1), p.Metrics().ProbePlanAttempts.Count())
require.Zero(t, p.Metrics().ReadProbeAttempts.Count())
require.Equal(t, int64(1), p.Metrics().ProbePlanFailures.Count())
require.Zero(t, p.Metrics().ReadProbeFailures.Count())
})

t.Run("get fails", func(t *testing.T) {
@@ -86,10 +86,10 @@

p.probe(ctx, m)

require.Equal(t, int64(1), p.Metrics.ProbePlanAttempts.Count())
require.Equal(t, int64(1), p.Metrics.ReadProbeAttempts.Count())
require.Zero(t, p.Metrics.ProbePlanFailures.Count())
require.Equal(t, int64(1), p.Metrics.ReadProbeFailures.Count())
require.Equal(t, int64(1), p.Metrics().ProbePlanAttempts.Count())
require.Equal(t, int64(1), p.Metrics().ReadProbeAttempts.Count())
require.Zero(t, p.Metrics().ProbePlanFailures.Count())
require.Equal(t, int64(1), p.Metrics().ReadProbeFailures.Count())
})
}

1 change: 1 addition & 0 deletions pkg/server/server.go
@@ -652,6 +652,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
Settings: st,
HistogramWindowInterval: cfg.HistogramWindowInterval(),
})
registry.AddMetricStruct(kvProber.Metrics())

sqlServer, err := newSQLServer(ctx, sqlServerArgs{
sqlServerOptionalKVArgs: sqlServerOptionalKVArgs{
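Condensing the wiring this one-line hunk completes — a hedged sketch of the full registration path; the Opts fields are the ones visible in the kvprober.go and server.go hunks, and the standalone registry stands in for the server's existing one:

```go
// Once the struct is added to the registry, its exported fields are served
// at the Prometheus endpoint and scraped into CRDB's internal time-series
// DB alongside every other registered metric.
kvProber := kvprober.NewProber(kvprober.Opts{
	DB:                      db, // *kv.DB handle; assumed from the elided part of the hunk
	Settings:                st,
	HistogramWindowInterval: cfg.HistogramWindowInterval(),
})
registry := metric.NewRegistry() // stand-in for the server's registry
registry.AddMetricStruct(kvProber.Metrics())
```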
20 changes: 20 additions & 0 deletions pkg/ts/catalog/chart_catalog.go
@@ -575,6 +575,26 @@ var charts = []sectionDescription{
},
},
},
{
Organization: [][]string{{KVTransactionLayer, "Prober"}},
Charts: []chartDescription{
{
Title: "Availability",
Metrics: []string{
"kv.prober.planning_attempts",
"kv.prober.planning_failures",
"kv.prober.read.attempts",
"kv.prober.read.failures",
},
AxisLabel: "Probes",
},
{
Title: "Latency",
Metrics: []string{
"kv.prober.read.latency",
},
},
},
},
{
Organization: [][]string{
{KVTransactionLayer, "Clocks"},
