diff --git a/pkg/kv/kvprober/kvprober.go b/pkg/kv/kvprober/kvprober.go
index c8a126d22939..c1a5bbf3a577 100644
--- a/pkg/kv/kvprober/kvprober.go
+++ b/pkg/kv/kvprober/kvprober.go
@@ -39,10 +39,9 @@ type Prober struct {
 	settings *cluster.Settings
 	// planner is an interface for selecting a range to probe.
 	planner planner
-
-	// Metrics wraps up the set of prometheus metrics that the prober sets. The
+	// metrics wraps up the set of prometheus metrics that the prober sets; the
 	// goal of the prober IS to populate these metrics.
-	Metrics Metrics
+	metrics Metrics
 }
 
 // Opts provides knobs to control kvprober.Prober.
@@ -113,7 +112,7 @@ func NewProber(opts Opts) *Prober {
 		settings: opts.Settings,
 		planner:  newMeta2Planner(opts.DB, opts.Settings),
 
-		Metrics: Metrics{
+		metrics: Metrics{
 			ReadProbeAttempts: metric.NewCounter(metaReadProbeAttempts),
 			ReadProbeFailures: metric.NewCounter(metaReadProbeFailures),
 			ReadProbeLatency:  metric.NewLatency(metaReadProbeLatency, opts.HistogramWindowInterval),
@@ -123,6 +122,11 @@ func NewProber(opts Opts) *Prober {
 	}
 }
 
+// Metrics returns a struct which contains the kvprober metrics.
+func (p *Prober) Metrics() Metrics {
+	return p.metrics
+}
+
 // Start causes kvprober to start probing KV. Start returns immediately. Start
 // returns an error only if stopper.RunAsyncTask returns an error.
 func (p *Prober) Start(ctx context.Context, stopper *stop.Stopper) error {
@@ -170,12 +174,12 @@ func (p *Prober) probe(ctx context.Context, db dbGet) {
 		return
 	}
 
-	p.Metrics.ProbePlanAttempts.Inc(1)
+	p.metrics.ProbePlanAttempts.Inc(1)
 
 	step, err := p.planner.next(ctx)
 	if err != nil {
 		log.Health.Errorf(ctx, "can't make a plan: %v", err)
-		p.Metrics.ProbePlanFailures.Inc(1)
+		p.metrics.ProbePlanFailures.Inc(1)
 		return
 	}
 
@@ -186,7 +190,7 @@ func (p *Prober) probe(ctx context.Context, db dbGet) {
 	// ProbePlanFailures. This would probably be a ticket alerting as
 	// the impact is more low visibility into possible failures than a high
 	// impact production issue.
-	p.Metrics.ReadProbeAttempts.Inc(1)
+	p.metrics.ReadProbeAttempts.Inc(1)
 
 	start := timeutil.Now()
 
@@ -204,7 +208,7 @@ func (p *Prober) probe(ctx context.Context, db dbGet) {
 	if err != nil {
 		// TODO(josh): Write structured events with log.Structured.
 		log.Health.Errorf(ctx, "kv.Get(%s), r=%v failed with: %v", step.StartKey, step.RangeID, err)
-		p.Metrics.ReadProbeFailures.Inc(1)
+		p.metrics.ReadProbeFailures.Inc(1)
 		return
 	}
 
@@ -212,7 +216,7 @@ func (p *Prober) probe(ctx context.Context, db dbGet) {
 	log.Health.Infof(ctx, "kv.Get(%s), r=%v returned success in %v", step.StartKey, step.RangeID, d)
 
 	// Latency of failures is not recorded. They are counted as failures tho.
-	p.Metrics.ReadProbeLatency.RecordValue(d.Nanoseconds())
+	p.metrics.ReadProbeLatency.RecordValue(d.Nanoseconds())
 }
 
 // Returns a random duration pulled from the uniform distribution given below:
diff --git a/pkg/kv/kvprober/kvprober_integration_test.go b/pkg/kv/kvprober/kvprober_integration_test.go
index 4341fd24216e..433da035ba41 100644
--- a/pkg/kv/kvprober/kvprober_integration_test.go
+++ b/pkg/kv/kvprober/kvprober_integration_test.go
@@ -53,8 +53,8 @@ func TestProberDoesReads(t *testing.T) {
 
 		time.Sleep(100 * time.Millisecond)
 
-		require.Zero(t, p.Metrics.ProbePlanAttempts.Count())
-		require.Zero(t, p.Metrics.ReadProbeAttempts.Count())
+		require.Zero(t, p.Metrics().ProbePlanAttempts.Count())
+		require.Zero(t, p.Metrics().ReadProbeAttempts.Count())
 	})
 
 	t.Run("happy path", func(t *testing.T) {
@@ -67,13 +67,13 @@ func TestProberDoesReads(t *testing.T) {
 		require.NoError(t, p.Start(ctx, s.Stopper()))
 
 		testutils.SucceedsSoon(t, func() error {
-			if p.Metrics.ReadProbeAttempts.Count() < int64(50) {
-				return errors.Newf("probe count too low: %v", p.Metrics.ReadProbeAttempts.Count())
+			if p.Metrics().ReadProbeAttempts.Count() < int64(50) {
+				return errors.Newf("probe count too low: %v", p.Metrics().ReadProbeAttempts.Count())
 			}
 			return nil
 		})
-		require.Zero(t, p.Metrics.ReadProbeFailures.Count())
-		require.Zero(t, p.Metrics.ProbePlanFailures.Count())
+		require.Zero(t, p.Metrics().ReadProbeFailures.Count())
+		require.Zero(t, p.Metrics().ProbePlanFailures.Count())
 	})
 
 	t.Run("a single range is unavailable", func(t *testing.T) {
@@ -101,12 +101,12 @@ func TestProberDoesReads(t *testing.T) {
 		// TODO(josh): Once structured logging is in, can check that failures
 		// involved only the time-series range.
 		testutils.SucceedsSoon(t, func() error {
-			if p.Metrics.ReadProbeFailures.Count() < int64(2) {
-				return errors.Newf("error count too low: %v", p.Metrics.ReadProbeFailures.Count())
+			if p.Metrics().ReadProbeFailures.Count() < int64(2) {
+				return errors.Newf("error count too low: %v", p.Metrics().ReadProbeFailures.Count())
 			}
 			return nil
 		})
-		require.Zero(t, p.Metrics.ProbePlanFailures.Count())
+		require.Zero(t, p.Metrics().ProbePlanFailures.Count())
 	})
 
 	t.Run("all ranges are unavailable for Gets", func(t *testing.T) {
@@ -148,9 +148,9 @@ func TestProberDoesReads(t *testing.T) {
 		}
 
 		// Expect all probes to fail but planning to succeed.
-		require.Equal(t, int64(10), p.Metrics.ReadProbeAttempts.Count())
-		require.Equal(t, int64(10), p.Metrics.ReadProbeFailures.Count())
-		require.Zero(t, p.Metrics.ProbePlanFailures.Count())
+		require.Equal(t, int64(10), p.Metrics().ReadProbeAttempts.Count())
+		require.Equal(t, int64(10), p.Metrics().ReadProbeFailures.Count())
+		require.Zero(t, p.Metrics().ProbePlanFailures.Count())
 	})
 }
 
diff --git a/pkg/kv/kvprober/kvprober_test.go b/pkg/kv/kvprober/kvprober_test.go
index 1f79691b5634..fbcfc8ed9766 100644
--- a/pkg/kv/kvprober/kvprober_test.go
+++ b/pkg/kv/kvprober/kvprober_test.go
@@ -40,10 +40,10 @@ func TestProbe(t *testing.T) {
 
 		p.probe(ctx, m)
 
-		require.Zero(t, p.Metrics.ProbePlanAttempts.Count())
-		require.Zero(t, p.Metrics.ReadProbeAttempts.Count())
-		require.Zero(t, p.Metrics.ProbePlanFailures.Count())
-		require.Zero(t, p.Metrics.ReadProbeFailures.Count())
+		require.Zero(t, p.Metrics().ProbePlanAttempts.Count())
+		require.Zero(t, p.Metrics().ReadProbeAttempts.Count())
+		require.Zero(t, p.Metrics().ProbePlanFailures.Count())
+		require.Zero(t, p.Metrics().ReadProbeFailures.Count())
 	})
 
 	t.Run("happy path", func(t *testing.T) {
@@ -53,10 +53,10 @@ func TestProbe(t *testing.T) {
 
 		p.probe(ctx, m)
 
-		require.Equal(t, int64(1), p.Metrics.ProbePlanAttempts.Count())
-		require.Equal(t, int64(1), p.Metrics.ReadProbeAttempts.Count())
-		require.Zero(t, p.Metrics.ProbePlanFailures.Count())
-		require.Zero(t, p.Metrics.ReadProbeFailures.Count())
+		require.Equal(t, int64(1), p.Metrics().ProbePlanAttempts.Count())
+		require.Equal(t, int64(1), p.Metrics().ReadProbeAttempts.Count())
+		require.Zero(t, p.Metrics().ProbePlanFailures.Count())
+		require.Zero(t, p.Metrics().ReadProbeFailures.Count())
 	})
 
 	t.Run("planning fails", func(t *testing.T) {
@@ -70,10 +70,10 @@ func TestProbe(t *testing.T) {
 
 		p.probe(ctx, m)
 
-		require.Equal(t, int64(1), p.Metrics.ProbePlanAttempts.Count())
-		require.Zero(t, p.Metrics.ReadProbeAttempts.Count())
-		require.Equal(t, int64(1), p.Metrics.ProbePlanFailures.Count())
-		require.Zero(t, p.Metrics.ReadProbeFailures.Count())
+		require.Equal(t, int64(1), p.Metrics().ProbePlanAttempts.Count())
+		require.Zero(t, p.Metrics().ReadProbeAttempts.Count())
+		require.Equal(t, int64(1), p.Metrics().ProbePlanFailures.Count())
+		require.Zero(t, p.Metrics().ReadProbeFailures.Count())
 	})
 
 	t.Run("get fails", func(t *testing.T) {
@@ -86,10 +86,10 @@ func TestProbe(t *testing.T) {
 
 		p.probe(ctx, m)
 
-		require.Equal(t, int64(1), p.Metrics.ProbePlanAttempts.Count())
-		require.Equal(t, int64(1), p.Metrics.ReadProbeAttempts.Count())
-		require.Zero(t, p.Metrics.ProbePlanFailures.Count())
-		require.Equal(t, int64(1), p.Metrics.ReadProbeFailures.Count())
+		require.Equal(t, int64(1), p.Metrics().ProbePlanAttempts.Count())
+		require.Equal(t, int64(1), p.Metrics().ReadProbeAttempts.Count())
+		require.Zero(t, p.Metrics().ProbePlanFailures.Count())
+		require.Equal(t, int64(1), p.Metrics().ReadProbeFailures.Count())
 	})
 }
 
diff --git a/pkg/server/server.go b/pkg/server/server.go
index 87fcc0f5c9a8..449b45712201 100644
--- a/pkg/server/server.go
+++ b/pkg/server/server.go
@@ -652,6 +652,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
 		Settings:                st,
 		HistogramWindowInterval: cfg.HistogramWindowInterval(),
 	})
+	registry.AddMetricStruct(kvProber.Metrics())
 
 	sqlServer, err := newSQLServer(ctx, sqlServerArgs{
 		sqlServerOptionalKVArgs: sqlServerOptionalKVArgs{
diff --git a/pkg/ts/catalog/chart_catalog.go b/pkg/ts/catalog/chart_catalog.go
index 6fd3baacec13..fddaf694b964 100644
--- a/pkg/ts/catalog/chart_catalog.go
+++ b/pkg/ts/catalog/chart_catalog.go
@@ -575,6 +575,26 @@ var charts = []sectionDescription{
 			},
 		},
 	},
+	{
+		Organization: [][]string{{KVTransactionLayer, "Prober"}}, Charts: []chartDescription{
+			{
+				Title: "Availability",
+				Metrics: []string{
+					"kv.prober.planning_attempts",
+					"kv.prober.planning_failures",
+					"kv.prober.read.attempts",
+					"kv.prober.read.failures",
+				},
+				AxisLabel: "Probes",
+			},
+			{
+				Title: "Latency",
+				Metrics: []string{
+					"kv.prober.read.latency",
+				},
+			},
+		},
+	},
 	{
 		Organization: [][]string{
 			{KVTransactionLayer, "Clocks"},
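
Note: the Metrics struct definition itself is not touched by this diff. For context, a minimal sketch of the shape implied by the fields NewProber initializes is below; the concrete pointer types (*metric.Counter, *metric.Histogram) are assumptions inferred from the metric.NewCounter and metric.NewLatency constructors used above, not something the diff confirms.

package kvprober

import "github.com/cockroachdb/cockroach/pkg/util/metric"

// Metrics groups the prober's prometheus metrics. This is a sketch inferred
// from the NewProber initializer in the diff; the exact field types in the
// real definition may differ.
type Metrics struct {
	ReadProbeAttempts *metric.Counter
	ReadProbeFailures *metric.Counter
	ReadProbeLatency  *metric.Histogram
	ProbePlanAttempts *metric.Counter
	ProbePlanFailures *metric.Counter
}

Because these fields stay exported while the Prober field becomes unexported, registry.AddMetricStruct(kvProber.Metrics()) in server.go can still walk the struct and register every metric through the new accessor.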