From 0b7ba562c604d69ed048d86445ba0f27498982cb Mon Sep 17 00:00:00 2001 From: Lidor Carmel Date: Fri, 19 Nov 2021 10:31:07 -0800 Subject: [PATCH] kv: add telemetry for node liveness Exporting the existing metrics 'HeartbeatFailures' and 'EpochIncrements' as telementry counters. These telemetry values can be seen in cockroach demo by decommissioning a node and then querying crdb_internal.feature_usage, and also in the 'Diagnostics Reporting Data' page (/_status/diagnostics/local -> "featureUsage"). Issue #71662 Release note: None --- pkg/kv/kvserver/liveness/BUILD.bazel | 1 + pkg/kv/kvserver/liveness/liveness.go | 13 +++++++------ pkg/server/telemetry/BUILD.bazel | 1 + pkg/server/telemetry/features.go | 7 +++++++ pkg/server/telemetry/features_test.go | 21 +++++++++++++++++++++ 5 files changed, 37 insertions(+), 6 deletions(-) diff --git a/pkg/kv/kvserver/liveness/BUILD.bazel b/pkg/kv/kvserver/liveness/BUILD.bazel index 92e2a790a5b5..1fa6121d1d69 100644 --- a/pkg/kv/kvserver/liveness/BUILD.bazel +++ b/pkg/kv/kvserver/liveness/BUILD.bazel @@ -12,6 +12,7 @@ go_library( "//pkg/kv", "//pkg/kv/kvserver/liveness/livenesspb", "//pkg/roachpb:with-mocks", + "//pkg/server/telemetry", "//pkg/settings/cluster", "//pkg/storage", "//pkg/util/contextutil", diff --git a/pkg/kv/kvserver/liveness/liveness.go b/pkg/kv/kvserver/liveness/liveness.go index db755c8cb137..08226bb9b97d 100644 --- a/pkg/kv/kvserver/liveness/liveness.go +++ b/pkg/kv/kvserver/liveness/liveness.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb" "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/server/telemetry" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/util/contextutil" @@ -139,8 +140,8 @@ type Metrics struct { LiveNodes *metric.Gauge HeartbeatsInFlight *metric.Gauge HeartbeatSuccesses *metric.Counter - HeartbeatFailures *metric.Counter - EpochIncrements *metric.Counter + HeartbeatFailures telemetry.CounterWithMetric + EpochIncrements telemetry.CounterWithMetric HeartbeatLatency *metric.Histogram } @@ -295,8 +296,8 @@ func NewNodeLiveness(opts NodeLivenessOptions) *NodeLiveness { LiveNodes: metric.NewFunctionalGauge(metaLiveNodes, nl.numLiveNodes), HeartbeatsInFlight: metric.NewGauge(metaHeartbeatsInFlight), HeartbeatSuccesses: metric.NewCounter(metaHeartbeatSuccesses), - HeartbeatFailures: metric.NewCounter(metaHeartbeatFailures), - EpochIncrements: metric.NewCounter(metaEpochIncrements), + HeartbeatFailures: telemetry.NewCounterWithMetric(metaHeartbeatFailures), + EpochIncrements: telemetry.NewCounterWithMetric(metaEpochIncrements), HeartbeatLatency: metric.NewLatency(metaHeartbeatLatency, opts.HistogramWindowInterval), } nl.mu.nodes = make(map[roachpb.NodeID]Record) @@ -978,7 +979,7 @@ func (nl *NodeLiveness) heartbeatInternal( nl.metrics.HeartbeatSuccesses.Inc(1) return nil } - nl.metrics.HeartbeatFailures.Inc(1) + nl.metrics.HeartbeatFailures.Inc() return err } @@ -1211,7 +1212,7 @@ func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, liveness livenesspb. log.Infof(ctx, "incremented n%d liveness epoch to %d", written.NodeID, written.Epoch) nl.maybeUpdate(ctx, written) - nl.metrics.EpochIncrements.Inc(1) + nl.metrics.EpochIncrements.Inc() return nil } diff --git a/pkg/server/telemetry/BUILD.bazel b/pkg/server/telemetry/BUILD.bazel index ba02d72a9546..77474e81219d 100644 --- a/pkg/server/telemetry/BUILD.bazel +++ b/pkg/server/telemetry/BUILD.bazel @@ -23,6 +23,7 @@ go_test( srcs = ["features_test.go"], deps = [ ":telemetry", + "//pkg/util/metric", "@com_github_stretchr_testify//require", ], ) diff --git a/pkg/server/telemetry/features.go b/pkg/server/telemetry/features.go index ce704965c902..d1963a339673 100644 --- a/pkg/server/telemetry/features.go +++ b/pkg/server/telemetry/features.go @@ -128,6 +128,13 @@ func (c CounterWithMetric) Inc() { c.metric.Inc(1) } +// Count returns the value of the metric, not the telemetry. Note that the +// telemetry value may reset to zero when, for example, GetFeatureCounts() is +// called with ResetCounts to generate a report. +func (c CounterWithMetric) Count() int64 { + return c.metric.Count() +} + // Forward the metric.Iterable interface to the metric counter. We // don't just embed the counter because our Inc() interface is a bit // different. diff --git a/pkg/server/telemetry/features_test.go b/pkg/server/telemetry/features_test.go index fe0f12787880..77d83a1bb20c 100644 --- a/pkg/server/telemetry/features_test.go +++ b/pkg/server/telemetry/features_test.go @@ -16,6 +16,7 @@ import ( "testing" "github.com/cockroachdb/cockroach/pkg/server/telemetry" + "github.com/cockroachdb/cockroach/pkg/util/metric" "github.com/stretchr/testify/require" ) @@ -91,3 +92,23 @@ func TestBucket(t *testing.T) { } } } + +// TestCounterWithMetric verifies that only the telemetry is reset to zero when, +// for example, a report is created. +func TestCounterWithMetric(t *testing.T) { + cm := telemetry.NewCounterWithMetric(metric.Metadata{Name: "test-metric"}) + cm.Inc() + + // Using GetFeatureCounts to read the telemetry value. + m1 := telemetry.GetFeatureCounts(telemetry.Raw, telemetry.ReadOnly) + require.Equal(t, int32(1), m1["test-metric"]) + require.Equal(t, int64(1), cm.Count()) + + // Reset the telemetry. + telemetry.GetFeatureCounts(telemetry.Raw, telemetry.ResetCounts) + + // Verify only the telemetry is back to 0. + m2 := telemetry.GetFeatureCounts(telemetry.Raw, telemetry.ReadOnly) + require.Equal(t, int32(0), m2["test-metric"]) + require.Equal(t, int64(1), cm.Count()) +}