Skip to content

Commit

Permalink
kv: add telemetry for node liveness
Browse files Browse the repository at this point in the history
Exporting the existing metrics 'HeartbeatFailures' and 'EpochIncrements'
as telementry counters.

These telemetry values can be seen in cockroach demo by decommissioning
a node and then querying crdb_internal.feature_usage, and also in the
'Diagnostics Reporting Data' page (/_status/diagnostics/local ->
"featureUsage").

Issue #71662

Release note: None
  • Loading branch information
lidorcarmel committed Nov 20, 2021
1 parent 28bb1ea commit a86b686
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 7 deletions.
1 change: 1 addition & 0 deletions pkg/kv/kvserver/liveness/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ go_library(
"//pkg/kv",
"//pkg/kv/kvserver/liveness/livenesspb",
"//pkg/roachpb:with-mocks",
"//pkg/server/telemetry",
"//pkg/settings/cluster",
"//pkg/storage",
"//pkg/util/contextutil",
Expand Down
13 changes: 7 additions & 6 deletions pkg/kv/kvserver/liveness/liveness.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server/telemetry"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
Expand Down Expand Up @@ -139,8 +140,8 @@ type Metrics struct {
LiveNodes *metric.Gauge
HeartbeatsInFlight *metric.Gauge
HeartbeatSuccesses *metric.Counter
HeartbeatFailures *metric.Counter
EpochIncrements *metric.Counter
HeartbeatFailures telemetry.CounterWithMetric
EpochIncrements telemetry.CounterWithMetric
HeartbeatLatency *metric.Histogram
}

Expand Down Expand Up @@ -295,8 +296,8 @@ func NewNodeLiveness(opts NodeLivenessOptions) *NodeLiveness {
LiveNodes: metric.NewFunctionalGauge(metaLiveNodes, nl.numLiveNodes),
HeartbeatsInFlight: metric.NewGauge(metaHeartbeatsInFlight),
HeartbeatSuccesses: metric.NewCounter(metaHeartbeatSuccesses),
HeartbeatFailures: metric.NewCounter(metaHeartbeatFailures),
EpochIncrements: metric.NewCounter(metaEpochIncrements),
HeartbeatFailures: telemetry.NewCounterWithMetric(metaHeartbeatFailures),
EpochIncrements: telemetry.NewCounterWithMetric(metaEpochIncrements),
HeartbeatLatency: metric.NewLatency(metaHeartbeatLatency, opts.HistogramWindowInterval),
}
nl.mu.nodes = make(map[roachpb.NodeID]Record)
Expand Down Expand Up @@ -978,7 +979,7 @@ func (nl *NodeLiveness) heartbeatInternal(
nl.metrics.HeartbeatSuccesses.Inc(1)
return nil
}
nl.metrics.HeartbeatFailures.Inc(1)
nl.metrics.HeartbeatFailures.Inc()
return err
}

Expand Down Expand Up @@ -1211,7 +1212,7 @@ func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, liveness livenesspb.

log.Infof(ctx, "incremented n%d liveness epoch to %d", written.NodeID, written.Epoch)
nl.maybeUpdate(ctx, written)
nl.metrics.EpochIncrements.Inc(1)
nl.metrics.EpochIncrements.Inc()
return nil
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/node_liveness_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,7 @@ func TestNodeLivenessEpochIncrement(t *testing.T) {
})

// Verify epoch increment metric count.
if c := tc.Servers[0].NodeLiveness().(*liveness.NodeLiveness).Metrics().EpochIncrements.Count(); c != 1 {
if c := tc.Servers[0].NodeLiveness().(*liveness.NodeLiveness).Metrics().EpochIncrements.MetricValue(); c != 1 {
t.Errorf("expected epoch increment == 1; got %d", c)
}

Expand Down
1 change: 1 addition & 0 deletions pkg/server/telemetry/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ go_test(
srcs = ["features_test.go"],
deps = [
":telemetry",
"//pkg/util/metric",
"@com_github_stretchr_testify//require",
],
)
11 changes: 11 additions & 0 deletions pkg/server/telemetry/features.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,17 @@ func (c CounterWithMetric) Inc() {
c.metric.Inc(1)
}

// CounterValue returns the telemetry value. Note that this value can be
// different from MetricValue because telemetry may reset to zero occasionally.
func (c CounterWithMetric) CounterValue() int32 {
return Read(c.telemetry)
}

// MetricValue returns the value of the metric, not the telemetry.
func (c CounterWithMetric) MetricValue() int64 {
return c.metric.Count()
}

// Forward the metric.Iterable interface to the metric counter. We
// don't just embed the counter because our Inc() interface is a bit
// different.
Expand Down
13 changes: 13 additions & 0 deletions pkg/server/telemetry/features_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"testing"

"github.com/cockroachdb/cockroach/pkg/server/telemetry"
"github.com/cockroachdb/cockroach/pkg/util/metric"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -91,3 +92,15 @@ func TestBucket(t *testing.T) {
}
}
}

// TestCounterWithMetric verifies that only the telemetry is reset to zero when,
// for example, a report is created.
func TestCounterWithMetric(t *testing.T) {
cm := telemetry.NewCounterWithMetric(metric.Metadata{})
cm.Inc()
require.Equal(t, int32(1), cm.CounterValue())
require.Equal(t, int64(1), cm.MetricValue())
telemetry.GetFeatureCounts(telemetry.Raw, telemetry.ResetCounts)
require.Equal(t, int32(0), cm.CounterValue())
require.Equal(t, int64(1), cm.MetricValue())
}

0 comments on commit a86b686

Please sign in to comment.