From 240d54910dcbf73720d97bc42d00825c337ace79 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Wed, 16 Aug 2023 13:54:30 +0100 Subject: [PATCH 1/2] rpc: fix missed gauge Unlink The peerMetrics.ConnectionInactive gauge is not unlinked from the aggregated metric. Correspondingly, when a connection with the same peerKey is re-created, the metric with this key is already registered, which causes a panic. This commit introduces the missed metric Unlink. Fixes #108499 Epic: CRDB-21710 Release note: none --- pkg/rpc/metrics.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/rpc/metrics.go b/pkg/rpc/metrics.go index 161bdd39b095..c0bd3045608a 100644 --- a/pkg/rpc/metrics.go +++ b/pkg/rpc/metrics.go @@ -147,6 +147,7 @@ type peerMetrics struct { // must be reset before removing, and there must not be any chance that // they're set again because even if they're unlinked from the parent, they // will continue to add to the parent! + // TODO(pavelkalinnikov): unit-test that release() unlinks all metrics. // 1 on first heartbeat success (via reportHealthy), reset after // runHeartbeatUntilFailure returns. @@ -216,6 +217,7 @@ func (pm *peerMetrics) release() { pm.ConnectionHealthy.Unlink() pm.ConnectionUnhealthy.Unlink() + pm.ConnectionInactive.Unlink() pm.ConnectionHealthyFor.Unlink() pm.ConnectionUnhealthyFor.Unlink() pm.ConnectionHeartbeats.Unlink() From 6f737d90e06697cf981c35da196b3257187e57e7 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Wed, 16 Aug 2023 14:46:50 +0100 Subject: [PATCH 2/2] rpc: test that all metrics are (de)registered Epic: none Release note: none --- pkg/rpc/BUILD.bazel | 2 ++ pkg/rpc/metrics.go | 3 +- pkg/rpc/metrics_test.go | 66 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 pkg/rpc/metrics_test.go diff --git a/pkg/rpc/BUILD.bazel b/pkg/rpc/BUILD.bazel index 951b4bcb366d..8b7137b41e33 100644 --- a/pkg/rpc/BUILD.bazel +++ b/pkg/rpc/BUILD.bazel @@ -112,6 +112,7 @@ go_test( "heartbeat_test.go", "helpers_test.go", "main_test.go", + "metrics_test.go", "peer_test.go", "snappy_test.go", "tls_test.go", @@ -159,6 +160,7 @@ go_test( "@com_github_cockroachdb_redact//:redact", "@com_github_gogo_status//:status", "@com_github_golang_mock//gomock", + "@com_github_prometheus_client_model//go", "@com_github_stretchr_testify//assert", "@com_github_stretchr_testify//require", "@io_etcd_go_raft_v3//raftpb", diff --git a/pkg/rpc/metrics.go b/pkg/rpc/metrics.go index c0bd3045608a..a26b1e781370 100644 --- a/pkg/rpc/metrics.go +++ b/pkg/rpc/metrics.go @@ -147,7 +147,8 @@ type peerMetrics struct { // must be reset before removing, and there must not be any chance that // they're set again because even if they're unlinked from the parent, they // will continue to add to the parent! - // TODO(pavelkalinnikov): unit-test that release() unlinks all metrics. + // + // See TestMetricsRelease. // 1 on first heartbeat success (via reportHealthy), reset after // runHeartbeatUntilFailure returns. diff --git a/pkg/rpc/metrics_test.go b/pkg/rpc/metrics_test.go new file mode 100644 index 000000000000..5d1c2dadc867 --- /dev/null +++ b/pkg/rpc/metrics_test.go @@ -0,0 +1,66 @@ +// Copyright 2023 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package rpc + +import ( + "reflect" + "testing" + + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + io_prometheus_client "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/require" +) + +// TestMetricsRelease verifies that peerMetrics.release() removes tracking for +// *all* the metrics from their parent aggregate metric. +func TestMetricsRelease(t *testing.T) { + defer leaktest.AfterTest(t)() + + // All metrics in aggmetric package satisfy this interface. The `Each` method + // can be used to scan all child metrics of the aggregated metric. We use it + // for counting children. + type eacher interface { + Each([]*io_prometheus_client.LabelPair, func(metric *io_prometheus_client.Metric)) + } + countChildren := func(metric eacher) (count int) { + metric.Each(nil /*labels*/, func(*io_prometheus_client.Metric) { + count++ + }) + return count + } + + verifyAllFields := func(m Metrics, wantChildren int) (metricFields int) { + r := reflect.ValueOf(m) + for i, n := 0, r.NumField(); i < n; i++ { + field := r.Field(i).Interface() + metric, ok := field.(eacher) + if !ok { // skip all non-metric fields + continue + } + metricFields++ + require.Equal(t, wantChildren, countChildren(metric), r.Type().Field(i).Name) + } + return metricFields + } + + m := makeMetrics() + // Verify that each metric doesn't have any children at first. Verify the + // number of metric fields, as a sanity check (to be modified if fields are + // added/deleted). + require.Equal(t, 8, verifyAllFields(m, 0)) + // Verify that a new peer's metrics all get registered. + k := peerKey{NodeID: 5, TargetAddr: "192.168.0.1:1234", Class: DefaultClass} + pm := m.acquire(k) + require.Equal(t, 8, verifyAllFields(m, 1)) + // Verify that all metrics are unlinked when the peer is released. + pm.release() + require.Equal(t, 8, verifyAllFields(m, 0)) +}