From 2353a6a25a9750188a3a02dfee067fe2601b4278 Mon Sep 17 00:00:00 2001 From: David Hartunian Date: Tue, 14 Mar 2023 18:19:03 -0400 Subject: [PATCH] server: add `node_id` label to _status/vars output Previously, the output of the Prometheus metrics via `_status/vars` did not include any node labels. This caused challenges for customers who want to monitor large clusters, as it requires additional configuration on the scrape side to ensure a node ID is added to the metrics. This can be challenging to deal with when nodes come and go in a cluster and the scrape configuration must change as well. This change adds a `node_id` Prometheus label to the metrics we output that matches the current node's ID. Since `_status/vars` is output from a single node, there is only ever one single value that's appropriate here. Secondary tenants will mark their metrics with either the nodeID of the shared-process system tenant, or the instanceID of the tenant process. Resolves: #94763 Epic: None Release note (ops change): Prometheus metrics available at the `_status/vars` path now contain a `node_id` label that identifies the node they were scraped from. 
--- pkg/server/node_http_router_test.go | 10 ++++----- pkg/server/status/recorder.go | 6 ++++++ pkg/server/status/recorder_test.go | 32 ++++++++++++++--------------- pkg/server/status_test.go | 20 +++++++++--------- 4 files changed, 37 insertions(+), 31 deletions(-) diff --git a/pkg/server/node_http_router_test.go b/pkg/server/node_http_router_test.go index fc771ee82099..d45d681c51ce 100644 --- a/pkg/server/node_http_router_test.go +++ b/pkg/server/node_http_router_test.go @@ -53,7 +53,7 @@ func TestRouteToNode(t *testing.T) { sourceServerID: 1, nodeIDRequestedInCookie: "local", expectStatusCode: 200, - expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2"}`), + expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2",node_id="2"}`), }, { name: "remote _status/vars on node 2 from node 1 using cookie", @@ -61,7 +61,7 @@ func TestRouteToNode(t *testing.T) { sourceServerID: 0, nodeIDRequestedInCookie: "2", expectStatusCode: 200, - expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2"}`), + expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2",node_id="2"}`), }, { name: "remote _status/vars on node 1 from node 2 using cookie", @@ -69,7 +69,7 @@ func TestRouteToNode(t *testing.T) { sourceServerID: 1, nodeIDRequestedInCookie: "1", expectStatusCode: 200, - expectRegex: regexp.MustCompile(`ranges_underreplicated{store="1"}`), + expectRegex: regexp.MustCompile(`ranges_underreplicated{store="1",node_id="1"}`), }, { name: "remote _status/vars on node 2 from node 1 using query param", @@ -77,7 +77,7 @@ func TestRouteToNode(t *testing.T) { sourceServerID: 0, nodeIDRequestedInQueryParam: "2", expectStatusCode: 200, - expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2"}`), + expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2",node_id="2"}`), }, { name: "query param overrides cookie", @@ -86,7 +86,7 @@ func TestRouteToNode(t *testing.T) { nodeIDRequestedInCookie: "local", nodeIDRequestedInQueryParam: "2", 
expectStatusCode: 200, - expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2"}`), + expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2",node_id="2"}`), }, { name: "remote / root HTML on node 2 from node 1 using cookie", diff --git a/pkg/server/status/recorder.go b/pkg/server/status/recorder.go index 72cf79727044..7874aef443b4 100644 --- a/pkg/server/status/recorder.go +++ b/pkg/server/status/recorder.go @@ -227,6 +227,12 @@ func (mr *MetricsRecorder) AddNode( nodeIDGauge.Update(int64(desc.NodeID)) reg.AddMetric(nodeIDGauge) reg.AddLabel("tenant", mr.tenantNameContainer) + reg.AddLabel("node_id", strconv.Itoa(int(desc.NodeID))) + // We assume that all stores have been added to the registry + // prior to calling `AddNode`. + for _, s := range mr.mu.storeRegistries { + s.AddLabel("node_id", strconv.Itoa(int(desc.NodeID))) + } } // AddStore adds the Registry from the provided store as a store-level registry diff --git a/pkg/server/status/recorder_test.go b/pkg/server/status/recorder_test.go index c9bbc7a77a71..af4b2bd10eb0 100644 --- a/pkg/server/status/recorder_test.go +++ b/pkg/server/status/recorder_test.go @@ -99,10 +99,10 @@ func (fs fakeStore) Registry() *metric.Registry { return fs.registry } -func TestMetricsRecorderTenants(t *testing.T) { +func TestMetricsRecorderLabels(t *testing.T) { defer leaktest.AfterTest(t)() nodeDesc := roachpb.NodeDescriptor{ - NodeID: roachpb.NodeID(1), + NodeID: roachpb.NodeID(7), } reg1 := metric.NewRegistry() manual := timeutil.NewManualTime(timeutil.Unix(0, 100)) @@ -118,7 +118,7 @@ func TestMetricsRecorderTenants(t *testing.T) { recorder.AddNode(reg1, nodeDesc, 50, "foo:26257", "foo:26258", "foo:5432") nodeDescTenant := roachpb.NodeDescriptor{ - NodeID: roachpb.NodeID(1), + NodeID: roachpb.NodeID(7), } regTenant := metric.NewRegistry() stTenant := cluster.MakeTestingClusterSettings() @@ -154,15 +154,15 @@ func TestMetricsRecorderTenants(t *testing.T) { err = recorder.PrintAsText(buf) require.NoError(t, 
err) - require.Contains(t, buf.String(), `some_metric{tenant="system"} 123`) - require.Contains(t, buf.String(), `some_metric{tenant="application"} 456`) + require.Contains(t, buf.String(), `some_metric{tenant="system",node_id="7"} 123`) + require.Contains(t, buf.String(), `some_metric{tenant="application",node_id="7"} 456`) bufTenant := bytes.NewBuffer([]byte{}) err = recorderTenant.PrintAsText(bufTenant) require.NoError(t, err) - require.NotContains(t, bufTenant.String(), `some_metric{tenant="system"} 123`) - require.Contains(t, bufTenant.String(), `some_metric{tenant="application"} 456`) + require.NotContains(t, bufTenant.String(), `some_metric{tenant="system",node_id="7"} 123`) + require.Contains(t, bufTenant.String(), `some_metric{tenant="application",node_id="7"} 456`) // Update app name in container and ensure // output changes accordingly. @@ -172,15 +172,15 @@ func TestMetricsRecorderTenants(t *testing.T) { err = recorder.PrintAsText(buf) require.NoError(t, err) - require.Contains(t, buf.String(), `some_metric{tenant="system"} 123`) - require.Contains(t, buf.String(), `some_metric{tenant="application2"} 456`) + require.Contains(t, buf.String(), `some_metric{tenant="system",node_id="7"} 123`) + require.Contains(t, buf.String(), `some_metric{tenant="application2",node_id="7"} 456`) bufTenant = bytes.NewBuffer([]byte{}) err = recorderTenant.PrintAsText(bufTenant) require.NoError(t, err) - require.NotContains(t, bufTenant.String(), `some_metric{tenant="system"} 123`) - require.Contains(t, bufTenant.String(), `some_metric{tenant="application2"} 456`) + require.NotContains(t, bufTenant.String(), `some_metric{tenant="system",node_id="7"} 123`) + require.Contains(t, bufTenant.String(), `some_metric{tenant="application2",node_id="7"} 456`) // ======================================== // Verify that the recorder processes tenant time series registries @@ -190,17 +190,17 @@ func TestMetricsRecorderTenants(t *testing.T) { // System tenant metrics { Name: 
"cr.node.node-id", - Source: "1", + Source: "7", Datapoints: []tspb.TimeSeriesDatapoint{ { TimestampNanos: manual.Now().UnixNano(), - Value: float64(1), + Value: float64(7), }, }, }, { Name: "cr.node.some_metric", - Source: "1", + Source: "7", Datapoints: []tspb.TimeSeriesDatapoint{ { TimestampNanos: manual.Now().UnixNano(), @@ -211,7 +211,7 @@ func TestMetricsRecorderTenants(t *testing.T) { // App tenant metrics { Name: "cr.node.node-id", - Source: "1-123", + Source: "7-123", Datapoints: []tspb.TimeSeriesDatapoint{ { TimestampNanos: manual.Now().UnixNano(), @@ -221,7 +221,7 @@ func TestMetricsRecorderTenants(t *testing.T) { }, { Name: "cr.node.some_metric", - Source: "1-123", + Source: "7-123", Datapoints: []tspb.TimeSeriesDatapoint{ { TimestampNanos: manual.Now().UnixNano(), diff --git a/pkg/server/status_test.go b/pkg/server/status_test.go index a3d52af4b6ac..1ed158db2f17 100644 --- a/pkg/server/status_test.go +++ b/pkg/server/status_test.go @@ -1339,20 +1339,20 @@ func TestStatusVarsTxnMetrics(t *testing.T) { if err != nil { t.Fatal(err) } - if !bytes.Contains(body, []byte("sql_txn_begin_count{tenant=\"system\"} 1")) { - t.Errorf("expected `sql_txn_begin_count{tenant=\"system\"} 1`, got: %s", body) + if !bytes.Contains(body, []byte("sql_txn_begin_count{tenant=\"system\",node_id=\"1\"} 1")) { + t.Errorf("expected `sql_txn_begin_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body) } - if !bytes.Contains(body, []byte("sql_restart_savepoint_count{tenant=\"system\"} 1")) { - t.Errorf("expected `sql_restart_savepoint_count{tenant=\"system\"} 1`, got: %s", body) + if !bytes.Contains(body, []byte("sql_restart_savepoint_count{tenant=\"system\",node_id=\"1\"} 1")) { + t.Errorf("expected `sql_restart_savepoint_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body) } - if !bytes.Contains(body, []byte("sql_restart_savepoint_release_count{tenant=\"system\"} 1")) { - t.Errorf("expected `sql_restart_savepoint_release_count{tenant=\"system\"} 1`, got: %s", body) + if 
!bytes.Contains(body, []byte("sql_restart_savepoint_release_count{tenant=\"system\",node_id=\"1\"} 1")) { + t.Errorf("expected `sql_restart_savepoint_release_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body) } - if !bytes.Contains(body, []byte("sql_txn_commit_count{tenant=\"system\"} 1")) { - t.Errorf("expected `sql_txn_commit_count{tenant=\"system\"} 1`, got: %s", body) + if !bytes.Contains(body, []byte("sql_txn_commit_count{tenant=\"system\",node_id=\"1\"} 1")) { + t.Errorf("expected `sql_txn_commit_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body) } - if !bytes.Contains(body, []byte("sql_txn_rollback_count{tenant=\"system\"} 0")) { - t.Errorf("expected `sql_txn_rollback_count{tenant=\"system\"} 0`, got: %s", body) + if !bytes.Contains(body, []byte("sql_txn_rollback_count{tenant=\"system\",node_id=\"1\"} 0")) { + t.Errorf("expected `sql_txn_rollback_count{tenant=\"system\",node_id=\"1\"} 0`, got: %s", body) } }