Skip to content

Commit

Permalink
server: add node_id label to _status/vars output
Browse files Browse the repository at this point in the history
Previously, the output of the prometheus metrics via `_status/vars`
did not include any node labels. This caused challenges for
customers who want to monitor large clusters as it requires additional
configuration on the scrape side to ensure a node ID is added to the
metrics. This can be challenging to deal with when nodes come and go
in a cluster and the scrape configuration must change as well.

This change adds a `node_id` prometheus label to the metrics we
output that matches the current node's ID. Since `_status/vars` is
output from a single node there is only ever one single value that's
appropriate here.

Secondary tenants will mark their metrics with either the nodeID of
the shared-process system tenant, or the instanceID of the tenant
process.

Resolves: #94763
Epic: None

Release note (ops change): Prometheus metrics available at the
`_status/vars` path now contain a `node_id` label that identifies the
node they were scraped from.
  • Loading branch information
dhartunian committed Mar 21, 2023
1 parent b26b338 commit 2353a6a
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 31 deletions.
10 changes: 5 additions & 5 deletions pkg/server/node_http_router_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,31 +53,31 @@ func TestRouteToNode(t *testing.T) {
sourceServerID: 1,
nodeIDRequestedInCookie: "local",
expectStatusCode: 200,
expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2"}`),
expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2",node_id="2"}`),
},
{
name: "remote _status/vars on node 2 from node 1 using cookie",
path: "/_status/vars",
sourceServerID: 0,
nodeIDRequestedInCookie: "2",
expectStatusCode: 200,
expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2"}`),
expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2",node_id="2"}`),
},
{
name: "remote _status/vars on node 1 from node 2 using cookie",
path: "/_status/vars",
sourceServerID: 1,
nodeIDRequestedInCookie: "1",
expectStatusCode: 200,
expectRegex: regexp.MustCompile(`ranges_underreplicated{store="1"}`),
expectRegex: regexp.MustCompile(`ranges_underreplicated{store="1",node_id="1"}`),
},
{
name: "remote _status/vars on node 2 from node 1 using query param",
path: "/_status/vars",
sourceServerID: 0,
nodeIDRequestedInQueryParam: "2",
expectStatusCode: 200,
expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2"}`),
expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2",node_id="2"}`),
},
{
name: "query param overrides cookie",
Expand All @@ -86,7 +86,7 @@ func TestRouteToNode(t *testing.T) {
nodeIDRequestedInCookie: "local",
nodeIDRequestedInQueryParam: "2",
expectStatusCode: 200,
expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2"}`),
expectRegex: regexp.MustCompile(`ranges_underreplicated{store="2",node_id="2"}`),
},
{
name: "remote / root HTML on node 2 from node 1 using cookie",
Expand Down
6 changes: 6 additions & 0 deletions pkg/server/status/recorder.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,12 @@ func (mr *MetricsRecorder) AddNode(
nodeIDGauge.Update(int64(desc.NodeID))
reg.AddMetric(nodeIDGauge)
reg.AddLabel("tenant", mr.tenantNameContainer)
reg.AddLabel("node_id", strconv.Itoa(int(desc.NodeID)))
// We assume that all stores have been added to the registry
// prior to calling `AddNode`.
for _, s := range mr.mu.storeRegistries {
s.AddLabel("node_id", strconv.Itoa(int(desc.NodeID)))
}
}

// AddStore adds the Registry from the provided store as a store-level registry
Expand Down
32 changes: 16 additions & 16 deletions pkg/server/status/recorder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,10 @@ func (fs fakeStore) Registry() *metric.Registry {
return fs.registry
}

func TestMetricsRecorderTenants(t *testing.T) {
func TestMetricsRecorderLabels(t *testing.T) {
defer leaktest.AfterTest(t)()
nodeDesc := roachpb.NodeDescriptor{
NodeID: roachpb.NodeID(1),
NodeID: roachpb.NodeID(7),
}
reg1 := metric.NewRegistry()
manual := timeutil.NewManualTime(timeutil.Unix(0, 100))
Expand All @@ -118,7 +118,7 @@ func TestMetricsRecorderTenants(t *testing.T) {
recorder.AddNode(reg1, nodeDesc, 50, "foo:26257", "foo:26258", "foo:5432")

nodeDescTenant := roachpb.NodeDescriptor{
NodeID: roachpb.NodeID(1),
NodeID: roachpb.NodeID(7),
}
regTenant := metric.NewRegistry()
stTenant := cluster.MakeTestingClusterSettings()
Expand Down Expand Up @@ -154,15 +154,15 @@ func TestMetricsRecorderTenants(t *testing.T) {
err = recorder.PrintAsText(buf)
require.NoError(t, err)

require.Contains(t, buf.String(), `some_metric{tenant="system"} 123`)
require.Contains(t, buf.String(), `some_metric{tenant="application"} 456`)
require.Contains(t, buf.String(), `some_metric{tenant="system",node_id="7"} 123`)
require.Contains(t, buf.String(), `some_metric{tenant="application",node_id="7"} 456`)

bufTenant := bytes.NewBuffer([]byte{})
err = recorderTenant.PrintAsText(bufTenant)
require.NoError(t, err)

require.NotContains(t, bufTenant.String(), `some_metric{tenant="system"} 123`)
require.Contains(t, bufTenant.String(), `some_metric{tenant="application"} 456`)
require.NotContains(t, bufTenant.String(), `some_metric{tenant="system",node_id="7"} 123`)
require.Contains(t, bufTenant.String(), `some_metric{tenant="application",node_id="7"} 456`)

// Update app name in container and ensure
// output changes accordingly.
Expand All @@ -172,15 +172,15 @@ func TestMetricsRecorderTenants(t *testing.T) {
err = recorder.PrintAsText(buf)
require.NoError(t, err)

require.Contains(t, buf.String(), `some_metric{tenant="system"} 123`)
require.Contains(t, buf.String(), `some_metric{tenant="application2"} 456`)
require.Contains(t, buf.String(), `some_metric{tenant="system",node_id="7"} 123`)
require.Contains(t, buf.String(), `some_metric{tenant="application2",node_id="7"} 456`)

bufTenant = bytes.NewBuffer([]byte{})
err = recorderTenant.PrintAsText(bufTenant)
require.NoError(t, err)

require.NotContains(t, bufTenant.String(), `some_metric{tenant="system"} 123`)
require.Contains(t, bufTenant.String(), `some_metric{tenant="application2"} 456`)
require.NotContains(t, bufTenant.String(), `some_metric{tenant="system",node_id="7"} 123`)
require.Contains(t, bufTenant.String(), `some_metric{tenant="application2",node_id="7"} 456`)

// ========================================
// Verify that the recorder processes tenant time series registries
Expand All @@ -190,17 +190,17 @@ func TestMetricsRecorderTenants(t *testing.T) {
// System tenant metrics
{
Name: "cr.node.node-id",
Source: "1",
Source: "7",
Datapoints: []tspb.TimeSeriesDatapoint{
{
TimestampNanos: manual.Now().UnixNano(),
Value: float64(1),
Value: float64(7),
},
},
},
{
Name: "cr.node.some_metric",
Source: "1",
Source: "7",
Datapoints: []tspb.TimeSeriesDatapoint{
{
TimestampNanos: manual.Now().UnixNano(),
Expand All @@ -211,7 +211,7 @@ func TestMetricsRecorderTenants(t *testing.T) {
// App tenant metrics
{
Name: "cr.node.node-id",
Source: "1-123",
Source: "7-123",
Datapoints: []tspb.TimeSeriesDatapoint{
{
TimestampNanos: manual.Now().UnixNano(),
Expand All @@ -221,7 +221,7 @@ func TestMetricsRecorderTenants(t *testing.T) {
},
{
Name: "cr.node.some_metric",
Source: "1-123",
Source: "7-123",
Datapoints: []tspb.TimeSeriesDatapoint{
{
TimestampNanos: manual.Now().UnixNano(),
Expand Down
20 changes: 10 additions & 10 deletions pkg/server/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1339,20 +1339,20 @@ func TestStatusVarsTxnMetrics(t *testing.T) {
if err != nil {
t.Fatal(err)
}
if !bytes.Contains(body, []byte("sql_txn_begin_count{tenant=\"system\"} 1")) {
t.Errorf("expected `sql_txn_begin_count{tenant=\"system\"} 1`, got: %s", body)
if !bytes.Contains(body, []byte("sql_txn_begin_count{tenant=\"system\",node_id=\"1\"} 1")) {
t.Errorf("expected `sql_txn_begin_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body)
}
if !bytes.Contains(body, []byte("sql_restart_savepoint_count{tenant=\"system\"} 1")) {
t.Errorf("expected `sql_restart_savepoint_count{tenant=\"system\"} 1`, got: %s", body)
if !bytes.Contains(body, []byte("sql_restart_savepoint_count{tenant=\"system\",node_id=\"1\"} 1")) {
t.Errorf("expected `sql_restart_savepoint_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body)
}
if !bytes.Contains(body, []byte("sql_restart_savepoint_release_count{tenant=\"system\"} 1")) {
t.Errorf("expected `sql_restart_savepoint_release_count{tenant=\"system\"} 1`, got: %s", body)
if !bytes.Contains(body, []byte("sql_restart_savepoint_release_count{tenant=\"system\",node_id=\"1\"} 1")) {
t.Errorf("expected `sql_restart_savepoint_release_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body)
}
if !bytes.Contains(body, []byte("sql_txn_commit_count{tenant=\"system\"} 1")) {
t.Errorf("expected `sql_txn_commit_count{tenant=\"system\"} 1`, got: %s", body)
if !bytes.Contains(body, []byte("sql_txn_commit_count{tenant=\"system\",node_id=\"1\"} 1")) {
t.Errorf("expected `sql_txn_commit_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body)
}
if !bytes.Contains(body, []byte("sql_txn_rollback_count{tenant=\"system\"} 0")) {
t.Errorf("expected `sql_txn_rollback_count{tenant=\"system\"} 0`, got: %s", body)
if !bytes.Contains(body, []byte("sql_txn_rollback_count{tenant=\"system\",node_id=\"1\"} 0")) {
t.Errorf("expected `sql_txn_rollback_count{tenant=\"system\",node_id=\"1\"} 0`, got: %s", body)
}
}

Expand Down

0 comments on commit 2353a6a

Please sign in to comment.