From 952195128ac361220fed0f27cb992f6e060ab7d8 Mon Sep 17 00:00:00 2001 From: David Hartunian Date: Fri, 24 Mar 2023 19:47:05 +0000 Subject: [PATCH] metrics: improve ux around _status/vars output Previously, the addition of the `tenant` metric label was applied uniformly and could result in confusion for customers who never enable multi-tenancy or c2c. The `tenant="system"` label carries little meaning when there's no tenancy in use. This change modifies the system tenant label application to only happen when a non- sytem in-process tenant is created. Additionally, an environment variable: `COCKROACH_DISABLE_NODE_AND_TENANT_METRIC_LABELS` can be set to `false` to disable the new `tenant` and `node_id` labels. This can be used on single-process tenants to disable the `tenant` label. When the `tenantNameContainer` is nil, or the `nodeID` is set to 0, the labels will not be applied during recorder configuration on startup. This is currently the case when running a separate process tenant using `mt start-sql`. Those tenants *will not* have `tenant` or `nodeID` labels available. Resolves: #94668 Epic: CRDB-18798 Release note (ops change): The `COCKROACH_DISABLE_NODE_AND_TENANT_METRIC_LABELS` env var can be used to disable the newly introduced metric labels in the `_status/vars` output if they conflict with a customer's scrape configuration. --- pkg/server/status/BUILD.bazel | 1 + pkg/server/status/recorder.go | 37 +++++++++++++++++++++++++----- pkg/server/status/recorder_test.go | 16 ++++++------- pkg/server/status_test.go | 20 ++++++++-------- 4 files changed, 50 insertions(+), 24 deletions(-) diff --git a/pkg/server/status/BUILD.bazel b/pkg/server/status/BUILD.bazel index 18181a395219..9f4f93cc7dad 100644 --- a/pkg/server/status/BUILD.bazel +++ b/pkg/server/status/BUILD.bazel @@ -51,6 +51,7 @@ go_library( "//pkg/server/status/statuspb", "//pkg/settings", "//pkg/settings/cluster", + "//pkg/sql/sem/catconstants", "//pkg/ts/tspb", "//pkg/ts/tsutil", "//pkg/util/cgroups", diff --git a/pkg/server/status/recorder.go b/pkg/server/status/recorder.go index 7874aef443b4..b20606c60e2a 100644 --- a/pkg/server/status/recorder.go +++ b/pkg/server/status/recorder.go @@ -20,6 +20,7 @@ import ( "os" "runtime" "strconv" + "sync" "sync/atomic" "time" @@ -33,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/server/status/statuspb" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/sql/sem/catconstants" "github.com/cockroachdb/cockroach/pkg/ts/tspb" "github.com/cockroachdb/cockroach/pkg/ts/tsutil" "github.com/cockroachdb/cockroach/pkg/util/cgroups" @@ -60,8 +62,14 @@ const ( advertiseAddrLabelKey = "advertise-addr" httpAddrLabelKey = "http-addr" sqlAddrLabelKey = "sql-addr" + + disableNodeAndTenantLabelsEnvVar = "COCKROACH_DISABLE_NODE_AND_TENANT_METRIC_LABELS" ) +// This option is provided as an escape hatch for customers who have +// custom scrape logic that adds relevant labels already. +var disableNodeAndTenantLabels = envutil.EnvOrDefaultBool(disableNodeAndTenantLabelsEnvVar, false) + type quantile struct { suffix string quantile float64 @@ -130,6 +138,7 @@ type MetricsRecorder struct { // RLock on it. mu struct { syncutil.RWMutex + sync.Once // nodeRegistry contains, as subregistries, the multiple component-specific // registries which are recorded as "node level" metrics. nodeRegistry *metric.Registry @@ -187,6 +196,14 @@ func (mr *MetricsRecorder) AddTenantRegistry(tenantID roachpb.TenantID, rec *met mr.mu.Lock() defer mr.mu.Unlock() + if !disableNodeAndTenantLabels { + // If there are no in-process tenants running, we don't set the + // tenant label on the system tenant metrics until a seconary + // tenant is initialized. + mr.mu.Do(func() { + mr.mu.nodeRegistry.AddLabel("tenant", catconstants.SystemTenantName) + }) + } mr.mu.tenantRegistries[tenantID] = rec } @@ -226,12 +243,20 @@ func (mr *MetricsRecorder) AddNode( nodeIDGauge := metric.NewGauge(metadata) nodeIDGauge.Update(int64(desc.NodeID)) reg.AddMetric(nodeIDGauge) - reg.AddLabel("tenant", mr.tenantNameContainer) - reg.AddLabel("node_id", strconv.Itoa(int(desc.NodeID))) - // We assume that all stores have been added to the registry - // prior to calling `AddNode`. - for _, s := range mr.mu.storeRegistries { - s.AddLabel("node_id", strconv.Itoa(int(desc.NodeID))) + + if !disableNodeAndTenantLabels { + nodeIDInt := int(desc.NodeID) + if nodeIDInt != 0 { + reg.AddLabel("node_id", strconv.Itoa(int(desc.NodeID))) + // We assume that all stores have been added to the registry + // prior to calling `AddNode`. + for _, s := range mr.mu.storeRegistries { + s.AddLabel("node_id", strconv.Itoa(int(desc.NodeID))) + } + } + if mr.tenantNameContainer != nil && mr.tenantNameContainer.String() != catconstants.SystemTenantName { + reg.AddLabel("tenant", mr.tenantNameContainer) + } } } diff --git a/pkg/server/status/recorder_test.go b/pkg/server/status/recorder_test.go index af4b2bd10eb0..d5f36fe353df 100644 --- a/pkg/server/status/recorder_test.go +++ b/pkg/server/status/recorder_test.go @@ -154,15 +154,15 @@ func TestMetricsRecorderLabels(t *testing.T) { err = recorder.PrintAsText(buf) require.NoError(t, err) - require.Contains(t, buf.String(), `some_metric{tenant="system",node_id="7"} 123`) - require.Contains(t, buf.String(), `some_metric{tenant="application",node_id="7"} 456`) + require.Contains(t, buf.String(), `some_metric{node_id="7",tenant="system"} 123`) + require.Contains(t, buf.String(), `some_metric{node_id="7",tenant="application"} 456`) bufTenant := bytes.NewBuffer([]byte{}) err = recorderTenant.PrintAsText(bufTenant) require.NoError(t, err) - require.NotContains(t, bufTenant.String(), `some_metric{tenant="system",node_id="7"} 123`) - require.Contains(t, bufTenant.String(), `some_metric{tenant="application",node_id="7"} 456`) + require.NotContains(t, bufTenant.String(), `some_metric{node_id="7",tenant="system"} 123`) + require.Contains(t, bufTenant.String(), `some_metric{node_id="7",tenant="application"} 456`) // Update app name in container and ensure // output changes accordingly. @@ -172,15 +172,15 @@ func TestMetricsRecorderLabels(t *testing.T) { err = recorder.PrintAsText(buf) require.NoError(t, err) - require.Contains(t, buf.String(), `some_metric{tenant="system",node_id="7"} 123`) - require.Contains(t, buf.String(), `some_metric{tenant="application2",node_id="7"} 456`) + require.Contains(t, buf.String(), `some_metric{node_id="7",tenant="system"} 123`) + require.Contains(t, buf.String(), `some_metric{node_id="7",tenant="application2"} 456`) bufTenant = bytes.NewBuffer([]byte{}) err = recorderTenant.PrintAsText(bufTenant) require.NoError(t, err) - require.NotContains(t, bufTenant.String(), `some_metric{tenant="system",node_id="7"} 123`) - require.Contains(t, bufTenant.String(), `some_metric{tenant="application2",node_id="7"} 456`) + require.NotContains(t, bufTenant.String(), `some_metric{node_id="7",tenant="system"} 123`) + require.Contains(t, bufTenant.String(), `some_metric{node_id="7",tenant="application2"} 456`) // ======================================== // Verify that the recorder processes tenant time series registries diff --git a/pkg/server/status_test.go b/pkg/server/status_test.go index 1ed158db2f17..e42e8f6e0bdd 100644 --- a/pkg/server/status_test.go +++ b/pkg/server/status_test.go @@ -1339,20 +1339,20 @@ func TestStatusVarsTxnMetrics(t *testing.T) { if err != nil { t.Fatal(err) } - if !bytes.Contains(body, []byte("sql_txn_begin_count{tenant=\"system\",node_id=\"1\"} 1")) { - t.Errorf("expected `sql_txn_begin_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body) + if !bytes.Contains(body, []byte("sql_txn_begin_count{node_id=\"1\"} 1")) { + t.Errorf("expected `sql_txn_begin_count{node_id=\"1\"} 1`, got: %s", body) } - if !bytes.Contains(body, []byte("sql_restart_savepoint_count{tenant=\"system\",node_id=\"1\"} 1")) { - t.Errorf("expected `sql_restart_savepoint_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body) + if !bytes.Contains(body, []byte("sql_restart_savepoint_count{node_id=\"1\"} 1")) { + t.Errorf("expected `sql_restart_savepoint_count{node_id=\"1\"} 1`, got: %s", body) } - if !bytes.Contains(body, []byte("sql_restart_savepoint_release_count{tenant=\"system\",node_id=\"1\"} 1")) { - t.Errorf("expected `sql_restart_savepoint_release_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body) + if !bytes.Contains(body, []byte("sql_restart_savepoint_release_count{node_id=\"1\"} 1")) { + t.Errorf("expected `sql_restart_savepoint_release_count{node_id=\"1\"} 1`, got: %s", body) } - if !bytes.Contains(body, []byte("sql_txn_commit_count{tenant=\"system\",node_id=\"1\"} 1")) { - t.Errorf("expected `sql_txn_commit_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body) + if !bytes.Contains(body, []byte("sql_txn_commit_count{node_id=\"1\"} 1")) { + t.Errorf("expected `sql_txn_commit_count{node_id=\"1\"} 1`, got: %s", body) } - if !bytes.Contains(body, []byte("sql_txn_rollback_count{tenant=\"system\",node_id=\"1\"} 0")) { - t.Errorf("expected `sql_txn_rollback_count{tenant=\"system\",node_id=\"1\"} 0`, got: %s", body) + if !bytes.Contains(body, []byte("sql_txn_rollback_count{node_id=\"1\"} 0")) { + t.Errorf("expected `sql_txn_rollback_count{node_id=\"1\"} 0`, got: %s", body) } }