diff --git a/pkg/server/status/BUILD.bazel b/pkg/server/status/BUILD.bazel index 18181a395219..9f4f93cc7dad 100644 --- a/pkg/server/status/BUILD.bazel +++ b/pkg/server/status/BUILD.bazel @@ -51,6 +51,7 @@ go_library( "//pkg/server/status/statuspb", "//pkg/settings", "//pkg/settings/cluster", + "//pkg/sql/sem/catconstants", "//pkg/ts/tspb", "//pkg/ts/tsutil", "//pkg/util/cgroups", diff --git a/pkg/server/status/recorder.go b/pkg/server/status/recorder.go index 7874aef443b4..b20606c60e2a 100644 --- a/pkg/server/status/recorder.go +++ b/pkg/server/status/recorder.go @@ -20,6 +20,7 @@ import ( "os" "runtime" "strconv" + "sync" "sync/atomic" "time" @@ -33,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/server/status/statuspb" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/sql/sem/catconstants" "github.com/cockroachdb/cockroach/pkg/ts/tspb" "github.com/cockroachdb/cockroach/pkg/ts/tsutil" "github.com/cockroachdb/cockroach/pkg/util/cgroups" @@ -60,8 +62,14 @@ const ( advertiseAddrLabelKey = "advertise-addr" httpAddrLabelKey = "http-addr" sqlAddrLabelKey = "sql-addr" + + disableNodeAndTenantLabelsEnvVar = "COCKROACH_DISABLE_NODE_AND_TENANT_METRIC_LABELS" ) +// This option is provided as an escape hatch for customers who have +// custom scrape logic that adds relevant labels already. +var disableNodeAndTenantLabels = envutil.EnvOrDefaultBool(disableNodeAndTenantLabelsEnvVar, false) + type quantile struct { suffix string quantile float64 @@ -130,6 +138,7 @@ type MetricsRecorder struct { // RLock on it. mu struct { syncutil.RWMutex + sync.Once // nodeRegistry contains, as subregistries, the multiple component-specific // registries which are recorded as "node level" metrics. nodeRegistry *metric.Registry @@ -187,6 +196,14 @@ func (mr *MetricsRecorder) AddTenantRegistry(tenantID roachpb.TenantID, rec *met mr.mu.Lock() defer mr.mu.Unlock() + if !disableNodeAndTenantLabels { + // If there are no in-process tenants running, we don't set the + // tenant label on the system tenant metrics until a seconary + // tenant is initialized. + mr.mu.Do(func() { + mr.mu.nodeRegistry.AddLabel("tenant", catconstants.SystemTenantName) + }) + } mr.mu.tenantRegistries[tenantID] = rec } @@ -226,12 +243,20 @@ func (mr *MetricsRecorder) AddNode( nodeIDGauge := metric.NewGauge(metadata) nodeIDGauge.Update(int64(desc.NodeID)) reg.AddMetric(nodeIDGauge) - reg.AddLabel("tenant", mr.tenantNameContainer) - reg.AddLabel("node_id", strconv.Itoa(int(desc.NodeID))) - // We assume that all stores have been added to the registry - // prior to calling `AddNode`. - for _, s := range mr.mu.storeRegistries { - s.AddLabel("node_id", strconv.Itoa(int(desc.NodeID))) + + if !disableNodeAndTenantLabels { + nodeIDInt := int(desc.NodeID) + if nodeIDInt != 0 { + reg.AddLabel("node_id", strconv.Itoa(int(desc.NodeID))) + // We assume that all stores have been added to the registry + // prior to calling `AddNode`. + for _, s := range mr.mu.storeRegistries { + s.AddLabel("node_id", strconv.Itoa(int(desc.NodeID))) + } + } + if mr.tenantNameContainer != nil && mr.tenantNameContainer.String() != catconstants.SystemTenantName { + reg.AddLabel("tenant", mr.tenantNameContainer) + } } } diff --git a/pkg/server/status/recorder_test.go b/pkg/server/status/recorder_test.go index af4b2bd10eb0..d5f36fe353df 100644 --- a/pkg/server/status/recorder_test.go +++ b/pkg/server/status/recorder_test.go @@ -154,15 +154,15 @@ func TestMetricsRecorderLabels(t *testing.T) { err = recorder.PrintAsText(buf) require.NoError(t, err) - require.Contains(t, buf.String(), `some_metric{tenant="system",node_id="7"} 123`) - require.Contains(t, buf.String(), `some_metric{tenant="application",node_id="7"} 456`) + require.Contains(t, buf.String(), `some_metric{node_id="7",tenant="system"} 123`) + require.Contains(t, buf.String(), `some_metric{node_id="7",tenant="application"} 456`) bufTenant := bytes.NewBuffer([]byte{}) err = recorderTenant.PrintAsText(bufTenant) require.NoError(t, err) - require.NotContains(t, bufTenant.String(), `some_metric{tenant="system",node_id="7"} 123`) - require.Contains(t, bufTenant.String(), `some_metric{tenant="application",node_id="7"} 456`) + require.NotContains(t, bufTenant.String(), `some_metric{node_id="7",tenant="system"} 123`) + require.Contains(t, bufTenant.String(), `some_metric{node_id="7",tenant="application"} 456`) // Update app name in container and ensure // output changes accordingly. @@ -172,15 +172,15 @@ func TestMetricsRecorderLabels(t *testing.T) { err = recorder.PrintAsText(buf) require.NoError(t, err) - require.Contains(t, buf.String(), `some_metric{tenant="system",node_id="7"} 123`) - require.Contains(t, buf.String(), `some_metric{tenant="application2",node_id="7"} 456`) + require.Contains(t, buf.String(), `some_metric{node_id="7",tenant="system"} 123`) + require.Contains(t, buf.String(), `some_metric{node_id="7",tenant="application2"} 456`) bufTenant = bytes.NewBuffer([]byte{}) err = recorderTenant.PrintAsText(bufTenant) require.NoError(t, err) - require.NotContains(t, bufTenant.String(), `some_metric{tenant="system",node_id="7"} 123`) - require.Contains(t, bufTenant.String(), `some_metric{tenant="application2",node_id="7"} 456`) + require.NotContains(t, bufTenant.String(), `some_metric{node_id="7",tenant="system"} 123`) + require.Contains(t, bufTenant.String(), `some_metric{node_id="7",tenant="application2"} 456`) // ======================================== // Verify that the recorder processes tenant time series registries diff --git a/pkg/server/status_test.go b/pkg/server/status_test.go index 1ed158db2f17..e42e8f6e0bdd 100644 --- a/pkg/server/status_test.go +++ b/pkg/server/status_test.go @@ -1339,20 +1339,20 @@ func TestStatusVarsTxnMetrics(t *testing.T) { if err != nil { t.Fatal(err) } - if !bytes.Contains(body, []byte("sql_txn_begin_count{tenant=\"system\",node_id=\"1\"} 1")) { - t.Errorf("expected `sql_txn_begin_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body) + if !bytes.Contains(body, []byte("sql_txn_begin_count{node_id=\"1\"} 1")) { + t.Errorf("expected `sql_txn_begin_count{node_id=\"1\"} 1`, got: %s", body) } - if !bytes.Contains(body, []byte("sql_restart_savepoint_count{tenant=\"system\",node_id=\"1\"} 1")) { - t.Errorf("expected `sql_restart_savepoint_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body) + if !bytes.Contains(body, []byte("sql_restart_savepoint_count{node_id=\"1\"} 1")) { + t.Errorf("expected `sql_restart_savepoint_count{node_id=\"1\"} 1`, got: %s", body) } - if !bytes.Contains(body, []byte("sql_restart_savepoint_release_count{tenant=\"system\",node_id=\"1\"} 1")) { - t.Errorf("expected `sql_restart_savepoint_release_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body) + if !bytes.Contains(body, []byte("sql_restart_savepoint_release_count{node_id=\"1\"} 1")) { + t.Errorf("expected `sql_restart_savepoint_release_count{node_id=\"1\"} 1`, got: %s", body) } - if !bytes.Contains(body, []byte("sql_txn_commit_count{tenant=\"system\",node_id=\"1\"} 1")) { - t.Errorf("expected `sql_txn_commit_count{tenant=\"system\",node_id=\"1\"} 1`, got: %s", body) + if !bytes.Contains(body, []byte("sql_txn_commit_count{node_id=\"1\"} 1")) { + t.Errorf("expected `sql_txn_commit_count{node_id=\"1\"} 1`, got: %s", body) } - if !bytes.Contains(body, []byte("sql_txn_rollback_count{tenant=\"system\",node_id=\"1\"} 0")) { - t.Errorf("expected `sql_txn_rollback_count{tenant=\"system\",node_id=\"1\"} 0`, got: %s", body) + if !bytes.Contains(body, []byte("sql_txn_rollback_count{node_id=\"1\"} 0")) { + t.Errorf("expected `sql_txn_rollback_count{node_id=\"1\"} 0`, got: %s", body) } }