From 3750ea6ee7270617fc1d114229e2860a0a7a17a5 Mon Sep 17 00:00:00 2001
From: Matthew Todd
Date: Mon, 5 Dec 2022 13:56:55 -0500
Subject: [PATCH] statusccl: temporarily serve /_status/nodes to tenants

Fixes #92065
Part of #89949

Our current UI architecture has many of the SQL Activity pages mapping
node or sql instance ids to regions at page load time, using the
response from this `/_status/nodes` endpoint to get the information
they need.

This approach is problematic for ephemeral serverless tenants and will
need a broader reimagining. We will probably end up storing locality
information directly in many of the sqlstats tables, which would likely
require something like sending instance localities along in the distsql
tracing. An initial exploration in this direction happened in #92259.

In the meantime, as a stop-gap measure, we implement a reduced version
of this `/_status/nodes` endpoint for tenants, keeping the SQL Activity
pages somewhat working for multiregion tenants.

Release note: None
---
 .../serverccl/statusccl/tenant_status_test.go | 51 ++++++++++++++++++-
 pkg/ccl/serverccl/tenant_test_utils.go        | 33 ++++++++----
 pkg/server/serverpb/status.go                 |  1 +
 pkg/server/status.go                          | 38 ++++++++++++++
 .../cluster-ui/src/util/proto.spec.ts         |  5 ++
 .../workspaces/cluster-ui/src/util/proto.ts   |  3 ++
 6 files changed, 120 insertions(+), 11 deletions(-)

diff --git a/pkg/ccl/serverccl/statusccl/tenant_status_test.go b/pkg/ccl/serverccl/statusccl/tenant_status_test.go
index 6c7409451487..37e3c90b0a09 100644
--- a/pkg/ccl/serverccl/statusccl/tenant_status_test.go
+++ b/pkg/ccl/serverccl/statusccl/tenant_status_test.go
@@ -60,7 +60,21 @@ func TestTenantStatusAPI(t *testing.T) {
 		StoreDisableCoalesceAdjacent: true,
 	}
 
-	testHelper := serverccl.NewTestTenantHelper(t, 3 /* tenantClusterSize */, knobs)
+	tenantLocalities := []roachpb.Locality{
+		{Tiers: []roachpb.Tier{{Key: "region", Value: "gcp-us-west1"}}},
+		{Tiers: []roachpb.Tier{{Key: "region", Value: "gcp-us-central1"}}},
+		{Tiers: []roachpb.Tier{{Key: "region", Value: "gcp-us-east1"}}},
+	}
+
+	testHelper := serverccl.NewTestTenantHelperWithTenantArgs(
+		t,
+		3, /* tenantClusterSize */
+		knobs,
+		func(tenantIdx int, tenantArgs *base.TestTenantArgs) {
+			tenantArgs.Locality = tenantLocalities[tenantIdx]
+		},
+	)
+
 	defer testHelper.Cleanup(ctx, t)
 
 	t.Run("reset_sql_stats", func(t *testing.T) {
@@ -114,6 +128,10 @@ func TestTenantStatusAPI(t *testing.T) {
 	t.Run("tenant_logs", func(t *testing.T) {
 		testTenantLogs(ctx, t, testHelper)
 	})
+
+	t.Run("tenant_nodes", func(t *testing.T) {
+		testTenantNodes(ctx, t, testHelper, tenantLocalities)
+	})
 }
 
 func testTenantLogs(ctx context.Context, t *testing.T, helper serverccl.TenantTestHelper) {
@@ -1239,6 +1257,37 @@ func testTenantAuthOnStatements(
 	require.NoError(t, err)
 }
 
+func testTenantNodes(
+	ctx context.Context,
+	t *testing.T,
+	helper serverccl.TenantTestHelper,
+	tenantLocalities []roachpb.Locality,
+) {
+	// TODO(todd): Get this test working over HTTP, #93267.
+	//
+	// When I first tried, I ran into errors about the SQL user
+	// `authentic_user_noadmin` already existing, but at first glance it seemed
+	// that the sync.Once mechanism used in its setup had already accounted
+	// for that scenario; that we're properly passing by reference all the way
+	// through; and that these tests aren't being run in parallel.
+	tenantA := helper.TestCluster().TenantStatusSrv(0)
+	resp, err := tenantA.Nodes(ctx, &serverpb.NodesRequest{})
+	require.NoError(t, err)
+
+	expected := map[roachpb.NodeID]roachpb.Locality{
+		1: tenantLocalities[0],
+		2: tenantLocalities[1],
+		3: tenantLocalities[2],
+	}
+
+	actual := make(map[roachpb.NodeID]roachpb.Locality)
+	for _, node := range resp.Nodes {
+		actual[node.Desc.NodeID] = node.Desc.Locality
+	}
+
+	require.Equal(t, expected, actual)
+}
+
 // assertStartKeyInRange compares the pretty printed startKey with the provided
 // tenantPrefix key, ensuring that the startKey starts with the tenantPrefix.
 func assertStartKeyInRange(t *testing.T, startKey string, tenantPrefix roachpb.Key) {
diff --git a/pkg/ccl/serverccl/tenant_test_utils.go b/pkg/ccl/serverccl/tenant_test_utils.go
index 39d01ba19ef1..d955e29fdf1d 100644
--- a/pkg/ccl/serverccl/tenant_test_utils.go
+++ b/pkg/ccl/serverccl/tenant_test_utils.go
@@ -93,17 +93,11 @@ type TestTenant interface {
 var _ TestTenant = &testTenant{}
 
 func newTestTenant(
-	t *testing.T,
-	server serverutils.TestServerInterface,
-	tenantID roachpb.TenantID,
-	knobs base.TestingKnobs,
+	t *testing.T, server serverutils.TestServerInterface, args base.TestTenantArgs,
 ) TestTenant {
 	t.Helper()
 
-	tenantParams := tests.CreateTestTenantParams(tenantID)
-	tenantParams.TestingKnobs = knobs
-
-	tenant, tenantConn := serverutils.StartTenant(t, server, tenantParams)
+	tenant, tenantConn := serverutils.StartTenant(t, server, args)
 	sqlDB := sqlutils.MakeSQLRunner(tenantConn)
 	status := tenant.StatusServer().(serverpb.SQLStatusServer)
 	sqlStats := tenant.PGServer().(*pgwire.Server).SQLServer.
@@ -151,6 +145,20 @@ func NewTestTenantHelper(
 ) TenantTestHelper {
 	t.Helper()
 
+	return NewTestTenantHelperWithTenantArgs(t, tenantClusterSize, knobs, func(int, *base.TestTenantArgs) {})
+}
+
+// NewTestTenantHelperWithTenantArgs constructs a TenantTestHelper instance,
+// offering the caller the opportunity to modify the arguments passed when
+// starting each tenant.
+func NewTestTenantHelperWithTenantArgs(
+	t *testing.T,
+	tenantClusterSize int,
+	knobs base.TestingKnobs,
+	customizeTenantArgs func(tenantIdx int, tenantArgs *base.TestTenantArgs),
+) TenantTestHelper {
+	t.Helper()
+
 	params, _ := tests.CreateTestServerParams()
 	params.Knobs = knobs
 	// We're running tenant tests, no need for a default tenant.
@@ -168,6 +176,7 @@ func NewTestTenantHelper(
 			tenantClusterSize,
 			security.EmbeddedTenantIDs()[0],
 			knobs,
+			customizeTenantArgs,
 		),
 		// Spin up a small tenant cluster under a different tenant ID to test
 		// tenant isolation.
@@ -177,6 +186,7 @@ func NewTestTenantHelper(
 			1, /* tenantClusterSize */
 			security.EmbeddedTenantIDs()[1],
 			knobs,
+			func(int, *base.TestTenantArgs) {},
 		),
 	}
 }
@@ -224,13 +234,16 @@ func newTenantClusterHelper(
 	tenantClusterSize int,
 	tenantID uint64,
 	knobs base.TestingKnobs,
+	customizeTenantArgs func(tenantIdx int, tenantArgs *base.TestTenantArgs),
 ) TenantClusterHelper {
 	t.Helper()
 
 	var cluster tenantCluster = make([]TestTenant, tenantClusterSize)
 	for i := 0; i < tenantClusterSize; i++ {
-		cluster[i] =
-			newTestTenant(t, server, roachpb.MustMakeTenantID(tenantID), knobs)
+		args := tests.CreateTestTenantParams(roachpb.MustMakeTenantID(tenantID))
+		args.TestingKnobs = knobs
+		customizeTenantArgs(i, &args)
+		cluster[i] = newTestTenant(t, server, args)
 	}
 
 	return cluster
diff --git a/pkg/server/serverpb/status.go b/pkg/server/serverpb/status.go
index 1885f5040993..74a4c064b874 100644
--- a/pkg/server/serverpb/status.go
+++ b/pkg/server/serverpb/status.go
@@ -39,6 +39,7 @@ type SQLStatusServer interface {
 	UserSQLRoles(context.Context, *UserSQLRolesRequest) (*UserSQLRolesResponse, error)
 	TxnIDResolution(context.Context, *TxnIDResolutionRequest) (*TxnIDResolutionResponse, error)
 	TransactionContentionEvents(context.Context, *TransactionContentionEventsRequest) (*TransactionContentionEventsResponse, error)
+	Nodes(context.Context, *NodesRequest) (*NodesResponse, error)
 	NodesList(context.Context, *NodesListRequest) (*NodesListResponse, error)
 	ListExecutionInsights(context.Context, *ListExecutionInsightsRequest) (*ListExecutionInsightsResponse, error)
 	LogFilesList(context.Context, *LogFilesListRequest) (*LogFilesListResponse, error)
diff --git a/pkg/server/status.go b/pkg/server/status.go
index d7d951ef0153..d6c22e8bdb50 100644
--- a/pkg/server/status.go
+++ b/pkg/server/status.go
@@ -1491,6 +1491,44 @@ func (s *statusServer) NodesList(
 	return s.serverIterator.nodesList(ctx)
 }
 
+// Nodes returns a limited subset of the full NodesResponse.
+//
+// Its existence is a stop-gap measure to support running much of our UI code
+// on multiregion tenant clusters. Longer-term, we will need to reconsider
+// the overall architecture (e.g. store node ids in sqlstats, map node id ->
+// region at UI display time) because sql instances are far more ephemeral
+// than cluster nodes, and this Nodes endpoint is only able to answer for
+// currently-live sql instances. What I think we want is to store locality
+// information directly in the sqlstats tables, sourced from some new tracing
+// in the distsql flow.
+func (s *statusServer) Nodes(
+	ctx context.Context, req *serverpb.NodesRequest,
+) (*serverpb.NodesResponse, error) {
+	ctx = propagateGatewayMetadata(ctx)
+	ctx = s.AnnotateCtx(ctx)
+
+	if err := s.privilegeChecker.requireViewActivityOrViewActivityRedactedPermission(ctx); err != nil {
+		// NB: not using serverError() here since the priv checker
+		// already returns a proper gRPC error status.
+		return nil, err
+	}
+
+	instances, err := s.sqlServer.sqlInstanceReader.GetAllInstances(ctx)
+	if err != nil {
+		return nil, err
+	}
+	var resp serverpb.NodesResponse
+	for _, instance := range instances {
+		resp.Nodes = append(resp.Nodes, statuspb.NodeStatus{
+			Desc: roachpb.NodeDescriptor{
+				NodeID:   roachpb.NodeID(instance.InstanceID),
+				Locality: instance.Locality,
+			},
+		})
+	}
+	return &resp, nil
+}
+
 // Nodes returns all node statuses.
 //
 // Do not use this method inside the server code! Use
diff --git a/pkg/ui/workspaces/cluster-ui/src/util/proto.spec.ts b/pkg/ui/workspaces/cluster-ui/src/util/proto.spec.ts
index c63d1f19ae92..a84385e3b54c 100644
--- a/pkg/ui/workspaces/cluster-ui/src/util/proto.spec.ts
+++ b/pkg/ui/workspaces/cluster-ui/src/util/proto.spec.ts
@@ -66,5 +66,10 @@ describe("Proto utils", () => {
         statusWithRolledMetrics.metrics,
       );
     });
+
+    it("does not explode when node fields are missing", () => {
+      const emptyNodeStatus: Partial<INodeStatus> = {};
+      assert.deepEqual(rollupStoreMetrics(emptyNodeStatus), {});
+    });
   });
 });
diff --git a/pkg/ui/workspaces/cluster-ui/src/util/proto.ts b/pkg/ui/workspaces/cluster-ui/src/util/proto.ts
index 6e6058191a91..b48dfb9209d4 100644
--- a/pkg/ui/workspaces/cluster-ui/src/util/proto.ts
+++ b/pkg/ui/workspaces/cluster-ui/src/util/proto.ts
@@ -21,6 +21,9 @@ export type StatusMetrics = typeof nodeStatus.metrics;
  * with metrics from `store_statuses.metrics` object.
  */
 export function rollupStoreMetrics(ns: INodeStatus): StatusMetrics {
+  if (!ns.store_statuses) {
+    return ns.metrics || {};
+  }
   return ns.store_statuses
     .map(ss => ss.metrics)
     .reduce((acc, i) => {
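
For illustration only (not part of the patch): a minimal consumer-side
sketch of how the reduced /_status/nodes payload can be turned into the
node-id-to-region mapping the SQL Activity pages rely on. The helper name
regionsByNodeID and the hard-coded "region" tier key are assumptions made
for this sketch; the types are the same serverpb/roachpb types used above.

	package example

	import (
		"github.com/cockroachdb/cockroach/pkg/roachpb"
		"github.com/cockroachdb/cockroach/pkg/server/serverpb"
	)

	// regionsByNodeID extracts a node-to-region map from a NodesResponse.
	// It assumes each node's region is expressed as a locality tier with
	// key "region", matching the localities configured in the test above.
	func regionsByNodeID(resp *serverpb.NodesResponse) map[roachpb.NodeID]string {
		regions := make(map[roachpb.NodeID]string)
		for _, node := range resp.Nodes {
			for _, tier := range node.Desc.Locality.Tiers {
				if tier.Key == "region" {
					regions[node.Desc.NodeID] = tier.Value
					break
				}
			}
		}
		return regions
	}

Against the three-tenant cluster configured in TestTenantStatusAPI above,
this would yield {1: "gcp-us-west1", 2: "gcp-us-central1", 3: "gcp-us-east1"}.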