diff --git a/dev b/dev
index 49fd459e6306..9c0955b0a165 100755
--- a/dev
+++ b/dev
@@ -3,7 +3,7 @@ set -euo pipefail
 
 # Bump this counter to force rebuilding `dev` on all machines.
-DEV_VERSION=11
+DEV_VERSION=12
 
 THIS_DIR=$(cd "$(dirname "$0")" && pwd)
 BINARY_DIR=$THIS_DIR/bin/dev-versions
diff --git a/docs/generated/http/full.md b/docs/generated/http/full.md
index 9fdfb19932da..0058540ac85a 100644
--- a/docs/generated/http/full.md
+++ b/docs/generated/http/full.md
@@ -209,6 +209,71 @@ RegionsResponse describes the available regions.
 
 
 
+## NodesList
+
+
+
+NodesList returns all available nodes with their addresses.
+
+Support status: [reserved](#support-status)
+
+#### Request Parameters
+
+
+
+NodesListRequest requests a list of all nodes.
+The nodes are KV nodes for a single-tenant cluster and for
+the host cluster of a multi-tenant cluster.
+The nodes are SQL instances for the tenants of a multi-tenant
+cluster.
+
+
+
+
+
+
+
+#### Response Parameters
+
+
+
+NodesListResponse contains a list of all nodes with their addresses.
+The nodes are KV nodes for a single-tenant cluster and for
+the host cluster of a multi-tenant cluster.
+The nodes are SQL instances for the tenants of a multi-tenant
+cluster.
+
+
+| Field | Type | Label | Description | Support status |
+| ----- | ---- | ----- | ----------- | -------------- |
+| nodes | [NodeDetails](#cockroach.server.serverpb.NodesListResponse-cockroach.server.serverpb.NodeDetails) | repeated | nodes contains a list of NodeDetails. Each node in the list is a SQL node for a tenant server and a KV node for a KV server. | [reserved](#support-status) |
+
+
+
+
+
+
+
+#### NodeDetails
+
+
+
+| Field | Type | Label | Description | Support status |
+| ----- | ---- | ----- | ----------- | -------------- |
+| node_id | [int32](#cockroach.server.serverpb.NodesListResponse-int32) | | node_id is a unique identifier for the node. This corresponds to the SQL instance ID for a tenant server and the KV node ID for a KV server. | [reserved](#support-status) |
+| address | [cockroach.util.UnresolvedAddr](#cockroach.server.serverpb.NodesListResponse-cockroach.util.UnresolvedAddr) | | address is the RPC address for a KV node. This will be set to null for a tenant server node. | [reserved](#support-status) |
+| sql_address | [cockroach.util.UnresolvedAddr](#cockroach.server.serverpb.NodesListResponse-cockroach.util.UnresolvedAddr) | | sql_address is the SQL address for a node.
| [reserved](#support-status) | + + + + + + ## Nodes `GET /_status/nodes` diff --git a/pkg/cli/BUILD.bazel b/pkg/cli/BUILD.bazel index 171bb6e9eec6..368522e923a4 100644 --- a/pkg/cli/BUILD.bazel +++ b/pkg/cli/BUILD.bazel @@ -313,6 +313,7 @@ go_test( "userfiletable_test.go", "workload_test.go", "zip_helpers_test.go", + "zip_tenant_test.go", "zip_test.go", ], data = glob(["testdata/**"]), @@ -320,6 +321,7 @@ go_test( deps = [ "//pkg/base", "//pkg/build", + "//pkg/ccl/kvccl/kvtenantccl", "//pkg/cli/clicfg", "//pkg/cli/clierror", "//pkg/cli/clierrorplus", diff --git a/pkg/cli/main_test.go b/pkg/cli/main_test.go index e860d1f0d64f..cd418d55a829 100644 --- a/pkg/cli/main_test.go +++ b/pkg/cli/main_test.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/security/securitytest" "github.com/cockroachdb/cockroach/pkg/server" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" ) func init() { @@ -29,8 +30,8 @@ func TestMain(m *testing.M) { // CLI tests are sensitive to the server version, but test binaries don't have // a version injected. Pretend to be a very up-to-date version. defer build.TestingOverrideTag("v999.0.0")() - serverutils.InitTestServerFactory(server.TestServerFactory) + serverutils.InitTestClusterFactory(testcluster.TestClusterFactory) os.Exit(m.Run()) } diff --git a/pkg/cli/testdata/zip/testzip_tenant b/pkg/cli/testdata/zip/testzip_tenant new file mode 100644 index 000000000000..bab810cbf5df --- /dev/null +++ b/pkg/cli/testdata/zip/testzip_tenant @@ -0,0 +1,123 @@ +zip +---- +debug zip --concurrency=1 /dev/null +[cluster] establishing RPC connection to ... +[cluster] retrieving the node status to get the SQL address... done +[cluster] using SQL address: ... +[cluster] creating output file /dev/null... done +[cluster] requesting data for debug/events... received response... +[cluster] requesting data for debug/events: last request failed: rpc error: ... +[cluster] requesting data for debug/events: creating error output: debug/events.json.err.txt... done +[cluster] requesting data for debug/rangelog... received response... +[cluster] requesting data for debug/rangelog: last request failed: rpc error: ... +[cluster] requesting data for debug/rangelog: creating error output: debug/rangelog.json.err.txt... done +[cluster] requesting data for debug/settings... received response... +[cluster] requesting data for debug/settings: last request failed: rpc error: ... +[cluster] requesting data for debug/settings: creating error output: debug/settings.json.err.txt... done +[cluster] requesting data for debug/reports/problemranges... received response... +[cluster] requesting data for debug/reports/problemranges: last request failed: rpc error: ... +[cluster] requesting data for debug/reports/problemranges: creating error output: debug/reports/problemranges.json.err.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_contention_events... writing output: debug/crdb_internal.cluster_contention_events.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_distsql_flows... writing output: debug/crdb_internal.cluster_distsql_flows.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_database_privileges... writing output: debug/crdb_internal.cluster_database_privileges.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_queries... writing output: debug/crdb_internal.cluster_queries.txt... 
done +[cluster] retrieving SQL data for crdb_internal.cluster_sessions... writing output: debug/crdb_internal.cluster_sessions.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_settings... writing output: debug/crdb_internal.cluster_settings.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_transactions... writing output: debug/crdb_internal.cluster_transactions.txt... done +[cluster] retrieving SQL data for crdb_internal.default_privileges... writing output: debug/crdb_internal.default_privileges.txt... done +[cluster] retrieving SQL data for crdb_internal.jobs... writing output: debug/crdb_internal.jobs.txt... done +[cluster] retrieving SQL data for system.jobs... writing output: debug/system.jobs.txt... done +[cluster] retrieving SQL data for system.descriptor... writing output: debug/system.descriptor.txt... done +[cluster] retrieving SQL data for system.namespace... writing output: debug/system.namespace.txt... done +[cluster] retrieving SQL data for system.scheduled_jobs... writing output: debug/system.scheduled_jobs.txt... done +[cluster] retrieving SQL data for system.settings... writing output: debug/system.settings.txt... done +[cluster] retrieving SQL data for system.replication_stats... writing output: debug/system.replication_stats.txt... done +[cluster] retrieving SQL data for system.replication_critical_localities... writing output: debug/system.replication_critical_localities.txt... done +[cluster] retrieving SQL data for system.replication_constraint_stats... writing output: debug/system.replication_constraint_stats.txt... done +[cluster] retrieving SQL data for "".crdb_internal.create_schema_statements... writing output: debug/crdb_internal.create_schema_statements.txt... done +[cluster] retrieving SQL data for "".crdb_internal.create_statements... writing output: debug/crdb_internal.create_statements.txt... done +[cluster] retrieving SQL data for "".crdb_internal.create_type_statements... writing output: debug/crdb_internal.create_type_statements.txt... done +[cluster] retrieving SQL data for crdb_internal.kv_node_liveness... writing output: debug/crdb_internal.kv_node_liveness.txt... +[cluster] retrieving SQL data for crdb_internal.kv_node_liveness: last request failed: pq: unimplemented: operation is unsupported in multi-tenancy mode +[cluster] retrieving SQL data for crdb_internal.kv_node_liveness: creating error output: debug/crdb_internal.kv_node_liveness.txt.err.txt... done +[cluster] retrieving SQL data for crdb_internal.kv_node_status... writing output: debug/crdb_internal.kv_node_status.txt... +[cluster] retrieving SQL data for crdb_internal.kv_node_status: last request failed: pq: unimplemented: operation is unsupported in multi-tenancy mode +[cluster] retrieving SQL data for crdb_internal.kv_node_status: creating error output: debug/crdb_internal.kv_node_status.txt.err.txt... done +[cluster] retrieving SQL data for crdb_internal.kv_store_status... writing output: debug/crdb_internal.kv_store_status.txt... +[cluster] retrieving SQL data for crdb_internal.kv_store_status: last request failed: pq: unimplemented: operation is unsupported in multi-tenancy mode +[cluster] retrieving SQL data for crdb_internal.kv_store_status: creating error output: debug/crdb_internal.kv_store_status.txt.err.txt... done +[cluster] retrieving SQL data for crdb_internal.regions... writing output: debug/crdb_internal.regions.txt... done +[cluster] retrieving SQL data for crdb_internal.schema_changes... 
writing output: debug/crdb_internal.schema_changes.txt... done +[cluster] retrieving SQL data for crdb_internal.partitions... writing output: debug/crdb_internal.partitions.txt... done +[cluster] retrieving SQL data for crdb_internal.zones... writing output: debug/crdb_internal.zones.txt... done +[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done +[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done +[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done +[cluster] requesting nodes... received response... converting to JSON... writing binary output: debug/nodes.json... done +[cluster] requesting liveness... received response... +[cluster] requesting liveness: last request failed: rpc error: ... +[cluster] requesting liveness: creating error output: debug/liveness.json.err.txt... done +[cluster] requesting CPU profiles +[cluster] profiles generated +[cluster] profile for node 1... +[cluster] profile for node 1: last request failed: rpc error: ... +[cluster] profile for node 1: creating error output: debug/nodes/1/cpu.pprof.err.txt... done +[node 1] node status... converting to JSON... writing binary output: debug/nodes/1/status.json... done +[node 1] using SQL connection URL: postgresql://... +[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done +[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... +[node 1] retrieving SQL data for crdb_internal.gossip_alerts: last request failed: pq: unimplemented: operation is unsupported in multi-tenancy mode +[node 1] retrieving SQL data for crdb_internal.gossip_alerts: creating error output: debug/nodes/1/crdb_internal.gossip_alerts.txt.err.txt... done +[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... +[node 1] retrieving SQL data for crdb_internal.gossip_liveness: last request failed: pq: unimplemented: operation is unsupported in multi-tenancy mode +[node 1] retrieving SQL data for crdb_internal.gossip_liveness: creating error output: debug/nodes/1/crdb_internal.gossip_liveness.txt.err.txt... done +[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... +[node 1] retrieving SQL data for crdb_internal.gossip_network: last request failed: pq: unimplemented: operation is unsupported in multi-tenancy mode +[node 1] retrieving SQL data for crdb_internal.gossip_network: creating error output: debug/nodes/1/crdb_internal.gossip_network.txt.err.txt... done +[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... +[node 1] retrieving SQL data for crdb_internal.gossip_nodes: last request failed: pq: unimplemented: operation is unsupported in multi-tenancy mode +[node 1] retrieving SQL data for crdb_internal.gossip_nodes: creating error output: debug/nodes/1/crdb_internal.gossip_nodes.txt.err.txt... done +[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done +[node 1] retrieving SQL data for crdb_internal.node_build_info... 
writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done +[node 1] retrieving SQL data for crdb_internal.node_contention_events... writing output: debug/nodes/1/crdb_internal.node_contention_events.txt... done +[node 1] retrieving SQL data for crdb_internal.node_distsql_flows... writing output: debug/nodes/1/crdb_internal.node_distsql_flows.txt... done +[node 1] retrieving SQL data for crdb_internal.node_inflight_trace_spans... writing output: debug/nodes/1/crdb_internal.node_inflight_trace_spans.txt... done +[node 1] retrieving SQL data for crdb_internal.node_metrics... writing output: debug/nodes/1/crdb_internal.node_metrics.txt... done +[node 1] retrieving SQL data for crdb_internal.node_queries... writing output: debug/nodes/1/crdb_internal.node_queries.txt... done +[node 1] retrieving SQL data for crdb_internal.node_runtime_info... writing output: debug/nodes/1/crdb_internal.node_runtime_info.txt... done +[node 1] retrieving SQL data for crdb_internal.node_sessions... writing output: debug/nodes/1/crdb_internal.node_sessions.txt... done +[node 1] retrieving SQL data for crdb_internal.node_statement_statistics... writing output: debug/nodes/1/crdb_internal.node_statement_statistics.txt... done +[node 1] retrieving SQL data for crdb_internal.node_transaction_statistics... writing output: debug/nodes/1/crdb_internal.node_transaction_statistics.txt... done +[node 1] retrieving SQL data for crdb_internal.node_transactions... writing output: debug/nodes/1/crdb_internal.node_transactions.txt... done +[node 1] retrieving SQL data for crdb_internal.node_txn_stats... writing output: debug/nodes/1/crdb_internal.node_txn_stats.txt... done +[node 1] retrieving SQL data for crdb_internal.active_range_feeds... writing output: debug/nodes/1/crdb_internal.active_range_feeds.txt... done +[node 1] requesting data for debug/nodes/1/details... received response... converting to JSON... writing binary output: debug/nodes/1/details.json... done +[node 1] requesting data for debug/nodes/1/gossip... received response... +[node 1] requesting data for debug/nodes/1/gossip: last request failed: rpc error: ... +[node 1] requesting data for debug/nodes/1/gossip: creating error output: debug/nodes/1/gossip.json.err.txt... done +[node 1] requesting data for debug/nodes/1/enginestats... received response... +[node 1] requesting data for debug/nodes/1/enginestats: last request failed: rpc error: ... +[node 1] requesting data for debug/nodes/1/enginestats: creating error output: debug/nodes/1/enginestats.json.err.txt... done +[node 1] requesting stacks... received response... +[node 1] requesting stacks: last request failed: rpc error: ... +[node 1] requesting stacks: creating error output: debug/nodes/1/stacks.txt.err.txt... done +[node 1] requesting stacks with labels... received response... +[node 1] requesting stacks with labels: last request failed: rpc error: ... +[node 1] requesting stacks with labels: creating error output: debug/nodes/1/stacks_with_labels.txt.err.txt... done +[node 1] requesting heap profile... received response... +[node 1] requesting heap profile: last request failed: rpc error: ... +[node 1] requesting heap profile: creating error output: debug/nodes/1/heap.pprof.err.txt... done +[node 1] requesting heap file list... received response... +[node 1] requesting heap file list: last request failed: rpc error: ... +[node 1] requesting heap file list: creating error output: debug/nodes/1/heapprof.err.txt... done +[node 1] requesting goroutine dump list... received response... 
+[node 1] requesting goroutine dump list: last request failed: rpc error: ... +[node 1] requesting goroutine dump list: creating error output: debug/nodes/1/goroutines.err.txt... done +[node 1] requesting log file ... +[node 1] requesting log file ... +[node 1] requesting log file ... +[node 1] requesting ranges... received response... +[node 1] requesting ranges: last request failed: rpc error: ... +[node 1] requesting ranges: creating error output: debug/nodes/1/ranges.err.txt... done +[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done +[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done diff --git a/pkg/cli/testutils.go b/pkg/cli/testutils.go index d279179926fd..774a142f4fbf 100644 --- a/pkg/cli/testutils.go +++ b/pkg/cli/testutils.go @@ -50,6 +50,7 @@ func TestingReset() { // TestCLI wraps a test server and is used by tests to make assertions about the output of CLI commands. type TestCLI struct { *server.TestServer + tenant serverutils.TestTenantInterface certsDir string cleanupFunc func() error prevStderr *os.File @@ -80,6 +81,9 @@ type TestCLIParams struct { // NoNodelocal, if true, disables node-local external I/O storage. NoNodelocal bool + + // Multitenant, if true, starts the test in multitenant mode. + Multitenant bool } // testTempFilePrefix is a sentinel marker to be used as the prefix of a @@ -155,6 +159,16 @@ func newCLITestWithArgs(params TestCLIParams, argsFn func(args *base.TestServerA log.Infof(context.Background(), "SQL listener at %s", c.ServingSQLAddr()) } + if params.Multitenant { + if c.TestServer == nil { + c.fail(errors.AssertionFailedf("multitenant mode for CLI requires a DB server, try setting `NoServer` argument to false")) + } + tenantArgs := base.TestTenantArgs{TenantID: serverutils.TestTenantID()} + if c.Insecure() { + tenantArgs.ForceInsecure = true + } + c.tenant, _ = serverutils.StartTenant(c.t, c.TestServer, tenantArgs) + } baseCfg.User = security.NodeUserName() // Ensure that CLI error messages and anything meant for the @@ -203,6 +217,14 @@ func (c *TestCLI) RestartServer(params TestCLIParams) { c.TestServer = s.(*server.TestServer) log.Infof(context.Background(), "restarted server at %s / %s", c.ServingRPCAddr(), c.ServingSQLAddr()) + if params.Multitenant { + tenantArgs := base.TestTenantArgs{TenantID: serverutils.TestTenantID()} + if c.Insecure() { + tenantArgs.ForceInsecure = true + } + c.tenant, _ = serverutils.StartTenant(c.t, c.TestServer, tenantArgs) + log.Infof(context.Background(), "restarted tenant SQL only server at %s", c.tenant.SQLAddr()) + } } // Cleanup cleans up after the test, stopping the server if necessary. @@ -306,6 +328,20 @@ func isSQLCommand(args []string) (bool, error) { return false, nil } +func (c TestCLI) getRPCAddr() string { + if c.tenant != nil { + return c.tenant.RPCAddr() + } + return c.ServingRPCAddr() +} + +func (c TestCLI) getSQLAddr() string { + if c.tenant != nil { + return c.tenant.SQLAddr() + } + return c.ServingSQLAddr() +} + // RunWithArgs add args according to TestCLI cfg. func (c TestCLI) RunWithArgs(origArgs []string) { TestingReset() @@ -313,11 +349,11 @@ func (c TestCLI) RunWithArgs(origArgs []string) { if err := func() error { args := append([]string(nil), origArgs[:1]...) 
if c.TestServer != nil { - addr := c.ServingRPCAddr() + addr := c.getRPCAddr() if isSQL, err := isSQLCommand(origArgs); err != nil { return err } else if isSQL { - addr = c.ServingSQLAddr() + addr = c.getSQLAddr() } h, p, err := net.SplitHostPort(addr) if err != nil { @@ -331,6 +367,7 @@ func (c TestCLI) RunWithArgs(origArgs []string) { args = append(args, fmt.Sprintf("--certs-dir=%s", c.certsDir)) } } + args = append(args, origArgs[1:]...) // `nodelocal upload` and `userfile upload -r` CLI tests create unique temp diff --git a/pkg/cli/zip.go b/pkg/cli/zip.go index 6bd8713b6c14..4dc08d1fc1dd 100644 --- a/pkg/cli/zip.go +++ b/pkg/cli/zip.go @@ -25,7 +25,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/server/heapprofiler" "github.com/cockroachdb/cockroach/pkg/server/serverpb" - "github.com/cockroachdb/cockroach/pkg/server/status/statuspb" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/util/contextutil" "github.com/cockroachdb/errors" @@ -95,8 +94,8 @@ func (zc *debugZipContext) runZipRequest(ctx context.Context, zr *zipReporter, r // forAllNodes runs fn on every node, possibly concurrently. func (zc *debugZipContext) forAllNodes( ctx context.Context, - nodeList []statuspb.NodeStatus, - fn func(ctx context.Context, node statuspb.NodeStatus) error, + nodeList []serverpb.NodeDetails, + fn func(ctx context.Context, node serverpb.NodeDetails) error, ) error { if zipCtx.concurrency == 1 { // Sequential case. Simplify. @@ -116,7 +115,7 @@ func (zc *debugZipContext) forAllNodes( var wg sync.WaitGroup for _, node := range nodeList { wg.Add(1) - go func(node statuspb.NodeStatus) { + go func(node serverpb.NodeDetails) { defer wg.Done() if err := zc.sem.Acquire(ctx, 1); err != nil { nodeErrs <- err @@ -139,7 +138,7 @@ func (zc *debugZipContext) forAllNodes( type nodeLivenesses = map[roachpb.NodeID]livenesspb.NodeLivenessStatus -func runDebugZip(cmd *cobra.Command, args []string) (retErr error) { +func runDebugZip(_ *cobra.Command, args []string) (retErr error) { if err := zipCtx.files.validate(); err != nil { return err } @@ -229,6 +228,9 @@ func runDebugZip(cmd *cobra.Command, args []string) (retErr error) { } // Fetch the cluster-wide details. + // For a SQL only server, the nodeList will be a list of SQL nodes + // and livenessByNodeID is null. For a KV server, the nodeList will + // be a list of KV nodes along with the corresponding node liveness data. nodeList, livenessByNodeID, err := zc.collectClusterData(ctx, firstNodeDetails) if err != nil { return err @@ -241,7 +243,7 @@ func runDebugZip(cmd *cobra.Command, args []string) (retErr error) { } // Collect the per-node data. 
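Aside: the signature change below is mechanical (statuspb.NodeStatus becomes serverpb.NodeDetails); forAllNodes keeps its two execution modes, sequential when --concurrency=1 and bounded fan-out otherwise. For readers unfamiliar with the pattern, here is a minimal, self-contained sketch of that bounded fan-out. The names (runPerNode, nodeFn) are illustrative, not from this patch, and a buffered channel stands in for the semaphore the real code acquires via zc.sem.

```go
package main

import (
	"context"
	"fmt"
	"sync"
)

type nodeFn func(ctx context.Context, nodeID int32) error

// runPerNode applies fn to every node, limiting concurrency with a
// buffered channel used as a counting semaphore. Errors are collected
// on a channel so that one failing node does not stop the others.
func runPerNode(ctx context.Context, nodeIDs []int32, concurrency int, fn nodeFn) error {
	if concurrency == 1 {
		// Sequential case. Simplify, as the zip code does.
		for _, id := range nodeIDs {
			if err := fn(ctx, id); err != nil {
				return err
			}
		}
		return nil
	}
	sem := make(chan struct{}, concurrency)
	errs := make(chan error, len(nodeIDs))
	var wg sync.WaitGroup
	for _, id := range nodeIDs {
		wg.Add(1)
		go func(id int32) {
			defer wg.Done()
			sem <- struct{}{}        // acquire
			defer func() { <-sem }() // release
			errs <- fn(ctx, id)
		}(id)
	}
	wg.Wait()
	close(errs)
	for err := range errs {
		if err != nil {
			return err // report the first collected error
		}
	}
	return nil
}

func main() {
	err := runPerNode(context.Background(), []int32{1, 2, 3}, 2,
		func(_ context.Context, id int32) error {
			fmt.Println("collecting node", id)
			return nil
		})
	fmt.Println("err:", err)
}
```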
- if err := zc.forAllNodes(ctx, nodeList, func(ctx context.Context, node statuspb.NodeStatus) error { + if err := zc.forAllNodes(ctx, nodeList, func(ctx context.Context, node serverpb.NodeDetails) error { return zc.collectPerNodeData(ctx, node, livenessByNodeID) }); err != nil { return err diff --git a/pkg/cli/zip_cluster_wide.go b/pkg/cli/zip_cluster_wide.go index 0900f161a0d4..527019d4f766 100644 --- a/pkg/cli/zip_cluster_wide.go +++ b/pkg/cli/zip_cluster_wide.go @@ -17,8 +17,9 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/server/serverpb" - "github.com/cockroachdb/cockroach/pkg/server/status/statuspb" "github.com/cockroachdb/errors" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" ) const ( @@ -111,11 +112,30 @@ var debugZipTablesPerCluster = []string{ "crdb_internal.table_indexes", } +// getNodesList constructs a NodesListResponse using the Nodes API. We need this while building +// the nodes list for older servers that don't support the new NodesList API. +func (zc *debugZipContext) getNodesList(ctx context.Context) (*serverpb.NodesListResponse, error) { + nodes, err := zc.status.Nodes(ctx, &serverpb.NodesRequest{}) + if err != nil { + return nil, err + } + nodesList := &serverpb.NodesListResponse{} + for _, node := range nodes.Nodes { + nodeDetails := serverpb.NodeDetails{ + NodeID: int32(node.Desc.NodeID), + Address: node.Desc.Address, + SQLAddress: node.Desc.SQLAddress, + } + nodesList.Nodes = append(nodesList.Nodes, nodeDetails) + } + return nodesList, nil +} + // collectClusterData runs the data collection that only needs to // occur once for the entire cluster. func (zc *debugZipContext) collectClusterData( ctx context.Context, firstNodeDetails *serverpb.DetailsResponse, -) (nodeList []statuspb.NodeStatus, livenessByNodeID nodeLivenesses, err error) { +) (nodeList []serverpb.NodeDetails, livenessByNodeID nodeLivenesses, err error) { clusterWideZipRequests := makeClusterWideZipRequests(zc.admin, zc.status) for _, r := range clusterWideZipRequests { @@ -135,10 +155,15 @@ func (zc *debugZipContext) collectClusterData( } { - var nodes *serverpb.NodesResponse + var nodes *serverpb.NodesListResponse s := zc.clusterPrinter.start("requesting nodes") err := zc.runZipFn(ctx, s, func(ctx context.Context) error { - nodes, err = zc.status.Nodes(ctx, &serverpb.NodesRequest{}) + nodes, err = zc.status.NodesList(ctx, &serverpb.NodesListRequest{}) + if code := status.Code(errors.Cause(err)); code == codes.Unimplemented { + // Fallback to the old Nodes API; this could occur while connecting to + // an older node which does not have the NodesList API implemented. + nodes, err = zc.getNodesList(ctx) + } return err }) if cErr := zc.z.createJSONOrError(s, debugBase+"/nodes.json", nodes, err); cErr != nil { @@ -149,11 +174,12 @@ func (zc *debugZipContext) collectClusterData( // still want to inspect the per-node endpoints on the head // node. As per the above, we were able to connect at least to // that. - nodeList = []statuspb.NodeStatus{{Desc: roachpb.NodeDescriptor{ - NodeID: firstNodeDetails.NodeID, + nodeList = []serverpb.NodeDetails{{ + NodeID: int32(firstNodeDetails.NodeID), Address: firstNodeDetails.Address, SQLAddress: firstNodeDetails.SQLAddress, - }}} + }} + if nodes != nil { // If the nodes were found, use that instead. 
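Aside: the codes.Unimplemented check above is the usual gRPC idiom for talking to servers that predate a new RPC: try the new method first and, if the server does not know it, synthesize the same response shape from the older API (here, NodesList falling back to Nodes). Below is a minimal sketch of the idiom with hypothetical stand-ins tryNewAPI and tryOldAPI; note that the real code first unwraps the error with errors.Cause before inspecting its code.

```go
package main

import (
	"context"
	"fmt"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// tryNewAPI pretends to call a server that predates the new RPC.
func tryNewAPI(context.Context) ([]string, error) {
	return nil, status.Error(codes.Unimplemented, "unknown method NodesList")
}

// tryOldAPI is the older RPC that every supported server implements.
func tryOldAPI(context.Context) ([]string, error) {
	return []string{"n1", "n2"}, nil
}

func nodesWithFallback(ctx context.Context) ([]string, error) {
	nodes, err := tryNewAPI(ctx)
	if status.Code(err) == codes.Unimplemented {
		// Older server: rebuild the response from the old API instead.
		nodes, err = tryOldAPI(ctx)
	}
	return nodes, err
}

func main() {
	nodes, err := nodesWithFallback(context.Background())
	fmt.Println(nodes, err)
}
```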
nodeList = nodes.Nodes @@ -174,6 +200,5 @@ func (zc *debugZipContext) collectClusterData( livenessByNodeID = lresponse.Statuses } } - return nodeList, livenessByNodeID, nil } diff --git a/pkg/cli/zip_per_node.go b/pkg/cli/zip_per_node.go index daa8b5475d64..2ae714d32560 100644 --- a/pkg/cli/zip_per_node.go +++ b/pkg/cli/zip_per_node.go @@ -21,8 +21,8 @@ import ( "github.com/cockroachdb/cockroach/pkg/cli/clisqlclient" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb" + "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/server/serverpb" - "github.com/cockroachdb/cockroach/pkg/server/status/statuspb" "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/contextutil" "github.com/cockroachdb/cockroach/pkg/util/log" @@ -30,11 +30,9 @@ import ( "github.com/cockroachdb/errors" ) -// makePreNodeZipRequests defines the zipRequests (API requests) that are to be +// makePerNodeZipRequests defines the zipRequests (API requests) that are to be // performed once per node. -func makePerNodeZipRequests( - prefix, id string, admin serverpb.AdminClient, status serverpb.StatusClient, -) []zipRequest { +func makePerNodeZipRequests(prefix, id string, status serverpb.StatusClient) []zipRequest { return []zipRequest{ { fn: func(ctx context.Context) (interface{}, error) { @@ -93,8 +91,9 @@ var debugZipTablesPerNode = []string{ // // This is called first and in isolation, before other zip operations // possibly influence the nodes. +// TODO(rima): Collect profiles for tenant SQL nodes. func (zc *debugZipContext) collectCPUProfiles( - ctx context.Context, nodeList []statuspb.NodeStatus, livenessByNodeID nodeLivenesses, + ctx context.Context, nodeList []serverpb.NodeDetails, livenessByNodeID nodeLivenesses, ) error { if zipCtx.cpuProfDuration <= 0 { // Nothing to do; return early. @@ -112,7 +111,8 @@ func (zc *debugZipContext) collectCPUProfiles( // NB: this takes care not to produce non-deterministic log output. 
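The NB comment above is worth unpacking: profiles are fetched concurrently, but each goroutine writes into its own slot of a pre-sized slice, and nothing is logged until every fetch has finished, so the output order is stable across runs. A standalone sketch of that trick, with all names illustrative:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

type result struct {
	data string
	err  error
}

func main() {
	nodes := []int{1, 2, 3}
	resps := make([]result, len(nodes)) // one slot per node, in node order

	var wg sync.WaitGroup
	for i := range nodes {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			// Simulate fetches finishing in a nondeterministic order.
			time.Sleep(time.Duration(len(nodes)-i) * time.Millisecond)
			resps[i] = result{data: fmt.Sprintf("profile for node %d", nodes[i])}
		}(i)
	}
	wg.Wait()

	// Iterate in node order, not completion order: deterministic output.
	for _, r := range resps {
		fmt.Println(r.data)
	}
}
```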
resps := make([]profData, len(nodeList)) for i := range nodeList { - if livenessByNodeID[nodeList[i].Desc.NodeID] == livenesspb.NodeLivenessStatus_DECOMMISSIONED { + nodeID := roachpb.NodeID(nodeList[i].NodeID) + if livenessByNodeID[nodeID] == livenesspb.NodeLivenessStatus_DECOMMISSIONED { continue } wg.Add(1) @@ -127,7 +127,7 @@ func (zc *debugZipContext) collectCPUProfiles( var pd profData err := contextutil.RunWithTimeout(ctx, "fetch cpu profile", zc.timeout+zipCtx.cpuProfDuration, func(ctx context.Context) error { resp, err := zc.status.Profile(ctx, &serverpb.ProfileRequest{ - NodeId: fmt.Sprintf("%d", nodeList[i].Desc.NodeID), + NodeId: fmt.Sprintf("%d", nodeID), Type: serverpb.ProfileRequest_CPU, Seconds: secs, Labels: true, @@ -153,7 +153,7 @@ func (zc *debugZipContext) collectCPUProfiles( if len(pd.data) == 0 && pd.err == nil { continue // skipped node } - nodeID := nodeList[i].Desc.NodeID + nodeID := nodeList[i].NodeID prefix := fmt.Sprintf("%s/%s", nodesPrefix, fmt.Sprintf("%d", nodeID)) s := zc.clusterPrinter.start("profile for node %d", nodeID) if err := zc.z.createRawOrError(s, prefix+"/cpu.pprof", pd.data, pd.err); err != nil { @@ -164,25 +164,26 @@ func (zc *debugZipContext) collectCPUProfiles( } func (zc *debugZipContext) collectPerNodeData( - ctx context.Context, node statuspb.NodeStatus, livenessByNodeID nodeLivenesses, + ctx context.Context, node serverpb.NodeDetails, livenessByNodeID nodeLivenesses, ) error { - nodeID := node.Desc.NodeID - - liveness := livenessByNodeID[nodeID] - if liveness == livenesspb.NodeLivenessStatus_DECOMMISSIONED { - // Decommissioned + process terminated. Let's not waste time - // on this node. - // - // NB: we still inspect DECOMMISSIONING nodes (marked as - // decommissioned but the process is still alive) to get a - // chance to collect their log files. - // - // NB: we still inspect DEAD nodes because even though they - // don't heartbeat their liveness record their process might - // still be up and willing to deliver some log files. - return nil + nodeID := roachpb.NodeID(node.NodeID) + + if livenessByNodeID != nil { + liveness := livenessByNodeID[nodeID] + if liveness == livenesspb.NodeLivenessStatus_DECOMMISSIONED { + // Decommissioned + process terminated. Let's not waste time + // on this node. + // + // NB: we still inspect DECOMMISSIONING nodes (marked as + // decommissioned but the process is still alive) to get a + // chance to collect their log files. + // + // NB: we still inspect DEAD nodes because even though they + // don't heartbeat their liveness record their process might + // still be up and willing to deliver some log files. + return nil + } } - nodePrinter := zipCtx.newZipReporter("node %d", nodeID) id := fmt.Sprintf("%d", nodeID) prefix := fmt.Sprintf("%s/%s", nodesPrefix, id) @@ -206,7 +207,10 @@ func (zc *debugZipContext) collectPerNodeData( // not work and if it doesn't, we let the invalid curSQLConn get // used anyway so that anything that does *not* need it will // still happen. 
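The comment above describes the zip command's general best-effort posture, which is also what produces the long testzip_tenant transcript earlier in this patch: a failed request becomes a .err.txt entry in the archive and collection simply moves on. A compact sketch of the shape of that pattern; writeFile and collect are illustrative, not the real archive writer:

```go
package main

import "fmt"

// writeFile stands in for adding an entry to the zip archive.
func writeFile(name, contents string) {
	fmt.Printf("wrote %s (%d bytes)\n", name, len(contents))
}

// collect fetches one item and records failures next to where the
// data would have gone, instead of aborting the whole debug zip.
func collect(name string, fetch func() (string, error)) {
	data, err := fetch()
	if err != nil {
		writeFile(name+".err.txt", err.Error())
		return
	}
	writeFile(name, data)
}

func main() {
	collect("debug/settings.json", func() (string, error) {
		return "", fmt.Errorf("rpc error: unimplemented")
	})
	collect("debug/crdb_internal.zones.txt", func() (string, error) {
		return "zone configs...", nil
	})
}
```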
- sqlAddr := node.Desc.CheckedSQLAddress() + sqlAddr := node.SQLAddress + if sqlAddr.IsEmpty() { + sqlAddr = node.Address + } curSQLConn := guessNodeURL(zc.firstNodeSQLConn.GetURL(), sqlAddr.AddressField) nodePrinter.info("using SQL connection URL: %s", curSQLConn.GetURL()) @@ -220,7 +224,7 @@ func (zc *debugZipContext) collectPerNodeData( } } - perNodeZipRequests := makePerNodeZipRequests(prefix, id, zc.admin, zc.status) + perNodeZipRequests := makePerNodeZipRequests(prefix, id, zc.status) for _, r := range perNodeZipRequests { if err := zc.runZipRequest(ctx, nodePrinter, r); err != nil { diff --git a/pkg/cli/zip_tenant_test.go b/pkg/cli/zip_tenant_test.go new file mode 100644 index 000000000000..ebfea6a61bd2 --- /dev/null +++ b/pkg/cli/zip_tenant_test.go @@ -0,0 +1,63 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package cli + +import ( + "os" + "testing" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/ccl/kvccl/kvtenantccl" + "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/testutils/skip" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/datadriven" +) + +// Dummy import to pull in kvtenantccl. This allows us to start tenants. +// We need ccl functionality in order to test debug zip for tenant servers. +var _ = kvtenantccl.Connector{} + +// TestTenantZip tests the operation of zip for a tenant server. +func TestTenantZip(t *testing.T) { + defer leaktest.AfterTest(t)() + + skip.UnderRace(t, "test too slow under race") + + dir, cleanupFn := testutils.TempDir(t) + defer cleanupFn() + + c := NewCLITest(TestCLIParams{ + StoreSpecs: []base.StoreSpec{{ + Path: dir, + }}, + Multitenant: true, + Insecure: true, + }) + defer c.Cleanup() + + out, err := c.RunWithCapture("debug zip --concurrency=1 " + os.DevNull) + if err != nil { + t.Fatal(err) + } + + // Strip any non-deterministic messages. + out = eraseNonDeterministicZipOutput(out) + + // We use datadriven simply to read the golden output file; we don't actually + // run any commands. Using datadriven allows TESTFLAGS=-rewrite. 
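The eraseNonDeterministicZipOutput call above (defined in zip_test.go, not shown in this diff) is what makes the golden-file comparison possible: values that vary between runs, such as ephemeral addresses and exact RPC error text, are rewritten to "..." before comparing. A hypothetical analogue of that scrubbing, for illustration only; the real test's regexps may differ:

```go
package main

import (
	"fmt"
	"regexp"
)

var (
	// Assumed patterns, chosen to match the transcript above.
	addrRE   = regexp.MustCompile(`(RPC connection to ).*`)
	rpcErrRE = regexp.MustCompile(`(rpc error: ).*`)
)

// scrub replaces run-dependent values with "..." so the output can be
// compared against a stable golden file.
func scrub(out string) string {
	out = addrRE.ReplaceAllString(out, "${1}...")
	out = rpcErrRE.ReplaceAllString(out, "${1}...")
	return out
}

func main() {
	raw := "[cluster] establishing RPC connection to 127.0.0.1:53412\n" +
		"requesting data: last request failed: rpc error: code = Unavailable"
	fmt.Println(scrub(raw))
}
```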
+	datadriven.RunTest(t,
+		testutils.TestDataPath(t, "zip", "testzip_tenant"),
+		func(t *testing.T, td *datadriven.TestData) string {
+			return out
+		},
+	)
+}
diff --git a/pkg/cli/zip_test.go b/pkg/cli/zip_test.go
index b2db4f229663..a8b635519cba 100644
--- a/pkg/cli/zip_test.go
+++ b/pkg/cli/zip_test.go
@@ -76,7 +76,6 @@ table_name NOT IN (
 	'databases',
 	'forward_dependencies',
 	'index_columns',
-	'interleaved',
 	'lost_descriptors_with_data',
 	'table_columns',
 	'table_row_statistics',
diff --git a/pkg/cmd/dev/cache.go b/pkg/cmd/dev/cache.go
index d585c0dc819c..9a9e590706ce 100644
--- a/pkg/cmd/dev/cache.go
+++ b/pkg/cmd/dev/cache.go
@@ -256,7 +256,7 @@ func (d *dev) cleanCache(ctx context.Context) error {
 	if err != nil {
 		return err
 	}
-	return os.RemoveAll(dir)
+	return os.RemoveAll(filepath.Join(dir, "cache"))
 }
 
 func (d *dev) getCacheBazelrcLine(ctx context.Context) (string, error) {
diff --git a/pkg/server/serverpb/status.proto b/pkg/server/serverpb/status.proto
index d2f3cf665f35..68ebbbe70ba0 100644
--- a/pkg/server/serverpb/status.proto
+++ b/pkg/server/serverpb/status.proto
@@ -308,6 +308,40 @@ message RegionsResponse {
   map<string, Region> regions = 1;
 }
 
+// NodesListRequest requests a list of all nodes.
+// The nodes are KV nodes for a single-tenant cluster and for
+// the host cluster of a multi-tenant cluster.
+// The nodes are SQL instances for the tenants of a multi-tenant
+// cluster.
+message NodesListRequest {}
+
+message NodeDetails {
+  // node_id is a unique identifier for the node. This corresponds
+  // to the SQL instance ID for a tenant server and the KV node ID
+  // for a KV server.
+  int32 node_id = 1 [(gogoproto.customname) = "NodeID"];
+  // address is the RPC address for a KV node. This will be set to
+  // null for a tenant server node.
+  util.UnresolvedAddr address = 2 [(gogoproto.nullable) = false];
+  // sql_address is the SQL address for a node.
+  util.UnresolvedAddr sql_address = 3 [(gogoproto.nullable) = false,
+    (gogoproto.customname) = "SQLAddress"];
+}
+
+// NodesListResponse contains a list of all nodes with their addresses.
+// The nodes are KV nodes for a single-tenant cluster and for
+// the host cluster of a multi-tenant cluster.
+// The nodes are SQL instances for the tenants of a multi-tenant
+// cluster.
+message NodesListResponse {
+  // nodes contains a list of NodeDetails. Each node in the list is
+  // a SQL node for a tenant server and a KV node for a KV server.
+  repeated NodeDetails nodes = 1 [(gogoproto.nullable) = false];
+}
+
 message NodeRequest {
   // node_id is a string so that "local" can be used to specify that no
   // forwarding is necessary.
@@ -1436,6 +1470,9 @@ service Status {
   // RegionsRequest retrieves all available regions.
   rpc Regions(RegionsRequest) returns (RegionsResponse) {}
 
+  // NodesList returns all available nodes with their addresses.
+  rpc NodesList(NodesListRequest) returns (NodesListResponse) {}
+
   // Nodes returns status info for all commissioned nodes. Decommissioned nodes
   // are not included, except in rare cases where the node doing the
   // decommissioning crashed before completing the operation. In these cases,
diff --git a/pkg/server/status.go b/pkg/server/status.go
index 381179c00ec3..5d409ff4f0f7 100644
--- a/pkg/server/status.go
+++ b/pkg/server/status.go
@@ -1346,6 +1346,32 @@ func regionsResponseFromNodesResponse(nr *serverpb.NodesResponse) *serverpb.Regi
 	return ret
 }
 
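Aside: with the rpc registered in the Status service above, any client holding a connection to a node can call it through the generated StatusClient. A sketch, assuming an already-dialed *grpc.ClientConn; a real CockroachDB connection additionally needs TLS certificates and authentication, which are elided here, so this will not work against a secure cluster as written.

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
	"google.golang.org/grpc"
)

func listNodes(ctx context.Context, conn *grpc.ClientConn) error {
	client := serverpb.NewStatusClient(conn)
	resp, err := client.NodesList(ctx, &serverpb.NodesListRequest{})
	if err != nil {
		return err
	}
	for _, n := range resp.Nodes {
		fmt.Printf("node %d: rpc=%s sql=%s\n", n.NodeID, n.Address, n.SQLAddress)
	}
	return nil
}

func main() {
	// Insecure dial for illustration only.
	conn, err := grpc.Dial("localhost:26257", grpc.WithInsecure())
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()
	if err := listNodes(context.Background(), conn); err != nil {
		log.Fatal(err)
	}
}
```

+// NodesList returns a list of nodes with their corresponding addresses.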
+func (s *statusServer) NodesList( + ctx context.Context, _ *serverpb.NodesListRequest, +) (*serverpb.NodesListResponse, error) { + // The node status contains details about the command line, network + // addresses, env vars etc which are admin-only. + if _, err := s.privilegeChecker.requireAdminUser(ctx); err != nil { + return nil, err + } + ctx = propagateGatewayMetadata(ctx) + ctx = s.AnnotateCtx(ctx) + statuses, _, err := s.getNodeStatuses(ctx, 0 /* limit */, 0 /* offset */) + if err != nil { + return nil, err + } + resp := &serverpb.NodesListResponse{ + Nodes: make([]serverpb.NodeDetails, len(statuses)), + } + for i, status := range statuses { + resp.Nodes[i].NodeID = int32(status.Desc.NodeID) + resp.Nodes[i].Address = status.Desc.Address + resp.Nodes[i].SQLAddress = status.Desc.SQLAddress + } + return resp, nil +} + // Nodes returns all node statuses. // // Do not use this method inside the server code! Use @@ -1503,12 +1529,9 @@ func (s *statusServer) ListNodesInternal( return resp, err } -func (s *statusServer) nodesHelper( +func (s *statusServer) getNodeStatuses( ctx context.Context, limit, offset int, -) (*serverpb.NodesResponse, int, error) { - ctx = propagateGatewayMetadata(ctx) - ctx = s.AnnotateCtx(ctx) - +) (statuses []statuspb.NodeStatus, next int, _ error) { startKey := keys.StatusNodePrefix endKey := startKey.PrefixEnd() @@ -1519,7 +1542,6 @@ func (s *statusServer) nodesHelper( return nil, 0, status.Errorf(codes.Internal, err.Error()) } - var next int var rows []kv.KeyValue if len(b.Results[0].Rows) > 0 { var rowsInterface interface{} @@ -1527,15 +1549,29 @@ func (s *statusServer) nodesHelper( rows = rowsInterface.([]kv.KeyValue) } - resp := serverpb.NodesResponse{ - Nodes: make([]statuspb.NodeStatus, len(rows)), - } + statuses = make([]statuspb.NodeStatus, len(rows)) for i, row := range rows { - if err := row.ValueProto(&resp.Nodes[i]); err != nil { + if err := row.ValueProto(&statuses[i]); err != nil { log.Errorf(ctx, "%v", err) return nil, 0, status.Errorf(codes.Internal, err.Error()) } } + return statuses, next, nil +} + +func (s *statusServer) nodesHelper( + ctx context.Context, limit, offset int, +) (*serverpb.NodesResponse, int, error) { + ctx = propagateGatewayMetadata(ctx) + ctx = s.AnnotateCtx(ctx) + + statuses, next, err := s.getNodeStatuses(ctx, limit, offset) + if err != nil { + return nil, 0, err + } + resp := serverpb.NodesResponse{ + Nodes: statuses, + } clock := s.admin.server.clock resp.LivenessByNodeID = getLivenessStatusMap(s.nodeLiveness, clock.Now().GoTime(), s.st) diff --git a/pkg/server/tenant_status.go b/pkg/server/tenant_status.go index b12043d82961..fa29df30bd0e 100644 --- a/pkg/server/tenant_status.go +++ b/pkg/server/tenant_status.go @@ -23,6 +23,7 @@ import ( "strings" "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/build" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/rpc" "github.com/cockroachdb/cockroach/pkg/security" @@ -33,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/contention" "github.com/cockroachdb/cockroach/pkg/sql/flowinfra" "github.com/cockroachdb/cockroach/pkg/sql/sqlinstance" + "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/contextutil" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/quotapool" @@ -886,3 +888,75 @@ func (t *tenantStatusServer) TableIndexStats( return getTableIndexUsageStats(ctx, req, t.sqlServer.pgServer.SQLServer.GetLocalIndexStatistics(), 
 		t.sqlServer.internalExecutor)
 }
+
+// Details returns information for a given instance ID such as
+// the instance address and build info.
+func (t *tenantStatusServer) Details(
+	ctx context.Context, req *serverpb.DetailsRequest,
+) (*serverpb.DetailsResponse, error) {
+	ctx = propagateGatewayMetadata(ctx)
+	ctx = t.AnnotateCtx(ctx)
+
+	if err := t.privilegeChecker.requireViewActivityOrViewActivityRedactedPermission(ctx); err != nil {
+		return nil, err
+	}
+
+	if t.sqlServer.SQLInstanceID() == 0 {
+		return nil, status.Errorf(codes.Unavailable, "instanceID not set")
+	}
+
+	instanceID, local, err := t.parseInstanceID(req.NodeId)
+	if err != nil {
+		return nil, status.Errorf(codes.InvalidArgument, err.Error())
+	}
+	if !local {
+		instance, err := t.sqlServer.sqlInstanceProvider.GetInstance(ctx, instanceID)
+		if err != nil {
+			return nil, err
+		}
+		status, err := t.dialPod(ctx, instanceID, instance.InstanceAddr)
+		if err != nil {
+			return nil, err
+		}
+		return status.Details(ctx, req)
+	}
+	localInstance, err := t.sqlServer.sqlInstanceProvider.GetInstance(ctx, t.sqlServer.SQLInstanceID())
+	if err != nil {
+		return nil, status.Errorf(codes.Unavailable, "local instance unavailable")
+	}
+	resp := &serverpb.DetailsResponse{
+		NodeID:     roachpb.NodeID(instanceID),
+		BuildInfo:  build.GetInfo(),
+		SQLAddress: util.MakeUnresolvedAddr("tcp", localInstance.InstanceAddr),
+	}
+
+	return resp, nil
+}
+
+func (t *tenantStatusServer) NodesList(
+	ctx context.Context, req *serverpb.NodesListRequest,
+) (*serverpb.NodesListResponse, error) {
+	ctx = propagateGatewayMetadata(ctx)
+	ctx = t.AnnotateCtx(ctx)
+
+	// The node list contains details about the network addresses which are admin-only.
+	if _, err := t.privilegeChecker.requireAdminUser(ctx); err != nil {
+		return nil, err
+	}
+	instances, err := t.sqlServer.sqlInstanceProvider.GetAllInstances(ctx)
+	if err != nil {
+		return nil, err
+	}
+	var resp serverpb.NodesListResponse
+	for _, instance := range instances {
+		// For SQL-only servers, the (RPC) Address and the SQL address are the same.
+		// TODO(#76175): We should split the instance address into SQL and RPC addresses.
+		nodeDetails := serverpb.NodeDetails{
+			NodeID:     int32(instance.InstanceID),
+			Address:    util.MakeUnresolvedAddr("tcp", instance.InstanceAddr),
+			SQLAddress: util.MakeUnresolvedAddr("tcp", instance.InstanceAddr),
+		}
+		resp.Nodes = append(resp.Nodes, nodeDetails)
+	}
+	return &resp, err
+}
diff --git a/pkg/server/testserver.go b/pkg/server/testserver.go
index 3bb134e9c1ff..17116c14196b 100644
--- a/pkg/server/testserver.go
+++ b/pkg/server/testserver.go
@@ -527,6 +527,17 @@ func (t *TestTenant) HTTPAddr() string {
 	return t.httpAddr
 }
 
+// RPCAddr is part of the TestTenantInterface interface.
+func (t *TestTenant) RPCAddr() string {
+	// The RPC and SQL functionality for tenants is multiplexed
+	// on the same address. Having separate accessors for the two
+	// addresses makes it clearer which use case an address is
+	// being fetched for. This also provides parity between
+	// SQL-only servers and regular servers.
+	return t.sqlAddr
+}
+
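Aside: the Details handler in tenant_status.go above follows the same local-vs-remote dispatch shape used throughout these status servers: parse the target instance ID, answer locally if it is ours, otherwise dial the owning pod and proxy the identical request. A toy sketch of that shape; all types and names are illustrative, and dialPod and parseInstanceID are only mimicked:

```go
package main

import (
	"context"
	"fmt"
)

type detailsReq struct{ targetID int }
type detailsResp struct{ addr string }

type handler func(context.Context, detailsReq) (detailsResp, error)

type server struct {
	localID   int
	localAddr string
	dial      func(id int) handler // stand-in for dialPod
}

func (s *server) details(ctx context.Context, req detailsReq) (detailsResp, error) {
	if req.targetID != s.localID {
		// Proxy the same request to the instance that owns the answer.
		return s.dial(req.targetID)(ctx, req)
	}
	return detailsResp{addr: s.localAddr}, nil
}

func main() {
	s := &server{
		localID:   1,
		localAddr: "10.0.0.1:26257",
		dial: func(id int) handler {
			return func(context.Context, detailsReq) (detailsResp, error) {
				return detailsResp{addr: fmt.Sprintf("10.0.0.%d:26257", id)}, nil
			}
		},
	}
	resp, _ := s.details(context.Background(), detailsReq{targetID: 2})
	fmt.Println(resp.addr)
}
```

 // PGServer is part of TestTenantInterface.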
 func (t *TestTenant) PGServer() interface{} {
 	return t.pgServer
diff --git a/pkg/testutils/serverutils/test_tenant_shim.go b/pkg/testutils/serverutils/test_tenant_shim.go
index 7eb2092b98b7..67df41fa7d81 100644
--- a/pkg/testutils/serverutils/test_tenant_shim.go
+++ b/pkg/testutils/serverutils/test_tenant_shim.go
@@ -40,6 +40,9 @@ type TestTenantInterface interface {
 	// HTTPAddr returns the tenant's http address.
 	HTTPAddr() string
 
+	// RPCAddr returns the tenant's RPC address.
+	RPCAddr() string
+
 	// PGServer returns the tenant's *pgwire.Server as an interface{}.
 	PGServer() interface{}
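Closing note on the new TestTenantInterface accessor: a test can now treat tenants and regular servers uniformly when it needs an RPC endpoint. Below is a sketch of such a test, mirroring the tenant startup in testutils.go above. Assumptions: serverutils.StartTenant fatals on error and returns the tenant plus a SQL connection, and, as with zip_tenant_test.go, the test binary must link kvtenantccl for tenant startup to work.

```go
package cli_test // illustrative package name

import (
	"context"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/base"
	_ "github.com/cockroachdb/cockroach/pkg/ccl/kvccl/kvtenantccl" // enables StartTenant
	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
)

func TestTenantRPCAddr(t *testing.T) {
	ts, _, _ := serverutils.StartServer(t, base.TestServerArgs{})
	defer ts.Stopper().Stop(context.Background())

	tenant, _ := serverutils.StartTenant(t, ts, base.TestTenantArgs{
		TenantID: serverutils.TestTenantID(),
	})
	// Per TestTenant.RPCAddr above, RPC and SQL are multiplexed on one
	// listener for SQL-only servers, so the two addresses should match.
	if tenant.RPCAddr() != tenant.SQLAddr() {
		t.Fatalf("expected multiplexed address, got rpc=%s sql=%s",
			tenant.RPCAddr(), tenant.SQLAddr())
	}
}
```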