Skip to content

Commit

Permalink
cli: Set up basic debug zip for tenants.
Browse files Browse the repository at this point in the history
This PR adds basic functionality for tenants
by adding a flag to the debug.zip command to connect
to a tenant server. It also adds code to dump system and
crdb_internal tables to a zip file pulled from a tenant.

This PR adds a new API to StatusClient which
returns a list of (KV/SQL) nodes for a cluster with their
corresponding addresses.

This PR does not implement the functionality to dump
goroutines, profile information and logs. That will
be added in subsequent PRs.

Release note (cli change): This PR extends the debug
zip CLI command to support exporting system and
crdb_internal tables to a zip folder for tenants.

Release note (api change): StatusClient interface has been
extended with a new request called NodesListRequest. This
request returns a list of KV nodes for KV servers and
SQL nodes for SQL only servers with their corresponding
SQL and RPC addresses.
  • Loading branch information
rimadeodhar committed Feb 7, 2022
1 parent 3651e3c commit a945ddb
Show file tree
Hide file tree
Showing 15 changed files with 538 additions and 56 deletions.
65 changes: 65 additions & 0 deletions docs/generated/http/full.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,71 @@ RegionsResponse describes the available regions.



## NodesList



NodesList returns all available nodes with their addresses.

Support status: [reserved](#support-status)

#### Request Parameters




NodesListRequest requests a list of all nodes.
The nodes are KV nodes when the cluster is a single
tenant cluster or the host cluster in case of multi-tenant
clusters.
The nodes are SQL instances when the request is served by a
tenant (SQL-only) server in a multi-tenant cluster.








#### Response Parameters




NodesListResponse contains a list of all nodes with their addresses.
The nodes are KV nodes when the cluster is a single
tenant cluster or the host cluster in case of multi-tenant
clusters.
The nodes are SQL instances when the response comes from a
tenant (SQL-only) server in a multi-tenant cluster.


| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| nodes | [NodeDetails](#cockroach.server.serverpb.NodesListResponse-cockroach.server.serverpb.NodeDetails) | repeated | nodes contains a list of NodeDetails. Each individual node within the list is a SQL node in case of a tenant server and KV nodes in case of a KV server. | [reserved](#support-status) |






<a name="cockroach.server.serverpb.NodesListResponse-cockroach.server.serverpb.NodeDetails"></a>
#### NodeDetails



| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| node_id | [int32](#cockroach.server.serverpb.NodesListResponse-int32) | | node_id is a unique identifier for the node. This corresponds to SQL instance ID for a tenant server and KV node id for a KV server. | [reserved](#support-status) |
| address | [cockroach.util.UnresolvedAddr](#cockroach.server.serverpb.NodesListResponse-cockroach.util.UnresolvedAddr) | | address is the RPC address for a KV node. This will be set to null for a tenant server node. | [reserved](#support-status) |
| sql_address | [cockroach.util.UnresolvedAddr](#cockroach.server.serverpb.NodesListResponse-cockroach.util.UnresolvedAddr) | | sql_address is the SQL address for a node. | [reserved](#support-status) |






## Nodes

`GET /_status/nodes`
Expand Down
2 changes: 2 additions & 0 deletions pkg/cli/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -313,13 +313,15 @@ go_test(
"userfiletable_test.go",
"workload_test.go",
"zip_helpers_test.go",
"zip_tenant_test.go",
"zip_test.go",
],
data = glob(["testdata/**"]),
embed = [":cli"],
deps = [
"//pkg/base",
"//pkg/build",
"//pkg/ccl/kvccl/kvtenantccl",
"//pkg/cli/clicfg",
"//pkg/cli/clierror",
"//pkg/cli/clierrorplus",
Expand Down
3 changes: 2 additions & 1 deletion pkg/cli/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/security/securitytest"
"github.com/cockroachdb/cockroach/pkg/server"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
)

func init() {
Expand All @@ -29,8 +30,8 @@ func TestMain(m *testing.M) {
// CLI tests are sensitive to the server version, but test binaries don't have
// a version injected. Pretend to be a very up-to-date version.
defer build.TestingOverrideTag("v999.0.0")()

serverutils.InitTestServerFactory(server.TestServerFactory)
serverutils.InitTestClusterFactory(testcluster.TestClusterFactory)
os.Exit(m.Run())
}

Expand Down
123 changes: 123 additions & 0 deletions pkg/cli/testdata/zip/testzip_tenant
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
zip
----
debug zip --concurrency=1 /dev/null
[cluster] establishing RPC connection to ...
[cluster] retrieving the node status to get the SQL address... done
[cluster] using SQL address: ...
[cluster] creating output file /dev/null... done
[cluster] requesting data for debug/events... received response...
[cluster] requesting data for debug/events: last request failed: rpc error: ...
[cluster] requesting data for debug/events: creating error output: debug/events.json.err.txt... done
[cluster] requesting data for debug/rangelog... received response...
[cluster] requesting data for debug/rangelog: last request failed: rpc error: ...
[cluster] requesting data for debug/rangelog: creating error output: debug/rangelog.json.err.txt... done
[cluster] requesting data for debug/settings... received response...
[cluster] requesting data for debug/settings: last request failed: rpc error: ...
[cluster] requesting data for debug/settings: creating error output: debug/settings.json.err.txt... done
[cluster] requesting data for debug/reports/problemranges... received response...
[cluster] requesting data for debug/reports/problemranges: last request failed: rpc error: ...
[cluster] requesting data for debug/reports/problemranges: creating error output: debug/reports/problemranges.json.err.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_contention_events... writing output: debug/crdb_internal.cluster_contention_events.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_distsql_flows... writing output: debug/crdb_internal.cluster_distsql_flows.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_database_privileges... writing output: debug/crdb_internal.cluster_database_privileges.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_queries... writing output: debug/crdb_internal.cluster_queries.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_sessions... writing output: debug/crdb_internal.cluster_sessions.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_settings... writing output: debug/crdb_internal.cluster_settings.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_transactions... writing output: debug/crdb_internal.cluster_transactions.txt... done
[cluster] retrieving SQL data for crdb_internal.default_privileges... writing output: debug/crdb_internal.default_privileges.txt... done
[cluster] retrieving SQL data for crdb_internal.jobs... writing output: debug/crdb_internal.jobs.txt... done
[cluster] retrieving SQL data for system.jobs... writing output: debug/system.jobs.txt... done
[cluster] retrieving SQL data for system.descriptor... writing output: debug/system.descriptor.txt... done
[cluster] retrieving SQL data for system.namespace... writing output: debug/system.namespace.txt... done
[cluster] retrieving SQL data for system.scheduled_jobs... writing output: debug/system.scheduled_jobs.txt... done
[cluster] retrieving SQL data for system.settings... writing output: debug/system.settings.txt... done
[cluster] retrieving SQL data for system.replication_stats... writing output: debug/system.replication_stats.txt... done
[cluster] retrieving SQL data for system.replication_critical_localities... writing output: debug/system.replication_critical_localities.txt... done
[cluster] retrieving SQL data for system.replication_constraint_stats... writing output: debug/system.replication_constraint_stats.txt... done
[cluster] retrieving SQL data for "".crdb_internal.create_schema_statements... writing output: debug/crdb_internal.create_schema_statements.txt... done
[cluster] retrieving SQL data for "".crdb_internal.create_statements... writing output: debug/crdb_internal.create_statements.txt... done
[cluster] retrieving SQL data for "".crdb_internal.create_type_statements... writing output: debug/crdb_internal.create_type_statements.txt... done
[cluster] retrieving SQL data for crdb_internal.kv_node_liveness... writing output: debug/crdb_internal.kv_node_liveness.txt...
[cluster] retrieving SQL data for crdb_internal.kv_node_liveness: last request failed: pq: unimplemented: operation is unsupported in multi-tenancy mode
[cluster] retrieving SQL data for crdb_internal.kv_node_liveness: creating error output: debug/crdb_internal.kv_node_liveness.txt.err.txt... done
[cluster] retrieving SQL data for crdb_internal.kv_node_status... writing output: debug/crdb_internal.kv_node_status.txt...
[cluster] retrieving SQL data for crdb_internal.kv_node_status: last request failed: pq: unimplemented: operation is unsupported in multi-tenancy mode
[cluster] retrieving SQL data for crdb_internal.kv_node_status: creating error output: debug/crdb_internal.kv_node_status.txt.err.txt... done
[cluster] retrieving SQL data for crdb_internal.kv_store_status... writing output: debug/crdb_internal.kv_store_status.txt...
[cluster] retrieving SQL data for crdb_internal.kv_store_status: last request failed: pq: unimplemented: operation is unsupported in multi-tenancy mode
[cluster] retrieving SQL data for crdb_internal.kv_store_status: creating error output: debug/crdb_internal.kv_store_status.txt.err.txt... done
[cluster] retrieving SQL data for crdb_internal.regions... writing output: debug/crdb_internal.regions.txt... done
[cluster] retrieving SQL data for crdb_internal.schema_changes... writing output: debug/crdb_internal.schema_changes.txt... done
[cluster] retrieving SQL data for crdb_internal.partitions... writing output: debug/crdb_internal.partitions.txt... done
[cluster] retrieving SQL data for crdb_internal.zones... writing output: debug/crdb_internal.zones.txt... done
[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done
[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done
[cluster] requesting nodes... received response... converting to JSON... writing binary output: debug/nodes.json... done
[cluster] requesting liveness... received response...
[cluster] requesting liveness: last request failed: rpc error: ...
[cluster] requesting liveness: creating error output: debug/liveness.json.err.txt... done
[cluster] requesting CPU profiles
[cluster] profiles generated
[cluster] profile for node 1...
[cluster] profile for node 1: last request failed: rpc error: ...
[cluster] profile for node 1: creating error output: debug/nodes/1/cpu.pprof.err.txt... done
[node 1] node status... converting to JSON... writing binary output: debug/nodes/1/status.json... done
[node 1] using SQL connection URL: postgresql://...
[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt...
[node 1] retrieving SQL data for crdb_internal.gossip_alerts: last request failed: pq: unimplemented: operation is unsupported in multi-tenancy mode
[node 1] retrieving SQL data for crdb_internal.gossip_alerts: creating error output: debug/nodes/1/crdb_internal.gossip_alerts.txt.err.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt...
[node 1] retrieving SQL data for crdb_internal.gossip_liveness: last request failed: pq: unimplemented: operation is unsupported in multi-tenancy mode
[node 1] retrieving SQL data for crdb_internal.gossip_liveness: creating error output: debug/nodes/1/crdb_internal.gossip_liveness.txt.err.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt...
[node 1] retrieving SQL data for crdb_internal.gossip_network: last request failed: pq: unimplemented: operation is unsupported in multi-tenancy mode
[node 1] retrieving SQL data for crdb_internal.gossip_network: creating error output: debug/nodes/1/crdb_internal.gossip_network.txt.err.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt...
[node 1] retrieving SQL data for crdb_internal.gossip_nodes: last request failed: pq: unimplemented: operation is unsupported in multi-tenancy mode
[node 1] retrieving SQL data for crdb_internal.gossip_nodes: creating error output: debug/nodes/1/crdb_internal.gossip_nodes.txt.err.txt... done
[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done
[node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done
[node 1] retrieving SQL data for crdb_internal.node_contention_events... writing output: debug/nodes/1/crdb_internal.node_contention_events.txt... done
[node 1] retrieving SQL data for crdb_internal.node_distsql_flows... writing output: debug/nodes/1/crdb_internal.node_distsql_flows.txt... done
[node 1] retrieving SQL data for crdb_internal.node_inflight_trace_spans... writing output: debug/nodes/1/crdb_internal.node_inflight_trace_spans.txt... done
[node 1] retrieving SQL data for crdb_internal.node_metrics... writing output: debug/nodes/1/crdb_internal.node_metrics.txt... done
[node 1] retrieving SQL data for crdb_internal.node_queries... writing output: debug/nodes/1/crdb_internal.node_queries.txt... done
[node 1] retrieving SQL data for crdb_internal.node_runtime_info... writing output: debug/nodes/1/crdb_internal.node_runtime_info.txt... done
[node 1] retrieving SQL data for crdb_internal.node_sessions... writing output: debug/nodes/1/crdb_internal.node_sessions.txt... done
[node 1] retrieving SQL data for crdb_internal.node_statement_statistics... writing output: debug/nodes/1/crdb_internal.node_statement_statistics.txt... done
[node 1] retrieving SQL data for crdb_internal.node_transaction_statistics... writing output: debug/nodes/1/crdb_internal.node_transaction_statistics.txt... done
[node 1] retrieving SQL data for crdb_internal.node_transactions... writing output: debug/nodes/1/crdb_internal.node_transactions.txt... done
[node 1] retrieving SQL data for crdb_internal.node_txn_stats... writing output: debug/nodes/1/crdb_internal.node_txn_stats.txt... done
[node 1] retrieving SQL data for crdb_internal.active_range_feeds... writing output: debug/nodes/1/crdb_internal.active_range_feeds.txt... done
[node 1] requesting data for debug/nodes/1/details... received response... converting to JSON... writing binary output: debug/nodes/1/details.json... done
[node 1] requesting data for debug/nodes/1/gossip... received response...
[node 1] requesting data for debug/nodes/1/gossip: last request failed: rpc error: ...
[node 1] requesting data for debug/nodes/1/gossip: creating error output: debug/nodes/1/gossip.json.err.txt... done
[node 1] requesting data for debug/nodes/1/enginestats... received response...
[node 1] requesting data for debug/nodes/1/enginestats: last request failed: rpc error: ...
[node 1] requesting data for debug/nodes/1/enginestats: creating error output: debug/nodes/1/enginestats.json.err.txt... done
[node 1] requesting stacks... received response...
[node 1] requesting stacks: last request failed: rpc error: ...
[node 1] requesting stacks: creating error output: debug/nodes/1/stacks.txt.err.txt... done
[node 1] requesting stacks with labels... received response...
[node 1] requesting stacks with labels: last request failed: rpc error: ...
[node 1] requesting stacks with labels: creating error output: debug/nodes/1/stacks_with_labels.txt.err.txt... done
[node 1] requesting heap profile... received response...
[node 1] requesting heap profile: last request failed: rpc error: ...
[node 1] requesting heap profile: creating error output: debug/nodes/1/heap.pprof.err.txt... done
[node 1] requesting heap file list... received response...
[node 1] requesting heap file list: last request failed: rpc error: ...
[node 1] requesting heap file list: creating error output: debug/nodes/1/heapprof.err.txt... done
[node 1] requesting goroutine dump list... received response...
[node 1] requesting goroutine dump list: last request failed: rpc error: ...
[node 1] requesting goroutine dump list: creating error output: debug/nodes/1/goroutines.err.txt... done
[node 1] requesting log file ...
[node 1] requesting log file ...
[node 1] requesting log file ...
[node 1] requesting ranges... received response...
[node 1] requesting ranges: last request failed: rpc error: ...
[node 1] requesting ranges: creating error output: debug/nodes/1/ranges.err.txt... done
[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done
[cluster] hot range summary script... writing binary output: debug/hot-ranges.sh... done
41 changes: 39 additions & 2 deletions pkg/cli/testutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ func TestingReset() {
// TestCLI wraps a test server and is used by tests to make assertions about the output of CLI commands.
type TestCLI struct {
*server.TestServer
tenant serverutils.TestTenantInterface
certsDir string
cleanupFunc func() error
prevStderr *os.File
Expand Down Expand Up @@ -80,6 +81,9 @@ type TestCLIParams struct {

// NoNodelocal, if true, disables node-local external I/O storage.
NoNodelocal bool

// Multitenant, if true, starts the test in multitenant mode.
Multitenant bool
}

// testTempFilePrefix is a sentinel marker to be used as the prefix of a
Expand Down Expand Up @@ -155,6 +159,16 @@ func newCLITestWithArgs(params TestCLIParams, argsFn func(args *base.TestServerA
log.Infof(context.Background(), "SQL listener at %s", c.ServingSQLAddr())
}

if params.Multitenant {
if c.TestServer == nil {
c.fail(errors.AssertionFailedf("multitenant mode for CLI requires a DB server, try setting `NoServer` argument to false"))
}
tenantArgs := base.TestTenantArgs{TenantID: serverutils.TestTenantID()}
if c.Insecure() {
tenantArgs.ForceInsecure = true
}
c.tenant, _ = serverutils.StartTenant(c.t, c.TestServer, tenantArgs)
}
baseCfg.User = security.NodeUserName()

// Ensure that CLI error messages and anything meant for the
Expand Down Expand Up @@ -203,6 +217,14 @@ func (c *TestCLI) RestartServer(params TestCLIParams) {
c.TestServer = s.(*server.TestServer)
log.Infof(context.Background(), "restarted server at %s / %s",
c.ServingRPCAddr(), c.ServingSQLAddr())
if params.Multitenant {
tenantArgs := base.TestTenantArgs{TenantID: serverutils.TestTenantID()}
if c.Insecure() {
tenantArgs.ForceInsecure = true
}
c.tenant, _ = serverutils.StartTenant(c.t, c.TestServer, tenantArgs)
log.Infof(context.Background(), "restarted tenant SQL only server at %s", c.tenant.SQLAddr())
}
}

// Cleanup cleans up after the test, stopping the server if necessary.
Expand Down Expand Up @@ -306,18 +328,32 @@ func isSQLCommand(args []string) (bool, error) {
return false, nil
}

// getRPCAddr returns the RPC address that CLI commands should connect to.
// When no tenant is attached to the test CLI, this is the address reported
// by c.ServingRPCAddr(); otherwise it is the tenant's RPC address.
func (c TestCLI) getRPCAddr() string {
	if c.tenant == nil {
		return c.ServingRPCAddr()
	}
	return c.tenant.RPCAddr()
}

// getSQLAddr returns the SQL address that CLI commands should connect to.
// When no tenant is attached to the test CLI, this is the address reported
// by c.ServingSQLAddr(); otherwise it is the tenant's SQL address.
func (c TestCLI) getSQLAddr() string {
	if c.tenant == nil {
		return c.ServingSQLAddr()
	}
	return c.tenant.SQLAddr()
}

// RunWithArgs add args according to TestCLI cfg.
func (c TestCLI) RunWithArgs(origArgs []string) {
TestingReset()

if err := func() error {
args := append([]string(nil), origArgs[:1]...)
if c.TestServer != nil {
addr := c.ServingRPCAddr()
addr := c.getRPCAddr()
if isSQL, err := isSQLCommand(origArgs); err != nil {
return err
} else if isSQL {
addr = c.ServingSQLAddr()
addr = c.getSQLAddr()
}
h, p, err := net.SplitHostPort(addr)
if err != nil {
Expand All @@ -331,6 +367,7 @@ func (c TestCLI) RunWithArgs(origArgs []string) {
args = append(args, fmt.Sprintf("--certs-dir=%s", c.certsDir))
}
}

args = append(args, origArgs[1:]...)

// `nodelocal upload` and `userfile upload -r` CLI tests create unique temp
Expand Down
Loading

0 comments on commit a945ddb

Please sign in to comment.