diff --git a/pkg/cli/cliflags/flags.go b/pkg/cli/cliflags/flags.go index 86ad7172df8c..63f5b338da33 100644 --- a/pkg/cli/cliflags/flags.go +++ b/pkg/cli/cliflags/flags.go @@ -1647,8 +1647,16 @@ necessary to support CockroachDB. ZipIncludeRangeInfo = FlagInfo{ Name: "include-range-info", Description: ` -Include information about each individual range in nodes/*/ranges/*.json files. -For large clusters, this can dramatically increase debug zip size/file count. +Include one file per node with information about the KV ranges stored on that node, +in nodes/{node ID}/ranges.json. Additionally, include problem ranges information. +This information can be vital when debugging issues that involve the KV storage layer, +such as data placement, load balancing, performance or other behaviors. In certain situations, +on large clusters with large numbers of ranges, these files can be omitted if and only if the +issue being investigated is already known to be in another layer of the system (for example, +an error message about an unsupported feature or incompatible value in a SQL schema change or +statement). Note however many higher-level issues are ultimately related to the underlying KV +storage layer described by these files so only set this to false if directed to do so by Cockroach +Labs support. `, } diff --git a/pkg/cli/testdata/zip/testzip_exclude_range_info b/pkg/cli/testdata/zip/testzip_exclude_range_info new file mode 100644 index 000000000000..9f83ab96ce25 --- /dev/null +++ b/pkg/cli/testdata/zip/testzip_exclude_range_info @@ -0,0 +1,117 @@ +zip +---- +debug zip --concurrency=1 --cpu-profile-duration=1s --include-range-info=false /dev/null +[cluster] discovering tenants on cluster... done +[cluster] creating output file /dev/null... done +[cluster] establishing RPC connection to ... +[cluster] using SQL address: ... +[cluster] requesting data for debug/events... received response... writing JSON output: debug/events.json... done +[cluster] requesting data for debug/rangelog... received response... writing JSON output: debug/rangelog.json... done +[cluster] requesting data for debug/settings... received response... writing JSON output: debug/settings.json... done +[cluster] retrieving SQL data for "".crdb_internal.create_function_statements... writing output: debug/crdb_internal.create_function_statements.txt... done +[cluster] retrieving SQL data for "".crdb_internal.create_schema_statements... writing output: debug/crdb_internal.create_schema_statements.txt... done +[cluster] retrieving SQL data for "".crdb_internal.create_statements... writing output: debug/crdb_internal.create_statements.txt... done +[cluster] retrieving SQL data for "".crdb_internal.create_type_statements... writing output: debug/crdb_internal.create_type_statements.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_contention_events... writing output: debug/crdb_internal.cluster_contention_events.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_database_privileges... writing output: debug/crdb_internal.cluster_database_privileges.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_distsql_flows... writing output: debug/crdb_internal.cluster_distsql_flows.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_execution_insights... writing output: debug/crdb_internal.cluster_execution_insights.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_locks... writing output: debug/crdb_internal.cluster_locks.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_queries... writing output: debug/crdb_internal.cluster_queries.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_sessions... writing output: debug/crdb_internal.cluster_sessions.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_settings... writing output: debug/crdb_internal.cluster_settings.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_transactions... writing output: debug/crdb_internal.cluster_transactions.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_txn_execution_insights... writing output: debug/crdb_internal.cluster_txn_execution_insights.txt... done +[cluster] retrieving SQL data for crdb_internal.default_privileges... writing output: debug/crdb_internal.default_privileges.txt... done +[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done +[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done +[cluster] retrieving SQL data for crdb_internal.jobs... writing output: debug/crdb_internal.jobs.txt... done +[cluster] retrieving SQL data for crdb_internal.kv_node_liveness... writing output: debug/crdb_internal.kv_node_liveness.txt... done +[cluster] retrieving SQL data for crdb_internal.kv_node_status... writing output: debug/crdb_internal.kv_node_status.txt... done +[cluster] retrieving SQL data for crdb_internal.kv_store_status... writing output: debug/crdb_internal.kv_store_status.txt... done +[cluster] retrieving SQL data for crdb_internal.kv_system_privileges... writing output: debug/crdb_internal.kv_system_privileges.txt... done +[cluster] retrieving SQL data for crdb_internal.partitions... writing output: debug/crdb_internal.partitions.txt... done +[cluster] retrieving SQL data for crdb_internal.regions... writing output: debug/crdb_internal.regions.txt... done +[cluster] retrieving SQL data for crdb_internal.schema_changes... writing output: debug/crdb_internal.schema_changes.txt... done +[cluster] retrieving SQL data for crdb_internal.super_regions... writing output: debug/crdb_internal.super_regions.txt... done +[cluster] retrieving SQL data for crdb_internal.system_jobs... writing output: debug/crdb_internal.system_jobs.txt... done +[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done +[cluster] retrieving SQL data for crdb_internal.transaction_contention_events... writing output: debug/crdb_internal.transaction_contention_events.txt... done +[cluster] retrieving SQL data for crdb_internal.zones... writing output: debug/crdb_internal.zones.txt... done +[cluster] retrieving SQL data for system.database_role_settings... writing output: debug/system.database_role_settings.txt... done +[cluster] retrieving SQL data for system.descriptor... writing output: debug/system.descriptor.txt... done +[cluster] retrieving SQL data for system.eventlog... writing output: debug/system.eventlog.txt... done +[cluster] retrieving SQL data for system.external_connections... writing output: debug/system.external_connections.txt... done +[cluster] retrieving SQL data for system.job_info... writing output: debug/system.job_info.txt... done +[cluster] retrieving SQL data for system.jobs... writing output: debug/system.jobs.txt... done +[cluster] retrieving SQL data for system.lease... writing output: debug/system.lease.txt... done +[cluster] retrieving SQL data for system.locations... writing output: debug/system.locations.txt... done +[cluster] retrieving SQL data for system.migrations... writing output: debug/system.migrations.txt... done +[cluster] retrieving SQL data for system.namespace... writing output: debug/system.namespace.txt... done +[cluster] retrieving SQL data for system.privileges... writing output: debug/system.privileges.txt... done +[cluster] retrieving SQL data for system.protected_ts_meta... writing output: debug/system.protected_ts_meta.txt... done +[cluster] retrieving SQL data for system.protected_ts_records... writing output: debug/system.protected_ts_records.txt... done +[cluster] retrieving SQL data for system.rangelog... writing output: debug/system.rangelog.txt... done +[cluster] retrieving SQL data for system.replication_constraint_stats... writing output: debug/system.replication_constraint_stats.txt... done +[cluster] retrieving SQL data for system.replication_critical_localities... writing output: debug/system.replication_critical_localities.txt... done +[cluster] retrieving SQL data for system.replication_stats... writing output: debug/system.replication_stats.txt... done +[cluster] retrieving SQL data for system.reports_meta... writing output: debug/system.reports_meta.txt... done +[cluster] retrieving SQL data for system.role_id_seq... writing output: debug/system.role_id_seq.txt... done +[cluster] retrieving SQL data for system.role_members... writing output: debug/system.role_members.txt... done +[cluster] retrieving SQL data for system.role_options... writing output: debug/system.role_options.txt... done +[cluster] retrieving SQL data for system.scheduled_jobs... writing output: debug/system.scheduled_jobs.txt... done +[cluster] retrieving SQL data for system.settings... writing output: debug/system.settings.txt... done +[cluster] retrieving SQL data for system.span_configurations... writing output: debug/system.span_configurations.txt... done +[cluster] retrieving SQL data for system.sql_instances... writing output: debug/system.sql_instances.txt... done +[cluster] retrieving SQL data for system.sqlliveness... writing output: debug/system.sqlliveness.txt... done +[cluster] retrieving SQL data for system.statement_diagnostics... writing output: debug/system.statement_diagnostics.txt... done +[cluster] retrieving SQL data for system.statement_diagnostics_requests... writing output: debug/system.statement_diagnostics_requests.txt... done +[cluster] retrieving SQL data for system.statement_statistics_limit_5000... writing output: debug/system.statement_statistics_limit_5000.txt... done +[cluster] retrieving SQL data for system.table_statistics... writing output: debug/system.table_statistics.txt... done +[cluster] retrieving SQL data for system.task_payloads... writing output: debug/system.task_payloads.txt... done +[cluster] retrieving SQL data for system.tenant_settings... writing output: debug/system.tenant_settings.txt... done +[cluster] retrieving SQL data for system.tenant_tasks... writing output: debug/system.tenant_tasks.txt... done +[cluster] retrieving SQL data for system.tenant_usage... writing output: debug/system.tenant_usage.txt... done +[cluster] retrieving SQL data for system.tenants... writing output: debug/system.tenants.txt... done +[cluster] requesting nodes... received response... writing JSON output: debug/nodes.json... done +[cluster] requesting liveness... received response... writing JSON output: debug/liveness.json... done +[cluster] requesting CPU profiles +[cluster] profiles generated +[cluster] profile for node 1... writing binary output: debug/nodes/1/cpu.pprof... done +[node 1] node status... writing JSON output: debug/nodes/1/status.json... done +[node 1] using SQL connection URL: postgresql://... +[node 1] retrieving SQL data for crdb_internal.active_range_feeds... writing output: debug/nodes/1/crdb_internal.active_range_feeds.txt... done +[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done +[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done +[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done +[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done +[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done +[node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done +[node 1] retrieving SQL data for crdb_internal.node_contention_events... writing output: debug/nodes/1/crdb_internal.node_contention_events.txt... done +[node 1] retrieving SQL data for crdb_internal.node_distsql_flows... writing output: debug/nodes/1/crdb_internal.node_distsql_flows.txt... done +[node 1] retrieving SQL data for crdb_internal.node_execution_insights... writing output: debug/nodes/1/crdb_internal.node_execution_insights.txt... done +[node 1] retrieving SQL data for crdb_internal.node_inflight_trace_spans... writing output: debug/nodes/1/crdb_internal.node_inflight_trace_spans.txt... done +[node 1] retrieving SQL data for crdb_internal.node_memory_monitors... writing output: debug/nodes/1/crdb_internal.node_memory_monitors.txt... done +[node 1] retrieving SQL data for crdb_internal.node_metrics... writing output: debug/nodes/1/crdb_internal.node_metrics.txt... done +[node 1] retrieving SQL data for crdb_internal.node_queries... writing output: debug/nodes/1/crdb_internal.node_queries.txt... done +[node 1] retrieving SQL data for crdb_internal.node_runtime_info... writing output: debug/nodes/1/crdb_internal.node_runtime_info.txt... done +[node 1] retrieving SQL data for crdb_internal.node_sessions... writing output: debug/nodes/1/crdb_internal.node_sessions.txt... done +[node 1] retrieving SQL data for crdb_internal.node_statement_statistics... writing output: debug/nodes/1/crdb_internal.node_statement_statistics.txt... done +[node 1] retrieving SQL data for crdb_internal.node_tenant_capabilities_cache... writing output: debug/nodes/1/crdb_internal.node_tenant_capabilities_cache.txt... done +[node 1] retrieving SQL data for crdb_internal.node_transaction_statistics... writing output: debug/nodes/1/crdb_internal.node_transaction_statistics.txt... done +[node 1] retrieving SQL data for crdb_internal.node_transactions... writing output: debug/nodes/1/crdb_internal.node_transactions.txt... done +[node 1] retrieving SQL data for crdb_internal.node_txn_execution_insights... writing output: debug/nodes/1/crdb_internal.node_txn_execution_insights.txt... done +[node 1] retrieving SQL data for crdb_internal.node_txn_stats... writing output: debug/nodes/1/crdb_internal.node_txn_stats.txt... done +[node 1] requesting data for debug/nodes/1/details... received response... writing JSON output: debug/nodes/1/details.json... done +[node 1] requesting data for debug/nodes/1/gossip... received response... writing JSON output: debug/nodes/1/gossip.json... done +[node 1] requesting data for debug/nodes/1/enginestats... received response... writing JSON output: debug/nodes/1/enginestats.json... done +[node 1] requesting stacks... received response... writing binary output: debug/nodes/1/stacks.txt... done +[node 1] requesting stacks with labels... received response... writing binary output: debug/nodes/1/stacks_with_labels.txt... done +[node 1] requesting heap profile... received response... writing binary output: debug/nodes/1/heap.pprof... done +[node 1] requesting heap file list... received response... done +[node ?] ? heap profiles found +[node 1] requesting goroutine dump list... received response... done +[node ?] ? goroutine dumps found +[node 1] requesting log files list... received response... done +[node ?] ? log files found +[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done diff --git a/pkg/cli/zip_cluster_wide.go b/pkg/cli/zip_cluster_wide.go index 660514cabe72..fc41cf81b40f 100644 --- a/pkg/cli/zip_cluster_wide.go +++ b/pkg/cli/zip_cluster_wide.go @@ -41,7 +41,7 @@ const ( func makeClusterWideZipRequests( admin serverpb.AdminClient, status serverpb.StatusClient, prefix string, ) []zipRequest { - return []zipRequest{ + zipRequests := []zipRequest{ // NB: we intentionally omit liveness since it's already pulled manually (we // act on the output to special case decommissioned nodes). { @@ -62,13 +62,16 @@ func makeClusterWideZipRequests( }, pathName: prefix + settingsName, }, - { + } + if zipCtx.includeRangeInfo { + zipRequests = append(zipRequests, zipRequest{ fn: func(ctx context.Context) (interface{}, error) { return status.ProblemRanges(ctx, &serverpb.ProblemRangesRequest{}) }, pathName: prefix + problemRangesName, - }, + }) } + return zipRequests } // collectClusterData runs the data collection that only needs to diff --git a/pkg/cli/zip_test.go b/pkg/cli/zip_test.go index 634192da83b9..05532b5588fe 100644 --- a/pkg/cli/zip_test.go +++ b/pkg/cli/zip_test.go @@ -254,6 +254,40 @@ func TestZipIncludeRangeInfo(t *testing.T) { ) } +// This tests the operation of zip using --include-range-info=false. +func TestZipExcludeRangeInfo(t *testing.T) { + defer leaktest.AfterTest(t)() + + skip.UnderRace(t, "test too slow under race") + + dir, cleanupFn := testutils.TempDir(t) + defer cleanupFn() + + c := NewCLITest(TestCLIParams{ + StoreSpecs: []base.StoreSpec{{ + Path: dir, + }}, + }) + defer c.Cleanup() + + out, err := c.RunWithCapture( + "debug zip --concurrency=1 --cpu-profile-duration=1s --include-range-info=false " + os.DevNull) + if err != nil { + t.Fatal(err) + } + + // Strip any non-deterministic messages. + out = eraseNonDeterministicZipOutput(out) + + // We use datadriven simply to read the golden output file; we don't actually + // run any commands. Using datadriven allows TESTFLAGS=-rewrite. + datadriven.RunTest(t, datapathutils.TestDataPath(t, "zip", "testzip_exclude_range_info"), + func(t *testing.T, td *datadriven.TestData) string { + return out + }, + ) +} + // This tests the operation of zip running concurrently. func TestConcurrentZip(t *testing.T) { defer leaktest.AfterTest(t)()