From 80c47dbf52e389b131752d41c68b89d58b644869 Mon Sep 17 00:00:00 2001 From: Namrata Kodali Date: Wed, 14 Feb 2024 06:51:22 -0800 Subject: [PATCH] cli: --include-range-info flag for `cockroach debug zip` also toggles problem ranges Previously, the --include-range-info flag only toggled inclusion of one file per node with information on KV ranges on that node. This change makes the same flag also toggle inclusion of problem ranges, because fetching problem ranges is suspected to degrade cluster performance on large clusters. Fixes: #118991 Release note (ops change): Expanded --include-range-info flag to additionally include problem ranges. The flag still defaults to true. Release justification: low-risk CLI change to debug zip flag --- pkg/cli/cliflags/flags.go | 12 +- .../testdata/zip/testzip_exclude_range_info | 117 ++++++++++++++++++ pkg/cli/zip_cluster_wide.go | 9 +- pkg/cli/zip_test.go | 34 +++++ 4 files changed, 167 insertions(+), 5 deletions(-) create mode 100644 pkg/cli/testdata/zip/testzip_exclude_range_info diff --git a/pkg/cli/cliflags/flags.go b/pkg/cli/cliflags/flags.go index 86ad7172df8c..63f5b338da33 100644 --- a/pkg/cli/cliflags/flags.go +++ b/pkg/cli/cliflags/flags.go @@ -1647,8 +1647,16 @@ necessary to support CockroachDB. ZipIncludeRangeInfo = FlagInfo{ Name: "include-range-info", Description: ` -Include information about each individual range in nodes/*/ranges/*.json files. -For large clusters, this can dramatically increase debug zip size/file count. +Include one file per node with information about the KV ranges stored on that node, +in nodes/{node ID}/ranges.json. Additionally, include problem ranges information. +This information can be vital when debugging issues that involve the KV storage layer, +such as data placement, load balancing, performance or other behaviors.
In certain situations, +on large clusters with large numbers of ranges, these files can be omitted if and only if the +issue being investigated is already known to be in another layer of the system (for example, +an error message about an unsupported feature or incompatible value in a SQL schema change or +statement). Note however many higher-level issues are ultimately related to the underlying KV +storage layer described by these files so only set this to false if directed to do so by Cockroach +Labs support. `, } diff --git a/pkg/cli/testdata/zip/testzip_exclude_range_info b/pkg/cli/testdata/zip/testzip_exclude_range_info new file mode 100644 index 000000000000..9f83ab96ce25 --- /dev/null +++ b/pkg/cli/testdata/zip/testzip_exclude_range_info @@ -0,0 +1,117 @@ +zip +---- +debug zip --concurrency=1 --cpu-profile-duration=1s --include-range-info=false /dev/null +[cluster] discovering tenants on cluster... done +[cluster] creating output file /dev/null... done +[cluster] establishing RPC connection to ... +[cluster] using SQL address: ... +[cluster] requesting data for debug/events... received response... writing JSON output: debug/events.json... done +[cluster] requesting data for debug/rangelog... received response... writing JSON output: debug/rangelog.json... done +[cluster] requesting data for debug/settings... received response... writing JSON output: debug/settings.json... done +[cluster] retrieving SQL data for "".crdb_internal.create_function_statements... writing output: debug/crdb_internal.create_function_statements.txt... done +[cluster] retrieving SQL data for "".crdb_internal.create_schema_statements... writing output: debug/crdb_internal.create_schema_statements.txt... done +[cluster] retrieving SQL data for "".crdb_internal.create_statements... writing output: debug/crdb_internal.create_statements.txt... done +[cluster] retrieving SQL data for "".crdb_internal.create_type_statements... writing output: debug/crdb_internal.create_type_statements.txt... 
done +[cluster] retrieving SQL data for crdb_internal.cluster_contention_events... writing output: debug/crdb_internal.cluster_contention_events.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_database_privileges... writing output: debug/crdb_internal.cluster_database_privileges.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_distsql_flows... writing output: debug/crdb_internal.cluster_distsql_flows.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_execution_insights... writing output: debug/crdb_internal.cluster_execution_insights.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_locks... writing output: debug/crdb_internal.cluster_locks.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_queries... writing output: debug/crdb_internal.cluster_queries.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_sessions... writing output: debug/crdb_internal.cluster_sessions.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_settings... writing output: debug/crdb_internal.cluster_settings.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_transactions... writing output: debug/crdb_internal.cluster_transactions.txt... done +[cluster] retrieving SQL data for crdb_internal.cluster_txn_execution_insights... writing output: debug/crdb_internal.cluster_txn_execution_insights.txt... done +[cluster] retrieving SQL data for crdb_internal.default_privileges... writing output: debug/crdb_internal.default_privileges.txt... done +[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done +[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done +[cluster] retrieving SQL data for crdb_internal.jobs... writing output: debug/crdb_internal.jobs.txt... 
done +[cluster] retrieving SQL data for crdb_internal.kv_node_liveness... writing output: debug/crdb_internal.kv_node_liveness.txt... done +[cluster] retrieving SQL data for crdb_internal.kv_node_status... writing output: debug/crdb_internal.kv_node_status.txt... done +[cluster] retrieving SQL data for crdb_internal.kv_store_status... writing output: debug/crdb_internal.kv_store_status.txt... done +[cluster] retrieving SQL data for crdb_internal.kv_system_privileges... writing output: debug/crdb_internal.kv_system_privileges.txt... done +[cluster] retrieving SQL data for crdb_internal.partitions... writing output: debug/crdb_internal.partitions.txt... done +[cluster] retrieving SQL data for crdb_internal.regions... writing output: debug/crdb_internal.regions.txt... done +[cluster] retrieving SQL data for crdb_internal.schema_changes... writing output: debug/crdb_internal.schema_changes.txt... done +[cluster] retrieving SQL data for crdb_internal.super_regions... writing output: debug/crdb_internal.super_regions.txt... done +[cluster] retrieving SQL data for crdb_internal.system_jobs... writing output: debug/crdb_internal.system_jobs.txt... done +[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done +[cluster] retrieving SQL data for crdb_internal.transaction_contention_events... writing output: debug/crdb_internal.transaction_contention_events.txt... done +[cluster] retrieving SQL data for crdb_internal.zones... writing output: debug/crdb_internal.zones.txt... done +[cluster] retrieving SQL data for system.database_role_settings... writing output: debug/system.database_role_settings.txt... done +[cluster] retrieving SQL data for system.descriptor... writing output: debug/system.descriptor.txt... done +[cluster] retrieving SQL data for system.eventlog... writing output: debug/system.eventlog.txt... done +[cluster] retrieving SQL data for system.external_connections... 
writing output: debug/system.external_connections.txt... done +[cluster] retrieving SQL data for system.job_info... writing output: debug/system.job_info.txt... done +[cluster] retrieving SQL data for system.jobs... writing output: debug/system.jobs.txt... done +[cluster] retrieving SQL data for system.lease... writing output: debug/system.lease.txt... done +[cluster] retrieving SQL data for system.locations... writing output: debug/system.locations.txt... done +[cluster] retrieving SQL data for system.migrations... writing output: debug/system.migrations.txt... done +[cluster] retrieving SQL data for system.namespace... writing output: debug/system.namespace.txt... done +[cluster] retrieving SQL data for system.privileges... writing output: debug/system.privileges.txt... done +[cluster] retrieving SQL data for system.protected_ts_meta... writing output: debug/system.protected_ts_meta.txt... done +[cluster] retrieving SQL data for system.protected_ts_records... writing output: debug/system.protected_ts_records.txt... done +[cluster] retrieving SQL data for system.rangelog... writing output: debug/system.rangelog.txt... done +[cluster] retrieving SQL data for system.replication_constraint_stats... writing output: debug/system.replication_constraint_stats.txt... done +[cluster] retrieving SQL data for system.replication_critical_localities... writing output: debug/system.replication_critical_localities.txt... done +[cluster] retrieving SQL data for system.replication_stats... writing output: debug/system.replication_stats.txt... done +[cluster] retrieving SQL data for system.reports_meta... writing output: debug/system.reports_meta.txt... done +[cluster] retrieving SQL data for system.role_id_seq... writing output: debug/system.role_id_seq.txt... done +[cluster] retrieving SQL data for system.role_members... writing output: debug/system.role_members.txt... done +[cluster] retrieving SQL data for system.role_options... writing output: debug/system.role_options.txt... 
done +[cluster] retrieving SQL data for system.scheduled_jobs... writing output: debug/system.scheduled_jobs.txt... done +[cluster] retrieving SQL data for system.settings... writing output: debug/system.settings.txt... done +[cluster] retrieving SQL data for system.span_configurations... writing output: debug/system.span_configurations.txt... done +[cluster] retrieving SQL data for system.sql_instances... writing output: debug/system.sql_instances.txt... done +[cluster] retrieving SQL data for system.sqlliveness... writing output: debug/system.sqlliveness.txt... done +[cluster] retrieving SQL data for system.statement_diagnostics... writing output: debug/system.statement_diagnostics.txt... done +[cluster] retrieving SQL data for system.statement_diagnostics_requests... writing output: debug/system.statement_diagnostics_requests.txt... done +[cluster] retrieving SQL data for system.statement_statistics_limit_5000... writing output: debug/system.statement_statistics_limit_5000.txt... done +[cluster] retrieving SQL data for system.table_statistics... writing output: debug/system.table_statistics.txt... done +[cluster] retrieving SQL data for system.task_payloads... writing output: debug/system.task_payloads.txt... done +[cluster] retrieving SQL data for system.tenant_settings... writing output: debug/system.tenant_settings.txt... done +[cluster] retrieving SQL data for system.tenant_tasks... writing output: debug/system.tenant_tasks.txt... done +[cluster] retrieving SQL data for system.tenant_usage... writing output: debug/system.tenant_usage.txt... done +[cluster] retrieving SQL data for system.tenants... writing output: debug/system.tenants.txt... done +[cluster] requesting nodes... received response... writing JSON output: debug/nodes.json... done +[cluster] requesting liveness... received response... writing JSON output: debug/liveness.json... done +[cluster] requesting CPU profiles +[cluster] profiles generated +[cluster] profile for node 1... 
writing binary output: debug/nodes/1/cpu.pprof... done +[node 1] node status... writing JSON output: debug/nodes/1/status.json... done +[node 1] using SQL connection URL: postgresql://... +[node 1] retrieving SQL data for crdb_internal.active_range_feeds... writing output: debug/nodes/1/crdb_internal.active_range_feeds.txt... done +[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done +[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done +[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done +[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done +[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done +[node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done +[node 1] retrieving SQL data for crdb_internal.node_contention_events... writing output: debug/nodes/1/crdb_internal.node_contention_events.txt... done +[node 1] retrieving SQL data for crdb_internal.node_distsql_flows... writing output: debug/nodes/1/crdb_internal.node_distsql_flows.txt... done +[node 1] retrieving SQL data for crdb_internal.node_execution_insights... writing output: debug/nodes/1/crdb_internal.node_execution_insights.txt... done +[node 1] retrieving SQL data for crdb_internal.node_inflight_trace_spans... writing output: debug/nodes/1/crdb_internal.node_inflight_trace_spans.txt... done +[node 1] retrieving SQL data for crdb_internal.node_memory_monitors... writing output: debug/nodes/1/crdb_internal.node_memory_monitors.txt... done +[node 1] retrieving SQL data for crdb_internal.node_metrics... 
writing output: debug/nodes/1/crdb_internal.node_metrics.txt... done +[node 1] retrieving SQL data for crdb_internal.node_queries... writing output: debug/nodes/1/crdb_internal.node_queries.txt... done +[node 1] retrieving SQL data for crdb_internal.node_runtime_info... writing output: debug/nodes/1/crdb_internal.node_runtime_info.txt... done +[node 1] retrieving SQL data for crdb_internal.node_sessions... writing output: debug/nodes/1/crdb_internal.node_sessions.txt... done +[node 1] retrieving SQL data for crdb_internal.node_statement_statistics... writing output: debug/nodes/1/crdb_internal.node_statement_statistics.txt... done +[node 1] retrieving SQL data for crdb_internal.node_tenant_capabilities_cache... writing output: debug/nodes/1/crdb_internal.node_tenant_capabilities_cache.txt... done +[node 1] retrieving SQL data for crdb_internal.node_transaction_statistics... writing output: debug/nodes/1/crdb_internal.node_transaction_statistics.txt... done +[node 1] retrieving SQL data for crdb_internal.node_transactions... writing output: debug/nodes/1/crdb_internal.node_transactions.txt... done +[node 1] retrieving SQL data for crdb_internal.node_txn_execution_insights... writing output: debug/nodes/1/crdb_internal.node_txn_execution_insights.txt... done +[node 1] retrieving SQL data for crdb_internal.node_txn_stats... writing output: debug/nodes/1/crdb_internal.node_txn_stats.txt... done +[node 1] requesting data for debug/nodes/1/details... received response... writing JSON output: debug/nodes/1/details.json... done +[node 1] requesting data for debug/nodes/1/gossip... received response... writing JSON output: debug/nodes/1/gossip.json... done +[node 1] requesting data for debug/nodes/1/enginestats... received response... writing JSON output: debug/nodes/1/enginestats.json... done +[node 1] requesting stacks... received response... writing binary output: debug/nodes/1/stacks.txt... done +[node 1] requesting stacks with labels... received response... 
writing binary output: debug/nodes/1/stacks_with_labels.txt... done +[node 1] requesting heap profile... received response... writing binary output: debug/nodes/1/heap.pprof... done +[node 1] requesting heap file list... received response... done +[node ?] ? heap profiles found +[node 1] requesting goroutine dump list... received response... done +[node ?] ? goroutine dumps found +[node 1] requesting log files list... received response... done +[node ?] ? log files found +[cluster] pprof summary script... writing binary output: debug/pprof-summary.sh... done diff --git a/pkg/cli/zip_cluster_wide.go b/pkg/cli/zip_cluster_wide.go index 660514cabe72..fc41cf81b40f 100644 --- a/pkg/cli/zip_cluster_wide.go +++ b/pkg/cli/zip_cluster_wide.go @@ -41,7 +41,7 @@ const ( func makeClusterWideZipRequests( admin serverpb.AdminClient, status serverpb.StatusClient, prefix string, ) []zipRequest { - return []zipRequest{ + zipRequests := []zipRequest{ // NB: we intentionally omit liveness since it's already pulled manually (we // act on the output to special case decommissioned nodes). { @@ -62,13 +62,16 @@ func makeClusterWideZipRequests( }, pathName: prefix + settingsName, }, - { + } + if zipCtx.includeRangeInfo { + zipRequests = append(zipRequests, zipRequest{ fn: func(ctx context.Context) (interface{}, error) { return status.ProblemRanges(ctx, &serverpb.ProblemRangesRequest{}) }, pathName: prefix + problemRangesName, - }, + }) } + return zipRequests } // collectClusterData runs the data collection that only needs to diff --git a/pkg/cli/zip_test.go b/pkg/cli/zip_test.go index 634192da83b9..05532b5588fe 100644 --- a/pkg/cli/zip_test.go +++ b/pkg/cli/zip_test.go @@ -254,6 +254,40 @@ func TestZipIncludeRangeInfo(t *testing.T) { ) } +// This tests the operation of zip using --include-range-info=false. 
+func TestZipExcludeRangeInfo(t *testing.T) { + defer leaktest.AfterTest(t)() + + skip.UnderRace(t, "test too slow under race") + + dir, cleanupFn := testutils.TempDir(t) + defer cleanupFn() + + c := NewCLITest(TestCLIParams{ + StoreSpecs: []base.StoreSpec{{ + Path: dir, + }}, + }) + defer c.Cleanup() + + out, err := c.RunWithCapture( + "debug zip --concurrency=1 --cpu-profile-duration=1s --include-range-info=false " + os.DevNull) + if err != nil { + t.Fatal(err) + } + + // Strip any non-deterministic messages. + out = eraseNonDeterministicZipOutput(out) + + // We use datadriven simply to read the golden output file; we don't actually + // run any commands. Using datadriven allows TESTFLAGS=-rewrite. + datadriven.RunTest(t, datapathutils.TestDataPath(t, "zip", "testzip_exclude_range_info"), + func(t *testing.T, td *datadriven.TestData) string { + return out + }, + ) +} + // This tests the operation of zip running concurrently. func TestConcurrentZip(t *testing.T) { defer leaktest.AfterTest(t)()