From 0326230e267d32982642b637150fb52f8bb35733 Mon Sep 17 00:00:00 2001 From: Thomas Hardy Date: Thu, 14 Jul 2022 16:37:12 -0400 Subject: [PATCH] sql: add troubleshooting mode session variable Resolves: #84429 This change introduces a `troubleshooting_mode_enabled` session variable. When enabled, this session variable is intended to be used as a way to avoid performing additional work on queries, particularly when the cluster is experiencing issues/unavailability/failure. By default, this session variable is disabled. Currently, this session variable is only used to avoid collecting/emitting telemetry data. Release note (sql change): Introduce new `troubleshooting_mode_enabled` session variable, to avoid doing additional work on queries when possible (i.e. collection telemetry data). By default, this session variable is disabled. --- pkg/cli/clisqlcfg/context.go | 13 +++ .../test_client_side_checking.tcl | 5 + pkg/sql/exec_log.go | 2 +- pkg/sql/exec_util.go | 4 + .../testdata/logic_test/information_schema | 1 + .../logictest/testdata/logic_test/pg_catalog | 3 + .../logictest/testdata/logic_test/show_source | 1 + pkg/sql/sessiondatapb/session_data.proto | 5 + pkg/sql/telemetry_logging_test.go | 109 ++++++++++++++++++ pkg/sql/vars.go | 17 +++ 10 files changed, 159 insertions(+), 1 deletion(-) diff --git a/pkg/cli/clisqlcfg/context.go b/pkg/cli/clisqlcfg/context.go index fb5e1b9c3cd0..01d7e7a77ae8 100644 --- a/pkg/cli/clisqlcfg/context.go +++ b/pkg/cli/clisqlcfg/context.go @@ -210,6 +210,9 @@ func (c *Context) Run(ctx context.Context, conn clisqlclient.Conn) error { if err := c.maybeSetReadOnly(conn); err != nil { return err } + if err := c.maybeSetTroubleshootingMode(conn); err != nil { + return err + } if c.ConnCtx.DebugMode { fmt.Fprintln(c.CmdOut, @@ -250,3 +253,13 @@ func (c *Context) maybeSetReadOnly(conn clisqlclient.Conn) error { return conn.Exec(context.Background(), "SET default_transaction_read_only = TRUE") } + +func (c *Context) maybeSetTroubleshootingMode(conn clisqlclient.Conn) error { + if !c.ConnCtx.DebugMode { + return nil + } + // If we are in debug mode, enable "troubleshooting mode". + return conn.Exec( + context.Background(), + "SET troubleshooting_mode = on") +} diff --git a/pkg/cli/interactive_tests/test_client_side_checking.tcl b/pkg/cli/interactive_tests/test_client_side_checking.tcl index 8c61e80f75f5..7a504d8650d6 100644 --- a/pkg/cli/interactive_tests/test_client_side_checking.tcl +++ b/pkg/cli/interactive_tests/test_client_side_checking.tcl @@ -87,6 +87,11 @@ eexpect "\"\".crdb_internal.node_build_info" eexpect "you might want to set the current database" eexpect "to the empty string" +# Check that troubleshooting mode is enabled in debug mode. +eexpect "root@" +send "show troubleshooting_mode\r" +eexpect "on" + eexpect "root@" send "\\set display_format csv\r\\set\r" eexpect "check_syntax,false" diff --git a/pkg/sql/exec_log.go b/pkg/sql/exec_log.go index 8488b510aebc..30517df79869 100644 --- a/pkg/sql/exec_log.go +++ b/pkg/sql/exec_log.go @@ -375,7 +375,7 @@ func (p *planner) maybeLogStatementInternal( p.logEventsOnlyExternally(ctx, eventLogEntry{event: &eventpb.AdminQuery{CommonSQLExecDetails: execDetails}}) } - if telemetryLoggingEnabled { + if telemetryLoggingEnabled && !p.SessionData().TroubleshootingMode { // We only log to the telemetry channel if enough time has elapsed from // the last event emission. requiredTimeElapsed := 1.0 / float64(maxEventFrequency) diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go index 56032b33d0dd..5698dd79bc3d 100644 --- a/pkg/sql/exec_util.go +++ b/pkg/sql/exec_util.go @@ -3251,6 +3251,10 @@ func (m *sessionDataMutator) SetUnconstrainedNonCoveringIndexScanEnabled(val boo m.data.UnconstrainedNonCoveringIndexScanEnabled = val } +func (m *sessionDataMutator) SetTroubleshootingModeEnabled(val bool) { + m.data.TroubleshootingMode = val +} + // Utility functions related to scrubbing sensitive information on SQL Stats. // quantizeCounts ensures that the Count field in the diff --git a/pkg/sql/logictest/testdata/logic_test/information_schema b/pkg/sql/logictest/testdata/logic_test/information_schema index 234310929302..588d62797f8c 100644 --- a/pkg/sql/logictest/testdata/logic_test/information_schema +++ b/pkg/sql/logictest/testdata/logic_test/information_schema @@ -4718,6 +4718,7 @@ transaction_rows_read_log 0 transaction_rows_written_err 0 transaction_rows_written_log 0 transaction_status NoTxn +troubleshooting_mode off unconstrained_non_covering_index_scan_enabled off xmloption content diff --git a/pkg/sql/logictest/testdata/logic_test/pg_catalog b/pkg/sql/logictest/testdata/logic_test/pg_catalog index a8cd5c37fd6d..cd6941d9fc1f 100644 --- a/pkg/sql/logictest/testdata/logic_test/pg_catalog +++ b/pkg/sql/logictest/testdata/logic_test/pg_catalog @@ -4243,6 +4243,7 @@ transaction_rows_read_log 0 NULL transaction_rows_written_err 0 NULL NULL NULL string transaction_rows_written_log 0 NULL NULL NULL string transaction_status NoTxn NULL NULL NULL string +troubleshooting_mode off NULL NULL NULL åstring unconstrained_non_covering_index_scan_enabled off NULL NULL NULL string use_declarative_schema_changer on NULL NULL NULL string vectorize on NULL NULL NULL string @@ -4367,6 +4368,7 @@ transaction_rows_read_log 0 NULL transaction_rows_written_err 0 NULL user NULL 0 0 transaction_rows_written_log 0 NULL user NULL 0 0 transaction_status NoTxn NULL user NULL NoTxn NoTxn +troubleshooting_mode off NULL user NULL off off unconstrained_non_covering_index_scan_enabled off NULL user NULL off off use_declarative_schema_changer on NULL user NULL on on vectorize on NULL user NULL on on @@ -4489,6 +4491,7 @@ transaction_rows_read_log NULL NULL NULL transaction_rows_written_err NULL NULL NULL NULL NULL transaction_rows_written_log NULL NULL NULL NULL NULL transaction_status NULL NULL NULL NULL NULL +troubleshooting_mode NULL NULL NULL NULL NULL unconstrained_non_covering_index_scan_enabled NULL NULL NULL NULL NULL use_declarative_schema_changer NULL NULL NULL NULL NULL vectorize NULL NULL NULL NULL NULL diff --git a/pkg/sql/logictest/testdata/logic_test/show_source b/pkg/sql/logictest/testdata/logic_test/show_source index 9fd11d1270e9..3b546931dabd 100644 --- a/pkg/sql/logictest/testdata/logic_test/show_source +++ b/pkg/sql/logictest/testdata/logic_test/show_source @@ -135,6 +135,7 @@ transaction_rows_read_log 0 transaction_rows_written_err 0 transaction_rows_written_log 0 transaction_status NoTxn +troubleshooting_mode off unconstrained_non_covering_index_scan_enabled off use_declarative_schema_changer on vectorize on diff --git a/pkg/sql/sessiondatapb/session_data.proto b/pkg/sql/sessiondatapb/session_data.proto index aba668768a74..c0e4cf1eddd0 100644 --- a/pkg/sql/sessiondatapb/session_data.proto +++ b/pkg/sql/sessiondatapb/session_data.proto @@ -89,6 +89,11 @@ message SessionData { // TrigramSimilarityThreshold configures the value that's used to compare // trigram similarities to in order to evaluate the string % string overload. double trigram_similarity_threshold = 20; + + // Troubleshooting mode determines whether we refuse to do additional work with + // the query (i.e. collect & emit telemetry data). Troubleshooting mode is + // disabled by default. + bool troubleshooting_mode = 21; } // DataConversionConfig contains the parameters that influence the output diff --git a/pkg/sql/telemetry_logging_test.go b/pkg/sql/telemetry_logging_test.go index 349c953330d5..810752c4c10e 100644 --- a/pkg/sql/telemetry_logging_test.go +++ b/pkg/sql/telemetry_logging_test.go @@ -284,3 +284,112 @@ func TestTelemetryLogging(t *testing.T) { } } } + +func TestNoTelemetryLogOnTroubleshootMode(t *testing.T) { + defer leaktest.AfterTest(t)() + sc := log.ScopeWithoutShowLogs(t) + defer sc.Close(t) + + cleanup := installTelemetryLogFileSink(sc, t) + defer cleanup() + + st := stubTime{} + + s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{ + Knobs: base.TestingKnobs{ + TelemetryLoggingKnobs: &TelemetryLoggingTestingKnobs{ + getTimeNow: st.TimeNow, + }, + }, + }) + db := sqlutils.MakeSQLRunner(sqlDB) + defer s.Stopper().Stop(context.Background()) + + db.Exec(t, `SET CLUSTER SETTING sql.telemetry.query_sampling.enabled = true;`) + db.Exec(t, "CREATE TABLE t();") + + stubMaxEventFrequency := int64(1) + telemetryMaxEventFrequency.Override(context.Background(), &s.ClusterSettings().SV, stubMaxEventFrequency) + + /* + Testing Cases: + - run query when troubleshoot mode is enabled + - ensure no log appears + - run another query when troubleshoot mode is disabled + - ensure log appears + */ + testData := []struct { + name string + query string + expectedLogStatement string + enableTroubleshootingMode bool + expectedNumLogs int + }{ + { + "select-troubleshooting-enabled", + "SELECT * FROM t LIMIT 1;", + `SELECT * FROM \"\".\"\".t LIMIT ‹1›`, + true, + 0, + }, + { + "select-troubleshooting-disabled", + "SELECT * FROM t LIMIT 2;", + `SELECT * FROM \"\".\"\".t LIMIT ‹2›`, + false, + 1, + }, + } + + for idx, tc := range testData { + // Set the time for when we issue a query to enable/disable + // troubleshooting mode. + setTroubleshootModeTime := timeutil.FromUnixMicros(int64(idx * 1e6)) + st.setTime(setTroubleshootModeTime) + if tc.enableTroubleshootingMode { + db.Exec(t, `SET troubleshooting_mode = true;`) + } else { + db.Exec(t, `SET troubleshooting_mode = false;`) + } + // Advance time 1 second from previous query. Ensure enough time has passed + // from when we set troubleshooting mode for this query to be sampled. + setQueryTime := timeutil.FromUnixMicros(int64((idx + 1) * 1e6)) + st.setTime(setQueryTime) + db.Exec(t, tc.query) + } + + log.Flush() + + entries, err := log.FetchEntriesFromFiles( + 0, + math.MaxInt64, + 10000, + regexp.MustCompile(`"EventType":"sampled_query"`), + log.WithMarkedSensitiveData, + ) + + if err != nil { + t.Fatal(err) + } + + if len(entries) == 0 { + t.Fatal(errors.Newf("no entries found")) + } + + for _, tc := range testData { + numLogsFound := 0 + for i := len(entries) - 1; i >= 0; i-- { + e := entries[i] + if strings.Contains(e.Message, tc.expectedLogStatement) { + if tc.enableTroubleshootingMode { + t.Errorf("%s: unexpected log entry when troubleshooting mode enabled:\n%s", tc.name, entries[0].Message) + } else { + numLogsFound++ + } + } + } + if numLogsFound != tc.expectedNumLogs { + t.Errorf("%s: expected %d log entries, found %d", tc.name, tc.expectedNumLogs, numLogsFound) + } + } +} diff --git a/pkg/sql/vars.go b/pkg/sql/vars.go index e0908419c7e7..293d98aa27c5 100644 --- a/pkg/sql/vars.go +++ b/pkg/sql/vars.go @@ -1160,6 +1160,23 @@ var varGen = map[string]sessionVar{ }, }, + // CockroachDB extension. + `troubleshooting_mode`: { + GetStringVal: makePostgresBoolGetStringValFn(`troubleshooting_mode`), + Set: func(_ context.Context, m sessionDataMutator, s string) error { + b, err := paramparse.ParseBoolVar("troubleshooting_mode", s) + if err != nil { + return err + } + m.SetTroubleshootingModeEnabled(b) + return nil + }, + Get: func(evalCtx *extendedEvalContext, _ *kv.Txn) (string, error) { + return formatBoolAsPostgresSetting(evalCtx.SessionData().TroubleshootingMode), nil + }, + GlobalDefault: globalFalse, + }, + // This is read-only in Postgres also. // See https://www.postgresql.org/docs/14/sql-show.html and // https://www.postgresql.org/docs/14/locale.html