From def365820d1b51746be00e70cb9b7b4d602a2c08 Mon Sep 17 00:00:00 2001 From: Jeffrey Smith II <jsmith@influxdata.com> Date: Tue, 10 Jan 2023 13:37:11 -0500 Subject: [PATCH] feat: add the ability to log queries killed by `query-timeout` (#23978) * feat: add the ability to log queries killed by `query-timeout` * chore: update example config * chore: improve logging details --- cmd/influxd/run/server.go | 7 +++++-- coordinator/config.go | 3 +++ etc/config.sample.toml | 3 +++ query/task_manager.go | 11 +++++++++++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/cmd/influxd/run/server.go b/cmd/influxd/run/server.go index 6ac186c97aa..805f5ecc6e7 100644 --- a/cmd/influxd/run/server.go +++ b/cmd/influxd/run/server.go @@ -234,6 +234,7 @@ func NewServer(c *Config, buildInfo *BuildInfo) (*Server, error) { s.QueryExecutor.TaskManager.QueryTimeout = time.Duration(c.Coordinator.QueryTimeout) s.QueryExecutor.TaskManager.LogQueriesAfter = time.Duration(c.Coordinator.LogQueriesAfter) s.QueryExecutor.TaskManager.MaxConcurrentQueries = c.Coordinator.MaxConcurrentQueries + s.QueryExecutor.TaskManager.LogTimedoutQueries = c.Coordinator.LogTimedOutQueries // Initialize the monitor s.Monitor.Version = s.buildInfo.Version @@ -465,8 +466,10 @@ func (s *Server) Open() error { s.TSDBStore.WithLogger(s.Logger) if s.config.Data.QueryLogEnabled { s.QueryExecutor.WithLogger(s.Logger) - } else if s.config.Coordinator.LogQueriesAfter > 0 { - // Log long-running queries even if not logging all queries + } else if s.config.Coordinator.LogQueriesAfter > 0 || s.config.Coordinator.LogTimedOutQueries { + // If we need to do any logging, add a logger. + // The TaskManager properly handles both of the above configs + // so it only logs as is appropriate. s.QueryExecutor.TaskManager.Logger = s.Logger } s.PointsWriter.WithLogger(s.Logger) diff --git a/coordinator/config.go b/coordinator/config.go index 7ab306b0421..b26affbe7c5 100644 --- a/coordinator/config.go +++ b/coordinator/config.go @@ -33,6 +33,7 @@ type Config struct { MaxConcurrentQueries int `toml:"max-concurrent-queries"` QueryTimeout toml.Duration `toml:"query-timeout"` LogQueriesAfter toml.Duration `toml:"log-queries-after"` + LogTimedOutQueries bool `toml:"log-timedout-queries"` MaxSelectPointN int `toml:"max-select-point"` MaxSelectSeriesN int `toml:"max-select-series"` MaxSelectBucketsN int `toml:"max-select-buckets"` @@ -48,6 +49,7 @@ func NewConfig() Config { MaxSelectPointN: DefaultMaxSelectPointN, MaxSelectSeriesN: DefaultMaxSelectSeriesN, TerminationQueryLog: false, + LogTimedOutQueries: false, } } @@ -58,6 +60,7 @@ func (c Config) Diagnostics() (*diagnostics.Diagnostics, error) { "max-concurrent-queries": c.MaxConcurrentQueries, "query-timeout": c.QueryTimeout, "log-queries-after": c.LogQueriesAfter, + "log-timedout-queries": c.LogTimedOutQueries, "max-select-point": c.MaxSelectPointN, "max-select-series": c.MaxSelectSeriesN, "max-select-buckets": c.MaxSelectBucketsN, diff --git a/etc/config.sample.toml b/etc/config.sample.toml index bba30b1ab23..22c659114de 100644 --- a/etc/config.sample.toml +++ b/etc/config.sample.toml @@ -185,6 +185,9 @@ # discover slow or resource intensive queries. Setting the value to 0 disables the slow query logging. # log-queries-after = "0s" + # Enables the logging of queries that are killed as a result of exceeding `query-timeout` + # log-timedout-queries = false + # The maximum number of points a SELECT can process. A value of 0 will make # the maximum point count unlimited. This will only be checked every second so queries will not # be aborted immediately when hitting the limit. diff --git a/query/task_manager.go b/query/task_manager.go index 159da6666f9..a05f3156a40 100644 --- a/query/task_manager.go +++ b/query/task_manager.go @@ -72,6 +72,9 @@ type TaskManager struct { // If zero, slow queries will never be logged. LogQueriesAfter time.Duration + // If true, queries that are killed due to `query-timeout` will be logged. + LogTimedoutQueries bool + // Maximum number of concurrent queries. MaxConcurrentQueries int @@ -315,6 +318,14 @@ func (t *TaskManager) waitForQuery(qid uint64, interrupt <-chan struct{}, closin t.queryError(qid, err) case <-timerCh: + if t.LogTimedoutQueries { + t.Logger.Warn( + "query killed for exceeding timeout limit", + zap.String("query", t.queries[qid].query), + zap.String("database", t.queries[qid].database), + zap.String("timeout", prettyTime(t.QueryTimeout).String()), + ) + } t.queryError(qid, ErrQueryTimeoutLimitExceeded) case <-interrupt: // Query was manually closed so exit the select.