From def365820d1b51746be00e70cb9b7b4d602a2c08 Mon Sep 17 00:00:00 2001
From: Jeffrey Smith II <jsmith@influxdata.com>
Date: Tue, 10 Jan 2023 13:37:11 -0500
Subject: [PATCH] feat: add the ability to log queries killed by
 `query-timeout` (#23978)

* feat: add the ability to log queries killed by `query-timeout`

* chore: update example config

* chore: improve logging details
---
 cmd/influxd/run/server.go |  7 +++++--
 coordinator/config.go     |  3 +++
 etc/config.sample.toml    |  3 +++
 query/task_manager.go     | 11 +++++++++++
 4 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/cmd/influxd/run/server.go b/cmd/influxd/run/server.go
index 6ac186c97aa..805f5ecc6e7 100644
--- a/cmd/influxd/run/server.go
+++ b/cmd/influxd/run/server.go
@@ -234,6 +234,7 @@ func NewServer(c *Config, buildInfo *BuildInfo) (*Server, error) {
 	s.QueryExecutor.TaskManager.QueryTimeout = time.Duration(c.Coordinator.QueryTimeout)
 	s.QueryExecutor.TaskManager.LogQueriesAfter = time.Duration(c.Coordinator.LogQueriesAfter)
 	s.QueryExecutor.TaskManager.MaxConcurrentQueries = c.Coordinator.MaxConcurrentQueries
+	s.QueryExecutor.TaskManager.LogTimedoutQueries = c.Coordinator.LogTimedOutQueries
 
 	// Initialize the monitor
 	s.Monitor.Version = s.buildInfo.Version
@@ -465,8 +466,10 @@ func (s *Server) Open() error {
 	s.TSDBStore.WithLogger(s.Logger)
 	if s.config.Data.QueryLogEnabled {
 		s.QueryExecutor.WithLogger(s.Logger)
-	} else if s.config.Coordinator.LogQueriesAfter > 0 {
-		// Log long-running queries even if not logging all queries
+	} else if s.config.Coordinator.LogQueriesAfter > 0 || s.config.Coordinator.LogTimedOutQueries {
+		// If we need to do any logging, add a logger.
+		// The TaskManager properly handles both of the above configs
+		// so it only logs as is appropriate.
 		s.QueryExecutor.TaskManager.Logger = s.Logger
 	}
 	s.PointsWriter.WithLogger(s.Logger)
diff --git a/coordinator/config.go b/coordinator/config.go
index 7ab306b0421..b26affbe7c5 100644
--- a/coordinator/config.go
+++ b/coordinator/config.go
@@ -33,6 +33,7 @@ type Config struct {
 	MaxConcurrentQueries int           `toml:"max-concurrent-queries"`
 	QueryTimeout         toml.Duration `toml:"query-timeout"`
 	LogQueriesAfter      toml.Duration `toml:"log-queries-after"`
+	LogTimedOutQueries   bool          `toml:"log-timedout-queries"`
 	MaxSelectPointN      int           `toml:"max-select-point"`
 	MaxSelectSeriesN     int           `toml:"max-select-series"`
 	MaxSelectBucketsN    int           `toml:"max-select-buckets"`
@@ -48,6 +49,7 @@ func NewConfig() Config {
 		MaxSelectPointN:      DefaultMaxSelectPointN,
 		MaxSelectSeriesN:     DefaultMaxSelectSeriesN,
 		TerminationQueryLog:  false,
+		LogTimedOutQueries:   false,
 	}
 }
 
@@ -58,6 +60,7 @@ func (c Config) Diagnostics() (*diagnostics.Diagnostics, error) {
 		"max-concurrent-queries": c.MaxConcurrentQueries,
 		"query-timeout":          c.QueryTimeout,
 		"log-queries-after":      c.LogQueriesAfter,
+		"log-timedout-queries":   c.LogTimedOutQueries,
 		"max-select-point":       c.MaxSelectPointN,
 		"max-select-series":      c.MaxSelectSeriesN,
 		"max-select-buckets":     c.MaxSelectBucketsN,
diff --git a/etc/config.sample.toml b/etc/config.sample.toml
index bba30b1ab23..22c659114de 100644
--- a/etc/config.sample.toml
+++ b/etc/config.sample.toml
@@ -185,6 +185,9 @@
   # discover slow or resource intensive queries.  Setting the value to 0 disables the slow query logging.
   # log-queries-after = "0s"
 
+  # Enables the logging of queries that are killed as a result of exceeding `query-timeout`
+  # log-timedout-queries = false
+
   # The maximum number of points a SELECT can process.  A value of 0 will make
   # the maximum point count unlimited.  This will only be checked every second so queries will not
   # be aborted immediately when hitting the limit.
diff --git a/query/task_manager.go b/query/task_manager.go
index 159da6666f9..a05f3156a40 100644
--- a/query/task_manager.go
+++ b/query/task_manager.go
@@ -72,6 +72,9 @@ type TaskManager struct {
 	// If zero, slow queries will never be logged.
 	LogQueriesAfter time.Duration
 
+	// If true, queries that are killed due to `query-timeout` will be logged.
+	LogTimedoutQueries bool
+
 	// Maximum number of concurrent queries.
 	MaxConcurrentQueries int
 
@@ -315,6 +318,14 @@ func (t *TaskManager) waitForQuery(qid uint64, interrupt <-chan struct{}, closin
 
 		t.queryError(qid, err)
 	case <-timerCh:
+		if t.LogTimedoutQueries {
+			t.Logger.Warn(
+				"query killed for exceeding timeout limit",
+				zap.String("query", t.queries[qid].query),
+				zap.String("database", t.queries[qid].database),
+				zap.String("timeout", prettyTime(t.QueryTimeout).String()),
+			)
+		}
 		t.queryError(qid, ErrQueryTimeoutLimitExceeded)
 	case <-interrupt:
 		// Query was manually closed so exit the select.