From 559f20545e3c7a9ebaead4e873c3f03f868de118 Mon Sep 17 00:00:00 2001
From: Azhng <archer.xn@gmail.com>
Date: Thu, 29 Apr 2021 16:44:50 -0400
Subject: [PATCH] sql: exclude Bulk IO operations from service latency metrics

Previously, we included Bulk IO operations such as BACKUP
in our service latency metrics. This caused spikes in our time
series graph for service latency, which can lead to confusion.

Closes #42174

Release note (sql change): Bulk IO operations will no longer be included
 in service latency metrics.
---
 pkg/sql/executor_statement_metrics.go | 38 ++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/pkg/sql/executor_statement_metrics.go b/pkg/sql/executor_statement_metrics.go
index 481deb9b2db5..1f99d1600cb9 100644
--- a/pkg/sql/executor_statement_metrics.go
+++ b/pkg/sql/executor_statement_metrics.go
@@ -184,6 +184,7 @@ func (ex *connExecutor) recordStatementSummary(
 	execOverhead := svcLat - processingLat
 
 	stmt := &planner.stmt
+	shouldIncludeInLatencyMetrics := shouldIncludeStmtInLatencyMetrics(stmt)
 	flags := planner.curPlan.flags
 	if automaticRetryCount == 0 {
 		ex.updateOptCounters(flags)
@@ -192,11 +193,15 @@ func (ex *connExecutor) recordStatementSummary(
 			if _, ok := stmt.AST.(*tree.Select); ok {
 				m.DistSQLSelectCount.Inc(1)
 			}
-			m.DistSQLExecLatency.RecordValue(runLatRaw.Nanoseconds())
-			m.DistSQLServiceLatency.RecordValue(svcLatRaw.Nanoseconds())
+			if shouldIncludeInLatencyMetrics {
+				m.DistSQLExecLatency.RecordValue(runLatRaw.Nanoseconds())
+				m.DistSQLServiceLatency.RecordValue(svcLatRaw.Nanoseconds())
+			}
+		}
+		if shouldIncludeInLatencyMetrics {
+			m.SQLExecLatency.RecordValue(runLatRaw.Nanoseconds())
+			m.SQLServiceLatency.RecordValue(svcLatRaw.Nanoseconds())
 		}
-		m.SQLExecLatency.RecordValue(runLatRaw.Nanoseconds())
-		m.SQLServiceLatency.RecordValue(svcLatRaw.Nanoseconds())
 	}
 
 	stmtID := ex.statsCollector.recordStatement(
@@ -257,3 +262,28 @@ func (ex *connExecutor) updateOptCounters(planFlags planFlags) {
 		m.SQLOptPlanCacheMisses.Inc(1)
 	}
 }
+
+// Bulk IO operations cause spikes in our time series chart for service latency. We exclude them from the service
+// latency metrics to avoid confusions.
+func shouldIncludeStmtInLatencyMetrics(stmt *Statement) bool {
+	switch stmt.AST.(type) {
+	case *tree.Backup:
+		return false
+	case *tree.ShowBackup:
+		return false
+	case *tree.Restore:
+		return false
+	case *tree.Import:
+		return false
+	case *tree.Export:
+		return false
+	case *tree.ScheduledBackup:
+		return false
+	case *tree.StreamIngestion:
+		return false
+	case *tree.ReplicationStream:
+		return false
+	}
+
+	return true
+}