From 559f20545e3c7a9ebaead4e873c3f03f868de118 Mon Sep 17 00:00:00 2001 From: Azhng Date: Thu, 29 Apr 2021 16:44:50 -0400 Subject: [PATCH] sql: exclude Bulk IO operations from service latency metrics Previously, we included Bulk IO operations such as BACKUP in our service latency metrics. This caused spikes in our time series graph for service latency, which can lead to confusion. Closes #42174 Release note (sql change): Bulk IO operations will no longer be included in service latency metrics. --- pkg/sql/executor_statement_metrics.go | 38 ++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/pkg/sql/executor_statement_metrics.go b/pkg/sql/executor_statement_metrics.go index 481deb9b2db5..1f99d1600cb9 100644 --- a/pkg/sql/executor_statement_metrics.go +++ b/pkg/sql/executor_statement_metrics.go @@ -184,6 +184,7 @@ func (ex *connExecutor) recordStatementSummary( execOverhead := svcLat - processingLat stmt := &planner.stmt + shouldIncludeInLatencyMetrics := shouldIncludeStmtInLatencyMetrics(stmt) flags := planner.curPlan.flags if automaticRetryCount == 0 { ex.updateOptCounters(flags) @@ -192,11 +193,15 @@ func (ex *connExecutor) recordStatementSummary( if _, ok := stmt.AST.(*tree.Select); ok { m.DistSQLSelectCount.Inc(1) } - m.DistSQLExecLatency.RecordValue(runLatRaw.Nanoseconds()) - m.DistSQLServiceLatency.RecordValue(svcLatRaw.Nanoseconds()) + if shouldIncludeInLatencyMetrics { + m.DistSQLExecLatency.RecordValue(runLatRaw.Nanoseconds()) + m.DistSQLServiceLatency.RecordValue(svcLatRaw.Nanoseconds()) + } + } + if shouldIncludeInLatencyMetrics { + m.SQLExecLatency.RecordValue(runLatRaw.Nanoseconds()) + m.SQLServiceLatency.RecordValue(svcLatRaw.Nanoseconds()) } - m.SQLExecLatency.RecordValue(runLatRaw.Nanoseconds()) - m.SQLServiceLatency.RecordValue(svcLatRaw.Nanoseconds()) } stmtID := ex.statsCollector.recordStatement( @@ -257,3 +262,28 @@ func (ex *connExecutor) updateOptCounters(planFlags planFlags) { m.SQLOptPlanCacheMisses.Inc(1) } } + +// Bulk IO operations cause spikes in our time series chart for service latency. We exclude them from the service +// latency metrics to avoid confusions. +func shouldIncludeStmtInLatencyMetrics(stmt *Statement) bool { + switch stmt.AST.(type) { + case *tree.Backup: + return false + case *tree.ShowBackup: + return false + case *tree.Restore: + return false + case *tree.Import: + return false + case *tree.Export: + return false + case *tree.ScheduledBackup: + return false + case *tree.StreamIngestion: + return false + case *tree.ReplicationStream: + return false + } + + return true +}