metrics: add connection and fail metrics by resource group name #49424

Merged: 11 commits, Jan 3, 2024
14 changes: 7 additions & 7 deletions pkg/metrics/grafana/tidb_resource_control.json
@@ -228,7 +228,7 @@
"targets": [
{
"exemplar": true,
"expr": "(sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"|tp\"}[1m])) by (name) + sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"|tp\"}[1m])) by (name)) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (name)",
"expr": "(sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"|tp\"}[1m])) by (name) + sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"|tp\"}[1m])) by (name)) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (resource_group)",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
@@ -459,7 +459,7 @@
"targets": [
{
"exemplar": true,
"expr": "sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"|tp\"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (name)",
"expr": "sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"|tp\"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (resource_group)",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
@@ -691,7 +691,7 @@
"targets": [
{
"exemplar": true,
"expr": "sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"|tp\"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"|tp\"}[1m])) by (name)",
"expr": "sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"|tp\"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"|tp\"}[1m])) by (resource_group)",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
@@ -1043,7 +1043,7 @@
"targets": [
{
"exemplar": true,
"expr": "sum(rate(resource_manager_resource_request_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"read\"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (name)",
"expr": "sum(rate(resource_manager_resource_request_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"read\"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (resource_group)",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
Expand All @@ -1053,7 +1053,7 @@
},
{
"exemplar": true,
"expr": "sum(rate(resource_manager_resource_request_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"write\"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (name)",
"expr": "sum(rate(resource_manager_resource_request_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=\"write\"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (resource_group)",
"format": "time_series",
"hide": false,
"interval": "",
@@ -1297,7 +1297,7 @@
"targets": [
{
"exemplar": true,
"expr": "sum(rate(resource_manager_resource_read_byte_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (name)",
"expr": "sum(rate(resource_manager_resource_read_byte_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (resource_group)",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
@@ -1530,7 +1530,7 @@
"targets": [
{
"exemplar": true,
"expr": "sum(rate(resource_manager_resource_write_byte_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (name)",
"expr": "sum(rate(resource_manager_resource_write_byte_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (resource_group)",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
8 changes: 4 additions & 4 deletions pkg/metrics/server.go
@@ -31,7 +31,7 @@ var (
QueryDurationHistogram *prometheus.HistogramVec
QueryTotalCounter *prometheus.CounterVec
AffectedRowsCounter *prometheus.CounterVec
ConnGauge prometheus.Gauge
ConnGauge *prometheus.GaugeVec
DisconnectionCounter *prometheus.CounterVec
PreparedStmtGauge prometheus.Gauge
ExecuteErrorCounter *prometheus.CounterVec
@@ -107,13 +107,13 @@ func InitServerMetrics() {
Help: "Counters of server affected rows.",
}, []string{LblSQLType})

ConnGauge = NewGauge(
ConnGauge = NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "tidb",
Subsystem: "server",
Name: "connections",
Help: "Number of connections.",
})
}, []string{LblResourceGroup})

DisconnectionCounter = NewCounterVec(
prometheus.CounterOpts{
Expand All @@ -136,7 +136,7 @@ func InitServerMetrics() {
Subsystem: "server",
Name: "execute_error_total",
Help: "Counter of execute errors.",
}, []string{LblType, LblDb})
}, []string{LblType, LblDb, LblResourceGroup})

CriticalErrorCounter = NewCounter(
prometheus.CounterOpts{
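ConnGauge changes from a single prometheus.Gauge to a *prometheus.GaugeVec keyed by the resource group, and ExecuteErrorCounter gains the same label and follows the same WithLabelValues pattern. A minimal sketch of what the new shape means for callers, using client_golang directly rather than the tidb metrics wrappers, and assuming LblResourceGroup resolves to the "resource_group" label string that the dashboard groups by:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// One gauge value per resource group instead of a single scalar, e.g.
	// tidb_server_connections{resource_group="default"}, {resource_group="rg1"}, ...
	connGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "tidb",
		Subsystem: "server",
		Name:      "connections",
		Help:      "Number of connections.",
	}, []string{"resource_group"})

	connGauge.WithLabelValues("default").Set(3)
	connGauge.WithLabelValues("rg1").Set(1)

	// A label set keeps its last value until it is overwritten or deleted, so a
	// group whose last connection has closed needs an explicit Set(0) or
	// DeleteLabelValues call to avoid exporting a stale count.
	connGauge.WithLabelValues("rg1").Set(0)
	connGauge.DeleteLabelValues("rg1")

	fmt.Println("per-resource-group connection gauge populated")
}
```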
2 changes: 1 addition & 1 deletion pkg/metrics/session.go
@@ -218,7 +218,7 @@ func InitSessionMetrics() {
Subsystem: "session",
Name: "resource_group_query_total",
Help: "Counter of the total number of queries for the resource group",
}, []string{LblName})
}, []string{LblResourceGroup})

FairLockingUsageCount = NewCounterVec(
prometheus.CounterOpts{
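This relabeling is what the Grafana changes above depend on: the counter's only label moves from LblName to LblResourceGroup, so its exported series carry resource_group instead of name, and every dashboard expression that divides by this counter has to group its denominator with by (resource_group). A hedged sketch of the new series shape, again using client_golang directly rather than the tidb NewCounterVec wrapper:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	// The series becomes tidb_session_resource_group_query_total{resource_group="rg1"}.
	queryTotal := prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "tidb",
		Subsystem: "session",
		Name:      "resource_group_query_total",
		Help:      "Counter of the total number of queries for the resource group",
	}, []string{"resource_group"}) // previously []string{"name"}

	queryTotal.WithLabelValues("rg1").Inc()
	fmt.Println(testutil.ToFloat64(queryTotal.WithLabelValues("rg1"))) // 1
}
```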
27 changes: 21 additions & 6 deletions pkg/server/conn.go
@@ -358,17 +358,24 @@ func (cc *clientConn) handshake(ctx context.Context) error {
func (cc *clientConn) Close() error {
cc.server.rwlock.Lock()
delete(cc.server.clients, cc.connectionID)
connections := len(cc.server.clients)
connections := make(map[string]int, 0)
for _, conn := range cc.server.clients {
resourceGroup := conn.getCtx().GetSessionVars().ResourceGroupName
connections[resourceGroup]++
}
cc.server.rwlock.Unlock()
return closeConn(cc, connections)
}

// closeConn is idempotent and thread-safe.
// It will be called on the same `clientConn` more than once to avoid connection leak.
func closeConn(cc *clientConn, connections int) error {
func closeConn(cc *clientConn, connections map[string]int) error {
var err error
cc.closeOnce.Do(func() {
metrics.ConnGauge.Set(float64(connections))
for name, count := range connections {
metrics.ConnGauge.WithLabelValues(name).Set(float64(count))
}

if cc.connectionID > 0 {
cc.server.dom.ReleaseConnID(cc.connectionID)
cc.connectionID = 0
Expand All @@ -392,7 +399,13 @@ func closeConn(cc *clientConn, connections int) error {

func (cc *clientConn) closeWithoutLock() error {
delete(cc.server.clients, cc.connectionID)
return closeConn(cc, len(cc.server.clients))
connections := make(map[string]int, 0)
for _, conn := range cc.server.clients {
resourceGroup := conn.getCtx().GetSessionVars().ResourceGroupName
connections[resourceGroup]++
}

return closeConn(cc, connections)
}

// writeInitialHandshake sends server version, connection ID, server capability, collation, server status
@@ -1119,9 +1132,11 @@ func (cc *clientConn) Run(ctx context.Context) {
if ctx := cc.getCtx(); ctx != nil {
txnMode = ctx.GetSessionVars().GetReadableTxnMode()
}
for _, dbName := range session.GetDBNames(cc.getCtx().GetSessionVars()) {
metrics.ExecuteErrorCounter.WithLabelValues(metrics.ExecuteErrorToLabel(err), dbName).Inc()
vars := cc.getCtx().GetSessionVars()
for _, dbName := range session.GetDBNames(vars) {
metrics.ExecuteErrorCounter.WithLabelValues(metrics.ExecuteErrorToLabel(err), dbName, vars.ResourceGroupName).Inc()
}

if storeerr.ErrLockAcquireFailAndNoWaitSet.Equal(err) {
logutil.Logger(ctx).Debug("Expected error for FOR UPDATE NOWAIT", zap.Error(err))
} else {
2 changes: 1 addition & 1 deletion pkg/server/conn_test.go
@@ -2065,7 +2065,7 @@ func TestCloseConn(t *testing.T) {
for i := 0; i < numGoroutines; i++ {
go func() {
defer wg.Done()
err := closeConn(cc, 1)
err := closeConn(cc, map[string]int{"": 1})
require.NoError(t, err)
}()
}
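The existing test only needs the signature change above. A hypothetical companion check (not part of this PR) could read the relabeled gauge back through client_golang's testutil helper; it assumes InitServerMetrics from pkg/metrics/server.go has already run in the test process:

```go
package server

import (
	"testing"

	"github.com/pingcap/tidb/pkg/metrics"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"
)

// TestConnGaugeByResourceGroup is a hypothetical sketch: it exercises only the
// relabeled gauge, not the connection plumbing covered by TestCloseConn.
func TestConnGaugeByResourceGroup(t *testing.T) {
	metrics.ConnGauge.WithLabelValues("rg1").Set(2)
	require.Equal(t, 2.0, testutil.ToFloat64(metrics.ConnGauge.WithLabelValues("rg1")))
}
```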
19 changes: 16 additions & 3 deletions pkg/server/server.go
@@ -594,7 +594,13 @@ func (s *Server) Close() {
func (s *Server) registerConn(conn *clientConn) bool {
s.rwlock.Lock()
defer s.rwlock.Unlock()
connections := len(s.clients)
connections := make(map[string]int, 0)
for _, conn := range s.clients {
resourceGroup := conn.getCtx().GetSessionVars().ResourceGroupName
connections[resourceGroup]++
}

logger := logutil.BgLogger()
if s.inShutdownMode.Load() {
Expand All @@ -603,8 +609,15 @@ func (s *Server) registerConn(conn *clientConn) bool {
return false
}
s.clients[conn.connectionID] = conn
connections = len(s.clients)
metrics.ConnGauge.Set(float64(connections))
connectionMap := make(map[string]int, 0)
for _, conn := range s.clients {
resourceGroup := conn.getCtx().GetSessionVars().ResourceGroupName
connectionMap[resourceGroup]++
}

for name, count := range connectionMap {
metrics.ConnGauge.WithLabelValues(name).Set(float64(count))
}
return true
}
