diff --git a/cdc/kv/metrics.go b/cdc/kv/metrics.go index b8d9067d136..f2f2c022553 100644 --- a/cdc/kv/metrics.go +++ b/cdc/kv/metrics.go @@ -158,6 +158,13 @@ var ( Name: "region_worker_channel_size", Help: "size of each channel in region worker", }, []string{"namespace", "changefeed", "table", "store", "type"}) + slowInitializeRegion = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "ticdc", + Subsystem: "kvclient", + Name: "slow_initialize_region_count", + Help: "the number of slow initialize region", + }, []string{"namespace", "changefeed"}) ) // GetGlobalGrpcMetrics gets the global grpc metrics. @@ -184,6 +191,7 @@ func InitMetrics(registry *prometheus.Registry) { registry.MustRegister(regionWorkerQueueDuration) registry.MustRegister(workerBusyRatio) registry.MustRegister(workerChannelSize) + registry.MustRegister(slowInitializeRegion) // Register client metrics to registry. registry.MustRegister(grpcMetrics) diff --git a/cdc/kv/shared_client.go b/cdc/kv/shared_client.go index 308c5d00845..4ee7a92c074 100644 --- a/cdc/kv/shared_client.go +++ b/cdc/kv/shared_client.go @@ -723,6 +723,7 @@ func (s *SharedClient) logSlowRegions(ctx context.Context) error { currTime := s.pdClock.CurrentTime() s.totalSpans.RLock() + slowInitializeRegion := 0 for subscriptionID, rt := range s.totalSpans.v { attr := rt.rangeLock.CollectLockedRangeAttrs(nil) ckptTime := oracle.GetTimeFromTS(attr.SlowestRegion.ResolvedTs) @@ -735,6 +736,7 @@ func (s *SharedClient) logSlowRegions(ctx context.Context) error { zap.Any("slowRegion", attr.SlowestRegion)) } } else if currTime.Sub(attr.SlowestRegion.Created) > 10*time.Minute { + slowInitializeRegion += 1 log.Info("event feed initializes a region too slow", zap.String("namespace", s.changefeed.Namespace), zap.String("changefeed", s.changefeed.ID), @@ -756,6 +758,7 @@ func (s *SharedClient) logSlowRegions(ctx context.Context) error { } } s.totalSpans.RUnlock() + s.metrics.slowInitializeRegion.Set(float64(slowInitializeRegion)) } } @@ 
-810,6 +813,7 @@ type sharedClientMetrics struct { batchResolvedSize prometheus.Observer lockResolveWaitDuration prometheus.Observer lockResolveRunDuration prometheus.Observer + slowInitializeRegion prometheus.Gauge } func (s *SharedClient) initMetrics() { @@ -829,6 +833,9 @@ func (s *SharedClient) initMetrics() { s.metrics.batchResolvedSize = batchResolvedEventSize. WithLabelValues(s.changefeed.Namespace, s.changefeed.ID) + + s.metrics.slowInitializeRegion = slowInitializeRegion. + WithLabelValues(s.changefeed.Namespace, s.changefeed.ID) } func (s *SharedClient) clearMetrics() { diff --git a/metrics/grafana/ticdc.json b/metrics/grafana/ticdc.json index 48eacb2da5e..472e17015f8 100644 --- a/metrics/grafana/ticdc.json +++ b/metrics/grafana/ticdc.json @@ -15197,6 +15197,106 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "dashLength": 10, + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 122 + }, + "id": 10039, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "pluginVersion": "7.5.17", + "pointradius": 2, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "targets": [ + { + "expr": "ticdc_kvclient_slow_initialize_region_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$ticdc_instance\"}", + "legendFormat": "{{instance}}-{{changefeed}}-{{namespace}}", + "interval": "", + "exemplar": true, + "hide": false, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "KV client slow initialize region count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": 
null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true, + "$$hashKey": "object:432" + }, + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false, + "$$hashKey": "object:433" + } + ], + "yaxis": { + "align": false, + "alignLevel": null + }, + "description": "The count of regions that are slow to initialize. Search the log for [event feed initializes a region too slow] to get the slow region IDs", + "bars": false, + "dashes": false, + "fillGradient": 0, + "hiddenSeries": false, + "percentage": false, + "points": false, + "stack": false, + "steppedLine": false, + "timeFrom": null, + "timeShift": null } ], "title": "KVClient",