Skip to content

Commit

Permalink
ticdc: Add metrics to observe slow initialize region count (#10863)
Browse files Browse the repository at this point in the history
ref #10862
  • Loading branch information
hongyunyan authored Mar 29, 2024
1 parent e9ce067 commit db60e9e
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 0 deletions.
8 changes: 8 additions & 0 deletions cdc/kv/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,13 @@ var (
Name: "region_worker_channel_size",
Help: "size of each channel in region worker",
}, []string{"namespace", "changefeed", "table", "store", "type"})
// slowInitializeRegion is a gauge vector exported as
// ticdc_kvclient_slow_initialize_region_count and labeled by namespace and
// changefeed. It reports how many regions are slow to initialize for a
// changefeed.
// NOTE(review): the value appears to be recomputed from scratch on each
// periodic slow-region scan and to count at most one region (the slowest) per
// subscription — confirm against the code that sets it.
slowInitializeRegion = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "ticdc",
Subsystem: "kvclient",
Name: "slow_initialize_region_count",
Help: "the number of slow initialize region",
}, []string{"namespace", "changefeed"})
)

// GetGlobalGrpcMetrics gets the global grpc metrics.
Expand All @@ -184,6 +191,7 @@ func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(regionWorkerQueueDuration)
registry.MustRegister(workerBusyRatio)
registry.MustRegister(workerChannelSize)
registry.MustRegister(slowInitializeRegion)

// Register client metrics to registry.
registry.MustRegister(grpcMetrics)
Expand Down
7 changes: 7 additions & 0 deletions cdc/kv/shared_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,7 @@ func (s *SharedClient) logSlowRegions(ctx context.Context) error {

currTime := s.pdClock.CurrentTime()
s.totalSpans.RLock()
slowInitializeRegion := 0
for subscriptionID, rt := range s.totalSpans.v {
attr := rt.rangeLock.CollectLockedRangeAttrs(nil)
ckptTime := oracle.GetTimeFromTS(attr.SlowestRegion.ResolvedTs)
Expand All @@ -735,6 +736,7 @@ func (s *SharedClient) logSlowRegions(ctx context.Context) error {
zap.Any("slowRegion", attr.SlowestRegion))
}
} else if currTime.Sub(attr.SlowestRegion.Created) > 10*time.Minute {
slowInitializeRegion += 1
log.Info("event feed initializes a region too slow",
zap.String("namespace", s.changefeed.Namespace),
zap.String("changefeed", s.changefeed.ID),
Expand All @@ -756,6 +758,7 @@ func (s *SharedClient) logSlowRegions(ctx context.Context) error {
}
}
s.totalSpans.RUnlock()
s.metrics.slowInitializeRegion.Set(float64(slowInitializeRegion))
}
}

Expand Down Expand Up @@ -810,6 +813,7 @@ type sharedClientMetrics struct {
batchResolvedSize prometheus.Observer
lockResolveWaitDuration prometheus.Observer
lockResolveRunDuration prometheus.Observer
slowInitializeRegion prometheus.Gauge
}

func (s *SharedClient) initMetrics() {
Expand All @@ -829,6 +833,9 @@ func (s *SharedClient) initMetrics() {

s.metrics.batchResolvedSize = batchResolvedEventSize.
WithLabelValues(s.changefeed.Namespace, s.changefeed.ID)

s.metrics.slowInitializeRegion = slowInitializeRegion.
WithLabelValues(s.changefeed.Namespace, s.changefeed.ID)
}

func (s *SharedClient) clearMetrics() {
Expand Down
100 changes: 100 additions & 0 deletions metrics/grafana/ticdc.json
Original file line number Diff line number Diff line change
Expand Up @@ -15197,6 +15197,106 @@
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"dashLength": 10,
"datasource": "${DS_TEST-CLUSTER}",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 1,
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 122
},
"id": 10039,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"max": true,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"pluginVersion": "7.5.17",
"pointradius": 2,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"targets": [
{
"expr": "ticdc_kvclient_slow_initialize_region_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$ticdc_instance\"}",
"legendFormat": "{{instance}}-{{changefeed}}-{{namespace}}",
"interval": "",
"exemplar": true,
"hide": false,
"refId": "A"
}
],
"thresholds": [],
"timeRegions": [],
"title": "KV client slow initalize region count",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "none",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true,
"$$hashKey": "object:432"
},
{
"format": "none",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false,
"$$hashKey": "object:433"
}
],
"yaxis": {
"align": false,
"alignLevel": null
},
"description": "The count of regions that initialize slow. You can search the log [event feed initializes a region too slow] to get slow region id",
"bars": false,
"dashes": false,
"fillGradient": 0,
"hiddenSeries": false,
"percentage": false,
"points": false,
"stack": false,
"steppedLine": false,
"timeFrom": null,
"timeShift": null
}
],
"title": "KVClient",
Expand Down

0 comments on commit db60e9e

Please sign in to comment.