Skip to content
This repository has been archived by the owner on Nov 24, 2023. It is now read-only.

Commit

Permalink
*: add panel for start leader and heartbeat update error (#853)
Browse files Browse the repository at this point in the history
  • Loading branch information
lance6716 authored Aug 6, 2020
1 parent d71b25d commit 0675980
Show file tree
Hide file tree
Showing 7 changed files with 260 additions and 38 deletions.
35 changes: 35 additions & 0 deletions dm/dm-ansible/conf/dm_worker.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,38 @@ groups:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
value: '{{ $value }}'
summary: dm syncer binlog file not catch up relay exceed 10 min

- alert: DM_worker_offline
expr: dm_master_worker_state == 0
for: 1h
labels:
env: ENV_LABELS_ENV
level: critical
expr: dm_master_worker_state == 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
value: '{{ $value }}'
summary: DM worker has been offline for more than 1h

- alert: DM_pending_DDL
expr: dm_master_ddl_state_number > 0
for: 1h
labels:
env: ENV_LABELS_ENV
level: critical
expr: dm_master_ddl_state_number > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
value: '{{ $value }}'
summary: DDL has been pending for more than 1h

- alert: DM_DDL_error
expr: increase(dm_master_shard_ddl_error[1m]) > 0
labels:
env: ENV_LABELS_ENV
level: critical
expr: increase(dm_master_shard_ddl_error[1m]) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
value: '{{ $value }}'
summary: a DDL error occurred
218 changes: 189 additions & 29 deletions dm/dm-ansible/scripts/dm.json
Original file line number Diff line number Diff line change
Expand Up @@ -761,8 +761,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -848,8 +846,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -935,8 +931,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1022,8 +1016,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1109,8 +1101,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1196,8 +1186,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1283,8 +1271,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1370,8 +1356,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1519,7 +1503,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 31
"y": 100
},
"id": 55,
"panels": [
Expand Down Expand Up @@ -2511,7 +2495,7 @@
"gridPos": {
"h": 7,
"w": 6,
"x": 18,
"x": 12,
"y": 17
},
"id": 25,
Expand Down Expand Up @@ -5343,10 +5327,98 @@
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"description": "Number of errors that occurred when updating the heartbeat",
"fill": 1,
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 47
},
"id": 90,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "dm_syncer_heartbeat_update_error",
"format": "time_series",
"instant": false,
"intervalFactor": 2,
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "heartbeat update error",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"transparent": true,
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": 0,
"format": "short",
"label": "",
"logBase": 1,
"max": "1",
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"repeat": null,
"title": "binlog replication",
"title": "binlog replication",
"type": "row"
},
{
Expand All @@ -5370,7 +5442,7 @@
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"x": 6,
"y": 48
},
"id": 83,
Expand Down Expand Up @@ -5458,7 +5530,7 @@
"gridPos": {
"h": 7,
"w": 6,
"x": 6,
"x": 12,
"y": 48
},
"id": 81,
Expand Down Expand Up @@ -5545,9 +5617,9 @@
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 12,
"y": 48
"w": 12,
"x": 0,
"y": 55
},
"id": 82,
"legend": {
Expand Down Expand Up @@ -5633,9 +5705,9 @@
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 18,
"y": 48
"w": 12,
"x": 12,
"y": 55
},
"id": 84,
"legend": {
Expand Down Expand Up @@ -5722,8 +5794,8 @@
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"y": 55
"x": 18,
"y": 48
},
"id": 85,
"legend": {
Expand Down Expand Up @@ -5798,6 +5870,94 @@
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"description": "Number of times dm-master tries to start leader components per minute",
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"y": 48
},
"id": 86,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "dm_master_start_leader_counter",
"format": "time_series",
"instant": false,
"intervalFactor": 2,
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "number of dm-masters start leader components per minute",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"transparent": true,
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": 0,
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"repeat": null,
Expand Down
2 changes: 2 additions & 0 deletions dm/master/election.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ func (s *Server) isLeaderAndNeedForward() (isLeader bool, needForward bool) {
}

func (s *Server) startLeaderComponent(ctx context.Context) bool {
metrics.ReportStartLeader()

// try to upgrade the cluster version if a member become the leader.
// so if the old leader failed when upgrading, the new leader can try again.
// NOTE: if the cluster has been upgraded, calling this method again should have no side effects.
Expand Down
Loading

0 comments on commit 0675980

Please sign in to comment.