Skip to content
This repository has been archived by the owner on Nov 24, 2023. It is now read-only.

*: add panel for start leader and heartbeat update error #853

Merged
merged 9 commits into from
Aug 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions dm/dm-ansible/conf/dm_worker.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,38 @@ groups:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
value: '{{ $value }}'
summary: dm syncer binlog file not catch up relay exceed 10 min

- alert: DM_worker_offline
expr: dm_master_worker_state == 0
for: 1h
labels:
env: ENV_LABELS_ENV
level: critical
expr: dm_master_worker_state == 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
value: '{{ $value }}'
summary: dm worker offline exceed 1h

- alert: DM_pending_DDL
expr: dm_master_ddl_state_number > 0
for: 1h
labels:
env: ENV_LABELS_ENV
level: critical
expr: dm_master_ddl_state_number > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
value: '{{ $value }}'
summary: DDL pending exceed 1h

- alert: DM_DDL_error
expr: increase(dm_master_shard_ddl_error[1m]) > 0
labels:
env: ENV_LABELS_ENV
level: critical
expr: increase(dm_master_shard_ddl_error[1m]) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
value: '{{ $value }}'
summary: DDL error happens
218 changes: 189 additions & 29 deletions dm/dm-ansible/scripts/dm.json
Original file line number Diff line number Diff line change
Expand Up @@ -761,8 +761,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -848,8 +846,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -935,8 +931,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1022,8 +1016,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1109,8 +1101,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1196,8 +1186,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1283,8 +1271,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1370,8 +1356,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1519,7 +1503,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 31
"y": 100
},
"id": 55,
"panels": [
Expand Down Expand Up @@ -2511,7 +2495,7 @@
"gridPos": {
"h": 7,
"w": 6,
"x": 18,
"x": 12,
"y": 17
},
"id": 25,
Expand Down Expand Up @@ -5343,10 +5327,98 @@
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"description": "Number of errors that occurred while updating the heartbeat",
"fill": 1,
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 47
},
"id": 90,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "dm_syncer_heartbeat_update_error",
"format": "time_series",
"instant": false,
"intervalFactor": 2,
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "heartbeat update error",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"transparent": true,
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": 0,
"format": "short",
"label": "",
"logBase": 1,
"max": "1",
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"repeat": null,
"title": "binlog replication",
"title": "binlog replication",
"type": "row"
},
{
Expand All @@ -5370,7 +5442,7 @@
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"x": 6,
"y": 48
},
"id": 83,
Expand Down Expand Up @@ -5458,7 +5530,7 @@
"gridPos": {
"h": 7,
"w": 6,
"x": 6,
"x": 12,
"y": 48
},
"id": 81,
Expand Down Expand Up @@ -5545,9 +5617,9 @@
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 12,
"y": 48
"w": 12,
"x": 0,
"y": 55
},
"id": 82,
"legend": {
Expand Down Expand Up @@ -5633,9 +5705,9 @@
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 18,
"y": 48
"w": 12,
"x": 12,
"y": 55
},
"id": 84,
"legend": {
Expand Down Expand Up @@ -5722,8 +5794,8 @@
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"y": 55
"x": 18,
"y": 48
},
"id": 85,
"legend": {
Expand Down Expand Up @@ -5798,6 +5870,94 @@
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"description": "Number of attempts by dm-master instances to start leader components per minute",
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"y": 48
},
"id": 86,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "dm_master_start_leader_counter",
"format": "time_series",
"instant": false,
"intervalFactor": 2,
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "number of dm-masters start leader components per minute",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"transparent": true,
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": 0,
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"repeat": null,
Expand Down
2 changes: 2 additions & 0 deletions dm/master/election.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ func (s *Server) isLeaderAndNeedForward() (isLeader bool, needForward bool) {
}

func (s *Server) startLeaderComponent(ctx context.Context) bool {
metrics.ReportStartLeader()

// try to upgrade the cluster version if a member become the leader.
// so if the old leader failed when upgrading, the new leader can try again.
// NOTE: if the cluster has been upgraded, calling this method again should have no side effects.
Expand Down
Loading