Skip to content
This repository has been archived by the owner on Nov 24, 2023. It is now read-only.

*: add panel for start leader and heartbeat update error #853

Merged
merged 9 commits into from
Aug 6, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 189 additions & 29 deletions dm/dm-ansible/scripts/dm.json
Original file line number Diff line number Diff line change
Expand Up @@ -761,8 +761,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -848,8 +846,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -935,8 +931,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1022,8 +1016,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1109,8 +1101,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1196,8 +1186,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1283,8 +1271,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1370,8 +1356,6 @@
"format": "short",
"label": null,
"logBase": 1,
"max": "5",
"min": "0",
"show": true
},
{
Expand Down Expand Up @@ -1519,7 +1503,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 31
"y": 100
},
"id": 55,
"panels": [
Expand Down Expand Up @@ -2511,7 +2495,7 @@
"gridPos": {
"h": 7,
"w": 6,
"x": 18,
"x": 12,
"y": 17
},
"id": 25,
Expand Down Expand Up @@ -5343,10 +5327,98 @@
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"description": "Number of errors that occur when updating the heartbeat",
"fill": 1,
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 47
},
"id": 90,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "dm_syncer_heartbeat_update_error",
"format": "time_series",
"instant": false,
"intervalFactor": 2,
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "heartbeat update error",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"transparent": true,
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": 0,
"format": "short",
"label": "",
"logBase": 1,
"max": "1",
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"repeat": null,
"title": "binlog replication",
"title": "binlog replication",
"type": "row"
},
{
Expand All @@ -5370,7 +5442,7 @@
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"x": 6,
"y": 48
},
"id": 83,
Expand Down Expand Up @@ -5458,7 +5530,7 @@
"gridPos": {
"h": 7,
"w": 6,
"x": 6,
"x": 12,
"y": 48
},
"id": 81,
Expand Down Expand Up @@ -5545,9 +5617,9 @@
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 12,
"y": 48
"w": 12,
"x": 0,
"y": 55
},
"id": 82,
"legend": {
Expand Down Expand Up @@ -5633,9 +5705,9 @@
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 18,
"y": 48
"w": 12,
"x": 12,
"y": 55
},
"id": 84,
"legend": {
Expand Down Expand Up @@ -5722,8 +5794,8 @@
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"y": 55
"x": 18,
"y": 48
},
"id": 85,
"legend": {
Expand Down Expand Up @@ -5798,6 +5870,94 @@
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"description": "Number of times dm-masters try to start leader components per minute",
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"y": 48
},
"id": 86,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "dm_master_start_leader_counter",
"format": "time_series",
"instant": false,
"intervalFactor": 2,
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "number of dm-masters start leader components per minute",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"transparent": true,
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": 0,
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"repeat": null,
Expand Down
2 changes: 2 additions & 0 deletions dm/master/election.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ func (s *Server) isLeaderAndNeedForward() (isLeader bool, needForward bool) {
}

func (s *Server) startLeaderComponent(ctx context.Context) bool {
metrics.ReportStartLeader()

// try to upgrade the cluster version if a member become the leader.
// so if the old leader failed when upgrading, the new leader can try again.
// NOTE: if the cluster has been upgraded, calling this method again should have no side effects.
Expand Down
14 changes: 14 additions & 0 deletions dm/master/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,14 @@ var (
Name: "worker_event_error",
Help: "number of error related to worker event, during handling or watching",
}, []string{"type"})

startLeaderCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "dm",
Subsystem: "master",
Name: "start_leader_counter",
Help: "number of this dm-master try to start leader components",
})
)

func collectMetrics() {
Expand Down Expand Up @@ -122,6 +130,7 @@ func RegistryMetrics() {
registry.MustRegister(ddlPendingCounter)
registry.MustRegister(ddlErrCounter)
registry.MustRegister(workerEventErrCounter)
registry.MustRegister(startLeaderCounter)

prometheus.DefaultGatherer = registry
}
Expand Down Expand Up @@ -161,6 +170,11 @@ func ReportWorkerEventErr(errType string) {
workerEventErrCounter.WithLabelValues(errType).Inc()
}

// ReportStartLeader increments startLeaderCounter by one.
// The counter tracks how many times this dm-master has tried to start its
// leader components; Server.startLeaderComponent calls this on entry, so every
// attempt is counted whether or not the start-up subsequently succeeds.
func ReportStartLeader() {
startLeaderCounter.Inc()
}

// OnRetireLeader cleans some metrics when retires
func OnRetireLeader() {
workerState.Reset()
Expand Down
Loading