Skip to content
This repository has been archived by the owner on Nov 24, 2023. It is now read-only.

Commit

Permalink
cherry pick #1438 to release-2.0 (#1443)
Browse files Browse the repository at this point in the history
  • Loading branch information
ti-srebot authored Feb 20, 2021
1 parent 1235c74 commit b9f9f5c
Show file tree
Hide file tree
Showing 8 changed files with 24 additions and 6,261 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ relay_log/*
vendor
*/*.DS_Store
tidb-slow.log
/monitoring/dashboards/dm.json
/monitoring/rules/dm_worker.rules.yml
15 changes: 13 additions & 2 deletions dm/dm-ansible/conf/dm_worker.rules.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
groups:
- name: alert.rules
rules:
- alert: DM_master_all_down
expr: up{job="dm_master"} == 0
labels:
env: ENV_LABELS_ENV
level: critical
expr: up{job="dm_master"} == 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: DM master all down, metrics not reliable

- alert: DM_remain_storage_of_relay_log
expr: dm_relay_space{type="available"} < 10*1024*1024*1024
labels:
Expand Down Expand Up @@ -126,12 +137,12 @@ groups:
summary: dm syncer binlog file not catch up master server exceed 10 min

- alert: DM_binlog_file_gap_between_relay_syncer
expr: dm_relay_binlog_file{node="relay"} - ON(instance, job) dm_syncer_binlog_file{node="syncer"} > 1
expr: dm_relay_binlog_file{node="relay"} - ON(instance, job) group_right dm_syncer_binlog_file{node="syncer"} > 1
for: 10m
labels:
env: ENV_LABELS_ENV
level: critical
expr: dm_relay_binlog_file{node="relay"} - ON(instance, job) dm_syncer_binlog_file{node="syncer"} > 1
expr: dm_relay_binlog_file{node="relay"} - ON(instance, job) group_right dm_syncer_binlog_file{node="syncer"} > 1
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, task: {{ $labels.task }}, values: {{ $value }}'
value: '{{ $value }}'
Expand Down
13 changes: 1 addition & 12 deletions dm/master/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,10 @@ package metrics

import (
"context"
"net/http"
"time"

cpu "github.com/pingcap/tidb-tools/pkg/utils"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"

"github.com/pingcap/dm/pkg/metricsproxy"
)
Expand Down Expand Up @@ -121,23 +119,14 @@ func RunBackgroundJob(ctx context.Context) {

// RegistryMetrics registers metrics for worker
func RegistryMetrics() {
registry := prometheus.NewRegistry()
registry.MustRegister(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{}))
registry.MustRegister(prometheus.NewGoCollector())
registry := prometheus.DefaultRegisterer

registry.MustRegister(workerState)
registry.MustRegister(cpuUsageGauge)
registry.MustRegister(ddlPendingCounter)
registry.MustRegister(ddlErrCounter)
registry.MustRegister(workerEventErrCounter)
registry.MustRegister(startLeaderCounter)

prometheus.DefaultGatherer = registry
}

// GetMetricsHandler returns prometheus HTTP Handler
func GetMetricsHandler() http.Handler {
return promhttp.Handler()
}

// ReportWorkerStage is a setter for workerState
Expand Down
9 changes: 4 additions & 5 deletions dm/master/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,18 +179,17 @@ func (s *Server) Start(ctx context.Context) (err error) {

registerOnce.Do(metrics.RegistryMetrics)

// HTTP handlers on etcd's client IP:port
// HTTP handlers on etcd's client IP:port. etcd will add a builtin `/metrics` route
// NOTE: after receiving any HTTP request from the Chrome browser,
// the server may sometimes be blocked when closing.
// And any request to etcd's builtin handler has the same problem.
// And curl or safari browser does trigger this problem.
// But I haven't figured it out.
// (maybe more requests are sent from chrome or its extensions).
userHandles := map[string]http.Handler{
"/apis/": apiHandler,
"/status": getStatusHandle(),
"/debug/": getDebugHandler(),
"/metrics": metrics.GetMetricsHandler(),
"/apis/": apiHandler,
"/status": getStatusHandle(),
"/debug/": getDebugHandler(),
}

// gRPC API server
Expand Down
Loading

0 comments on commit b9f9f5c

Please sign in to comment.