From 1748fe3eda47fb2809a9a5836186ffe7890c2316 Mon Sep 17 00:00:00 2001 From: Gyu-Ho Lee Date: Wed, 14 Jun 2017 04:00:44 -0700 Subject: [PATCH] Documentation/op-guide: fix failed RPC rate, leader election metrics This fixes the failed RPC rate query, where we do not need subtraction because we already query by the status code. Also adds grpc_method to make it more specific. Most of the time, the failure recovers within 10 seconds, which is our Prometheus scrape interval, so the 'rate' query might not cover that time window, showing as 0s, but it still shows up in the graph. Signed-off-by: Gyu-Ho Lee --- Documentation/op-guide/grafana.json | 39 ++++++++++++++++------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/Documentation/op-guide/grafana.json b/Documentation/op-guide/grafana.json index 30baa0d8b87..c5493253654 100644 --- a/Documentation/op-guide/grafana.json +++ b/Documentation/op-guide/grafana.json @@ -114,18 +114,21 @@ "span": 5, "stack": false, "steppedLine": false, - "targets": [{ - "expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"} [1m]))", + "targets": [ + { + "expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"}[5m]))", + "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} RPC Rate", + "legendFormat": "RPC Rate", "metric": "grpc_server_started_total", "refId": "A", "step": 2 }, { - "expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"} [1m])) - sum(rate(grpc_server_handled_total{grpc_type=\"unary\",grpc_code!=\"OK\"} [1m]))", + "expr": "sum(rate(grpc_server_handled_total{grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))", + "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}} RPC Failed Rate", + "legendFormat": "RPC Failed Rate", "metric": "grpc_server_handled_total", "refId": "B", "step": 2 @@ -361,7 +364,7 @@ "stack": false, "steppedLine": true, "targets": [{ - "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket [5m])) by 
(instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le))", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}} WAL fsync", @@ -370,7 +373,7 @@ "step": 4 }, { - "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket [5m])) by (instance, le))", + "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le))", "intervalFactor": 2, "legendFormat": "{{instance}} DB fsync", "metric": "etcd_disk_backend_commit_duration_seconds_bucket", @@ -522,7 +525,7 @@ "stack": true, "steppedLine": false, "targets": [{ - "expr": "rate(etcd_network_client_grpc_received_bytes_total [1m])", + "expr": "rate(etcd_network_client_grpc_received_bytes_total[5m])", "intervalFactor": 2, "legendFormat": "{{instance}} Client Traffic In", "metric": "etcd_network_client_grpc_received_bytes_total", @@ -595,7 +598,7 @@ "stack": true, "steppedLine": false, "targets": [{ - "expr": "rate(etcd_network_client_grpc_sent_bytes_total [1m])", + "expr": "rate(etcd_network_client_grpc_sent_bytes_total[5m])", "intervalFactor": 2, "legendFormat": "{{instance}} Client Traffic Out", "metric": "etcd_network_client_grpc_sent_bytes_total", @@ -668,7 +671,7 @@ "stack": false, "steppedLine": false, "targets": [{ - "expr": "sum(rate(etcd_network_peer_received_bytes_total [1m])) by (instance)", + "expr": "sum(rate(etcd_network_peer_received_bytes_total[5m])) by (instance)", "intervalFactor": 2, "legendFormat": "{{instance}} Peer Traffic In", "metric": "etcd_network_peer_received_bytes_total", @@ -742,7 +745,7 @@ "stack": false, "steppedLine": false, "targets": [{ - "expr": "sum(rate(etcd_network_peer_sent_bytes_total [1m])) by (instance)", + "expr": "sum(rate(etcd_network_peer_sent_bytes_total[5m])) by (instance)", "hide": false, "interval": "", "intervalFactor": 2, @@ -822,7 +825,7 @@ "stack": false, "steppedLine": false, "targets": [{ - "expr": 
"sum(rate(etcd_server_proposals_failed_total [1m]))", + "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", "intervalFactor": 2, "legendFormat": "Proposal Failure Rate", "metric": "etcd_server_proposals_failed_total", @@ -838,7 +841,7 @@ "step": 2 }, { - "expr": "sum(rate(etcd_server_proposals_committed_total [1m]))", + "expr": "sum(rate(etcd_server_proposals_committed_total[5m]))", "intervalFactor": 2, "legendFormat": "Proposal Commit Rate", "metric": "etcd_server_proposals_committed_total", @@ -846,7 +849,7 @@ "step": 2 }, { - "expr": "sum(rate(etcd_server_proposals_applied_total [1m]))", + "expr": "sum(rate(etcd_server_proposals_applied_total[5m]))", "intervalFactor": 2, "legendFormat": "Proposal Apply Rate", "refId": "D", @@ -922,9 +925,9 @@ "stack": false, "steppedLine": false, "targets": [{ - "expr": "etcd_server_leader_changes_seen_total", + "expr": "changes(etcd_server_leader_changes_seen_total[1d])", "intervalFactor": 2, - "legendFormat": "{{instance}} Leader Change Seen", + "legendFormat": "{{instance}} Total Leader Elections Per Day", "metric": "etcd_server_leader_changes_seen_total", "refId": "A", "step": 2 @@ -932,7 +935,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Rate Leader Elections", + "title": "Total Leader Elections Per Day", "tooltip": { "msResolution": false, "shared": true, @@ -1009,4 +1012,4 @@ "version": 215, "links": [], "gnetId": null -} \ No newline at end of file +}