diff --git a/CHANGELOG.md b/CHANGELOG.md index d1f6fa99df..4a8284bc7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re - [#6107](https://github.com/thanos-io/thanos/pull/6082) Change default user id in container image from 0(root) to 1001 - [#6228](https://github.com/thanos-io/thanos/pull/6228) Conditionally generate debug messages in ProxyStore to avoid memory bloat. - [#6231](https://github.com/thanos-io/thanos/pull/6231) mixins: Add code/grpc-code dimension to error widgets. +- [#6244](https://github.com/thanos-io/thanos/pull/6244) mixin(Rule): Add rule evaluation failures to the Rule dashboard. ### Removed diff --git a/examples/dashboards/rule.json b/examples/dashboards/rule.json index cb1250f7cc..7f2092da8b 100644 --- a/examples/dashboards/rule.json +++ b/examples/dashboards/rule.json @@ -40,15 +40,15 @@ "renderer": "flot", "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum by (job, strategy) (rate(prometheus_rule_evaluations_total{job=~\"$job\"}[$__rate_interval]))", + "expr": "sum by (job, rule_group, strategy) (rate(prometheus_rule_evaluations_total{job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ strategy }}", + "legendFormat": "{{ rule_group }} {{ strategy }}", "legendLink": null, "step": 10 } @@ -116,15 +116,15 @@ "renderer": "flot", "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, + "span": 3, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum by (job, strategy) (increase(prometheus_rule_group_iterations_missed_total{job=~\"$job\"}[$__rate_interval]))", + "expr": "sum by (job, rule_group, strategy) (rate(prometheus_rule_evaluation_failures_total{job=~\"$job\"}[$__rate_interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ rule_group }} {{ strategy }}", "legendLink": null, "step": 10 } @@ -132,7 +132,7 @@ "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Rule Group Evaluations Missed", + "title": "Rule Group Evaluations Failed", "tooltip": { "shared": false, "sort": 0, @@ -192,7 +192,83 @@ "renderer": "flot", "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, rule_group, strategy) (increase(prometheus_rule_group_iterations_missed_total{job=~\"$job\"}[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ rule_group }} {{ strategy }}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rule Group Evaluations Missed", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, "stack": false, "steppedLine": false, "targets": [ @@ -208,7 +284,7 @@ "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Rule Group Evlauations Too Slow", + "title": "Rule Group Evaluations Too Slow", "tooltip": { 
"shared": false, "sort": 0, @@ -261,7 +337,7 @@ "datasource": "$datasource", "description": "Shows rate of dropped alerts.", "fill": 1, - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -338,7 +414,7 @@ "datasource": "$datasource", "description": "Shows rate of alerts that successfully sent to alert manager.", "fill": 10, - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -417,7 +493,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of sent alerts.", "fill": 10, - "id": 6, + "id": 7, "legend": { "avg": false, "current": false, @@ -493,7 +569,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to send alerts to alert manager.", "fill": 1, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -623,7 +699,7 @@ "datasource": "$datasource", "description": "Shows rate of queued alerts.", "fill": 1, - "id": 8, + "id": 9, "legend": { "avg": false, "current": false, @@ -702,7 +778,7 @@ "datasource": "$datasource", "description": "Shows ratio of dropped alerts compared to the total number of queued alerts.", "fill": 10, - "id": 9, + "id": 10, "legend": { "avg": false, "current": false, @@ -790,7 +866,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Unary gRPC requests.", "fill": 10, - "id": 10, + "id": 11, "legend": { "avg": false, "current": false, @@ -941,7 +1017,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests.", "fill": 10, - "id": 11, + "id": 12, "legend": { "avg": false, "current": false, @@ -1016,7 +1092,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests, in quantiles.", "fill": 1, - "id": 12, + "id": 13, "legend": { "avg": false, "current": false, @@ -1146,7 +1222,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Streamed gRPC requests.", "fill": 10, - "id": 13, + "id": 14, 
"legend": { "avg": false, "current": false, @@ -1297,7 +1373,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests.", "fill": 10, - "id": 14, + "id": 15, "legend": { "avg": false, "current": false, @@ -1372,7 +1448,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests, in quantiles", "fill": 1, - "id": 15, + "id": 16, "legend": { "avg": false, "current": false, @@ -1501,7 +1577,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 16, + "id": 17, "legend": { "avg": false, "current": false, @@ -1617,7 +1693,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 17, + "id": 18, "legend": { "avg": false, "current": false, @@ -1693,7 +1769,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 18, + "id": 19, "legend": { "avg": false, "current": false, diff --git a/mixin/dashboards/rule.libsonnet b/mixin/dashboards/rule.libsonnet index 411544d10a..92fa7635ac 100644 --- a/mixin/dashboards/rule.libsonnet +++ b/mixin/dashboards/rule.libsonnet @@ -9,6 +9,7 @@ local utils = import '../lib/utils.libsonnet'; dashboard:: { selector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"']), dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']), + ruleGroupDimensions: std.join(', ', thanos.dashboard.dimensions + ['job', 'rule_group', 'strategy']), }, }, grafanaDashboards+:: { @@ -22,19 +23,26 @@ local utils = import '../lib/utils.libsonnet'; .addPanel( g.panel('Rule Group Evaluations') + g.queryPanel( - 'sum by (%s) (rate(prometheus_rule_evaluations_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.rule.dashboard.dimensions, 'strategy']), thanos.rule.dashboard.selector], - '{{ strategy }}', + 'sum by (%(ruleGroupDimensions)s) (rate(prometheus_rule_evaluations_total{%(selector)s}[$__rate_interval]))' % thanos.rule.dashboard, + '{{ rule_group }} {{ strategy }}', + ) + ) + .addPanel( + g.panel('Rule Group Evaluations Failed') + + g.queryPanel( + 'sum by (%(ruleGroupDimensions)s) (rate(prometheus_rule_evaluation_failures_total{%(selector)s}[$__rate_interval]))' % thanos.rule.dashboard, + '{{ rule_group }} {{ strategy }}', ) ) .addPanel( g.panel('Rule Group Evaluations Missed') + g.queryPanel( - 'sum by (%s) (increase(prometheus_rule_group_iterations_missed_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.rule.dashboard.dimensions, 'strategy']), thanos.rule.dashboard.selector], - '{{ strategy }}', + 'sum by (%(ruleGroupDimensions)s) (increase(prometheus_rule_group_iterations_missed_total{%(selector)s}[$__rate_interval]))' % thanos.rule.dashboard, + '{{ rule_group }} {{ strategy }}', ) ) .addPanel( - g.panel('Rule Group Evlauations Too Slow') + + g.panel('Rule Group Evaluations Too Slow') + g.queryPanel( ||| (