Skip to content

Commit

Permalink
mixin(Rule): Add rule evaluation failures to the Rule dashboard (thanos-io#6244)
Browse files Browse the repository at this point in the history

* Improve Thanos Rule dashboard legends

Signed-off-by: Douglas Camata <[email protected]>

* Add evaluations failed to Rule dashboard

Signed-off-by: Douglas Camata <[email protected]>

* Refactor rule dashboard

Signed-off-by: Douglas Camata <[email protected]>

* Add changelog entry

Signed-off-by: Douglas Camata <[email protected]>

* Rerun CI

Signed-off-by: Douglas Camata <[email protected]>

---------

Signed-off-by: Douglas Camata <[email protected]>
  • Loading branch information
douglascamata authored and HC Zhu committed Jun 27, 2023
1 parent 271ad0e commit 18bc3fa
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 29 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
- [#6107](https://github.com/thanos-io/thanos/pull/6082) Change default user id in container image from 0(root) to 1001
- [#6228](https://github.com/thanos-io/thanos/pull/6228) Conditionally generate debug messages in ProxyStore to avoid memory bloat.
- [#6231](https://github.com/thanos-io/thanos/pull/6231) mixins: Add code/grpc-code dimension to error widgets.
- [#6244](https://github.com/thanos-io/thanos/pull/6244) mixin(Rule): Add rule evaluation failures to the Rule dashboard.

### Removed

Expand Down
124 changes: 100 additions & 24 deletions examples/dashboards/rule.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,15 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (job, strategy) (rate(prometheus_rule_evaluations_total{job=~\"$job\"}[$__rate_interval]))",
"expr": "sum by (job, rule_group, strategy) (rate(prometheus_rule_evaluations_total{job=~\"$job\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{ strategy }}",
"legendFormat": "{{ rule_group }} {{ strategy }}",
"legendLink": null,
"step": 10
}
Expand Down Expand Up @@ -116,23 +116,23 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (job, strategy) (increase(prometheus_rule_group_iterations_missed_total{job=~\"$job\"}[$__rate_interval]))",
"expr": "sum by (job, rule_group, strategy) (rate(prometheus_rule_evaluation_failures_total{job=~\"$job\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{ strategy }}",
"legendFormat": "{{ rule_group }} {{ strategy }}",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Rule Group Evaluations Missed",
"title": "Rule Group Evaluations Failed",
"tooltip": {
"shared": false,
"sort": 0,
Expand Down Expand Up @@ -192,7 +192,83 @@
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 4,
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (job, rule_group, strategy) (increase(prometheus_rule_group_iterations_missed_total{job=~\"$job\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{ rule_group }} {{ strategy }}",
"legendLink": null,
"step": 10
}
],
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Rule Group Evaluations Missed",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [ ]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": { },
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [ ],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [ ],
"spaceLength": 10,
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
Expand All @@ -208,7 +284,7 @@
"thresholds": [ ],
"timeFrom": null,
"timeShift": null,
"title": "Rule Group Evlauations Too Slow",
"title": "Rule Group Evaluations Too Slow",
"tooltip": {
"shared": false,
"sort": 0,
Expand Down Expand Up @@ -261,7 +337,7 @@
"datasource": "$datasource",
"description": "Shows rate of dropped alerts.",
"fill": 1,
"id": 4,
"id": 5,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -338,7 +414,7 @@
"datasource": "$datasource",
"description": "Shows rate of alerts that successfully sent to alert manager.",
"fill": 10,
"id": 5,
"id": 6,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -417,7 +493,7 @@
"datasource": "$datasource",
"description": "Shows ratio of errors compared to the total number of sent alerts.",
"fill": 10,
"id": 6,
"id": 7,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -493,7 +569,7 @@
"datasource": "$datasource",
"description": "Shows how long has it taken to send alerts to alert manager.",
"fill": 1,
"id": 7,
"id": 8,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -623,7 +699,7 @@
"datasource": "$datasource",
"description": "Shows rate of queued alerts.",
"fill": 1,
"id": 8,
"id": 9,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -702,7 +778,7 @@
"datasource": "$datasource",
"description": "Shows ratio of dropped alerts compared to the total number of queued alerts.",
"fill": 10,
"id": 9,
"id": 10,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -790,7 +866,7 @@
"datasource": "$datasource",
"description": "Shows rate of handled Unary gRPC requests.",
"fill": 10,
"id": 10,
"id": 11,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -941,7 +1017,7 @@
"datasource": "$datasource",
"description": "Shows ratio of errors compared to the total number of handled requests.",
"fill": 10,
"id": 11,
"id": 12,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1016,7 +1092,7 @@
"datasource": "$datasource",
"description": "Shows how long has it taken to handle requests, in quantiles.",
"fill": 1,
"id": 12,
"id": 13,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1146,7 +1222,7 @@
"datasource": "$datasource",
"description": "Shows rate of handled Streamed gRPC requests.",
"fill": 10,
"id": 13,
"id": 14,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1297,7 +1373,7 @@
"datasource": "$datasource",
"description": "Shows ratio of errors compared to the total number of handled requests.",
"fill": 10,
"id": 14,
"id": 15,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1372,7 +1448,7 @@
"datasource": "$datasource",
"description": "Shows how long has it taken to handle requests, in quantiles",
"fill": 1,
"id": 15,
"id": 16,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1501,7 +1577,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 16,
"id": 17,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1617,7 +1693,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 17,
"id": 18,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1693,7 +1769,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 18,
"id": 19,
"legend": {
"avg": false,
"current": false,
Expand Down
18 changes: 13 additions & 5 deletions mixin/dashboards/rule.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ local utils = import '../lib/utils.libsonnet';
dashboard:: {
selector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"']),
dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']),
ruleGroupDimensions: std.join(', ', thanos.dashboard.dimensions + ['job', 'rule_group', 'strategy']),
},
},
grafanaDashboards+:: {
Expand All @@ -22,19 +23,26 @@ local utils = import '../lib/utils.libsonnet';
.addPanel(
g.panel('Rule Group Evaluations') +
g.queryPanel(
'sum by (%s) (rate(prometheus_rule_evaluations_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.rule.dashboard.dimensions, 'strategy']), thanos.rule.dashboard.selector],
'{{ strategy }}',
'sum by (%(ruleGroupDimensions)s) (rate(prometheus_rule_evaluations_total{%(selector)s}[$__rate_interval]))' % thanos.rule.dashboard,
'{{ rule_group }} {{ strategy }}',
)
)
.addPanel(
g.panel('Rule Group Evaluations Failed') +
g.queryPanel(
'sum by (%(ruleGroupDimensions)s) (rate(prometheus_rule_evaluation_failures_total{%(selector)s}[$__rate_interval]))' % thanos.rule.dashboard,
'{{ rule_group }} {{ strategy }}',
)
)
.addPanel(
g.panel('Rule Group Evaluations Missed') +
g.queryPanel(
'sum by (%s) (increase(prometheus_rule_group_iterations_missed_total{%s}[$__rate_interval]))' % [utils.joinLabels([thanos.rule.dashboard.dimensions, 'strategy']), thanos.rule.dashboard.selector],
'{{ strategy }}',
'sum by (%(ruleGroupDimensions)s) (increase(prometheus_rule_group_iterations_missed_total{%(selector)s}[$__rate_interval]))' % thanos.rule.dashboard,
'{{ rule_group }} {{ strategy }}',
)
)
.addPanel(
g.panel('Rule Group Evlauations Too Slow') +
g.panel('Rule Group Evaluations Too Slow') +
g.queryPanel(
|||
(
Expand Down

0 comments on commit 18bc3fa

Please sign in to comment.