From 4e41119815f76791c3a614e6232bd9a23c2808f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o=20Brigitte?=
Date: Tue, 26 Nov 2024 14:20:37 +0100
Subject: [PATCH] Add `MimirRulerTooManyFailedQueries` (#1439)

Add a paging alert for team atlas that fires when, for a given Mimir
ruler pod, the ratio of rate(cortex_ruler_queries_failed_total[1m]) to
rate(cortex_ruler_queries_total[1m]) stays above 1% for 1h.

The unit test feeds three phases at a 1m interval: no queries at all,
then queries without failures, then (shortly after 180m) about 2 failed
queries per 100. It expects no alert at 90m, 180m and 240m, and a firing
alert reporting 2.00% errors at 242m, once the `for: 1h` delay has
elapsed.
---
 CHANGELOG.md                                  |  1 +
 .../atlas/alerting-rules/mimir.rules.yml      | 21 +++++++++++
 .../atlas/alerting-rules/mimir.rules.test.yml | 37 +++++++++++++++++++
 3 files changed, 59 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 004e0a82..f07fc10c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add `cloud-provider-controller.rules` to monitor the cloud-provider-controller components across providers.
 - Add alerts to monitor the `HelmReleases` for `cilium` and `coredns`.
 - Add alert to monitor the `HelmRelease` for the `vertical-pod-autoscaler-crd` app.
+- Add `MimirRulerTooManyFailedQueries` alert to detect when Mimir ruler is failing to evaluate rules.
 
 ### Fixed
 
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
index aa1344cc..55c3d1f6 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
@@ -151,6 +151,27 @@ spec:
         severity: page
         team: atlas
         topic: observability
+    - alert: MimirRulerTooManyFailedQueries
+      annotations:
+        dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler
+        description: '{{`Mimir Ruler {{ $labels.pod }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules.`}}'
+        opsrecipe: mimir/
+      expr: |
+        100 * (
+          sum by (installation, cluster_id, pipeline, provider, namespace, pod) (rate(cortex_ruler_queries_failed_total[1m]))
+            /
+          sum by (installation, cluster_id, pipeline, provider, namespace, pod) (rate(cortex_ruler_queries_total[1m]))
+        ) > 1
+      for: 1h
+      labels:
+        area: platform
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: atlas
+        topic: observability
   - name: mimir.compactor
     rules:
     - alert: MimirCompactorFailedCompaction
diff --git a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml
index ac475454..541cee21 100644
--- a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml
+++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml
@@ -629,3 +629,40 @@ tests:
               dashboard: 8280707b8f16e7b87b840fc1cc92d4c5/mimir-writes
               description: "Mimir object storage write rate is down."
               opsrecipe: "mimir/"
+
+  # Test for MimirRulerTooManyFailedQueries alert
+  - interval: 1m
+    input_series:
+      - series: 'cortex_ruler_queries_total{cluster_id="myinstall", installation="myinstall", namespace="mimir", pipeline="stable", pod="mimir-ruler-aaaaaaaaaa-bbbbb", provider="capa"}'
+        values: "0x90 0+1x90 90+100x90"
+      - series: 'cortex_ruler_queries_failed_total{cluster_id="myinstall", installation="myinstall", namespace="mimir", pipeline="stable", pod="mimir-ruler-aaaaaaaaaa-bbbbb", provider="capa"}'
+        values: "0x180 0+2x90"
+    alert_rule_test:
+      - alertname: MimirRulerTooManyFailedQueries
+        eval_time: 90m
+      - alertname: MimirRulerTooManyFailedQueries
+        eval_time: 180m
+      - alertname: MimirRulerTooManyFailedQueries
+        eval_time: 240m
+      - alertname: MimirRulerTooManyFailedQueries
+        eval_time: 242m
+        exp_alerts:
+          - exp_labels:
+              area: platform
+              cancel_if_outside_working_hours: "true"
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cluster_id: myinstall
+              installation: myinstall
+              namespace: mimir
+              pipeline: stable
+              pod: mimir-ruler-aaaaaaaaaa-bbbbb
+              provider: capa
+              severity: page
+              team: atlas
+              topic: observability
+            exp_annotations:
+              dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler
+              description: "Mimir Ruler mimir-ruler-aaaaaaaaaa-bbbbb is experiencing 2.00% errors while evaluating rules."
+              opsrecipe: "mimir/"