diff --git a/CHANGELOG.md b/CHANGELOG.md
index 004e0a82..f07fc10c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add `cloud-provider-controller.rules` to monitor the cloud-provider-controller components across providers.
 - Add alerts to monitor the `HelmReleases` for `cilium` and `coredns`.
 - Add alert to monitor the `HelmRelease` for the `vertical-pod-autoscaler-crd` app.
+- Add `MimirRulerTooManyFailedQueries` alert to detect when Mimir ruler is failing to evaluate rules.
 
 ### Fixed
 
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
index aa1344cc..a0e8b7f1 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
@@ -295,4 +295,27 @@ spec:
         severity: page
         team: atlas
         topic: observability
+    # Fires when, per (cluster_id, namespace, pod), failed ruler queries exceed 1% of
+    # all ruler queries (1m rates, expressed as a percentage) continuously for 1h.
+    - alert: MimirRulerTooManyFailedQueries
+      annotations:
+        dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler
+        description: '{{`Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster_id }} cluster is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules.`}}'
+        opsrecipe: mimir/
+      expr: |
+        100 * (
+          sum by (cluster_id, namespace, pod) (rate(cortex_ruler_queries_failed_total[1m]))
+          /
+          sum by (cluster_id, namespace, pod) (rate(cortex_ruler_queries_total[1m]))
+        ) > 1
+      for: 1h
+      labels:
+        area: platform
+        cancel_if_cluster_status_creating: "true"
+        cancel_if_cluster_status_deleting: "true"
+        cancel_if_cluster_status_updating: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: atlas
+        topic: observability
 {{- end }}