Skip to content

Commit

Permalink
Merge branch 'main' into add-trivy-rules
Browse files Browse the repository at this point in the history
  • Loading branch information
fhielpos authored Nov 26, 2024
2 parents d076efa + 4e41119 commit 94e87ef
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 1 deletion.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add `cloud-provider-controller.rules` to monitor the cloud-provider-controller components across providers.
- Add alerts to monitor the `HelmReleases` for `cilium` and `coredns`.
- Add alert to monitor the `HelmRelease` for the `vertical-pod-autoscaler-crd` app.
- Add alert to monitor `Trivy` pod restarts.
- Add alert to monitor Shield pod restarts.
- Add `MimirRulerTooManyFailedQueries` alert to detect when the Mimir ruler is failing to evaluate rules.


### Fixed

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,27 @@ spec:
severity: page
team: atlas
topic: observability
# Alert on Mimir ruler pods whose rule-evaluation queries keep failing.
- alert: MimirRulerTooManyFailedQueries
  # Per-pod failure percentage: the rate of cortex_ruler_queries_failed_total
  # divided by the rate of cortex_ruler_queries_total, scaled by 100. The
  # expression only returns series whose percentage is above 1.
  expr: |
    100 * (
    sum by (installation, cluster_id, pipeline, provider, namespace, pod) (rate(cortex_ruler_queries_failed_total[1m]))
    /
    sum by (installation, cluster_id, pipeline, provider, namespace, pod) (rate(cortex_ruler_queries_total[1m]))
    ) > 1
  # The condition has to hold continuously for one hour before the alert fires.
  for: 1h
  labels:
    area: platform
    severity: page
    team: atlas
    topic: observability
    # NOTE(review): the cancel_if_* labels are presumably matched by
    # Alertmanager inhibition rules defined elsewhere - confirm.
    cancel_if_cluster_status_creating: 'true'
    cancel_if_cluster_status_deleting: 'true'
    cancel_if_cluster_status_updating: 'true'
    cancel_if_outside_working_hours: 'true'
  annotations:
    # NOTE(review): the description is wrapped in a backtick raw string, which
    # looks like chart-level escaping so that the inner $labels / $value
    # template reaches Prometheus unrendered - confirm against the chart.
    description: '{{`Mimir Ruler {{ $labels.pod }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules.`}}'
    dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler
    opsrecipe: mimir/
- name: mimir.compactor
rules:
- alert: MimirCompactorFailedCompaction
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -629,3 +629,40 @@ tests:
dashboard: 8280707b8f16e7b87b840fc1cc92d4c5/mimir-writes
description: "Mimir object storage write rate is down."
opsrecipe: "mimir/"

# Test for MimirRulerTooManyFailedQueries alert
#
# Input timeline (one sample per minute):
#   - cortex_ruler_queries_total stays at 0 for the first 90 minutes, then
#     grows by 1 per minute, then by 100 per minute.
#   - cortex_ruler_queries_failed_total stays at 0 for the first 180 minutes,
#     then grows by 2 per minute, i.e. 2% of the total growth in the last phase.
#
# Both series carry exactly the labels the rule aggregates by; the alert is
# expected to expose only those labels plus the static rule labels.
- interval: 1m
  input_series:
    - series: 'cortex_ruler_queries_total{cluster_id="myinstall", installation="myinstall", namespace="mimir", pipeline="stable", pod="mimir-ruler-aaaaaaaaaa-bbbbb", provider="capa"}'
      values: "0x90 0+1x90 90+100x90"
    - series: 'cortex_ruler_queries_failed_total{cluster_id="myinstall", installation="myinstall", namespace="mimir", pipeline="stable", pod="mimir-ruler-aaaaaaaaaa-bbbbb", provider="capa"}'
      values: "0x180 0+2x90"
  alert_rule_test:
    # No queries have been issued yet: no alert expected.
    - alertname: MimirRulerTooManyFailedQueries
      eval_time: 90m
    # Queries are running but none has failed: no alert expected.
    - alertname: MimirRulerTooManyFailedQueries
      eval_time: 180m
    # Failures have started, but the rule's one-hour `for` duration has not
    # elapsed yet: no firing alert expected.
    - alertname: MimirRulerTooManyFailedQueries
      eval_time: 240m
    # The failure ratio has been above the threshold long enough: alert fires.
    - alertname: MimirRulerTooManyFailedQueries
      eval_time: 242m
      exp_alerts:
        - exp_labels:
            area: platform
            cancel_if_outside_working_hours: "true"
            cancel_if_cluster_status_creating: "true"
            cancel_if_cluster_status_deleting: "true"
            cancel_if_cluster_status_updating: "true"
            cluster_id: myinstall
            installation: myinstall
            namespace: mimir
            pipeline: stable
            pod: mimir-ruler-aaaaaaaaaa-bbbbb
            provider: capa
            severity: page
            team: atlas
            topic: observability
          exp_annotations:
            dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler
            description: "Mimir Ruler mimir-ruler-aaaaaaaaaa-bbbbb is experiencing 2.00% errors while evaluating rules."
            opsrecipe: "mimir/"

0 comments on commit 94e87ef

Please sign in to comment.