From 14b403c59dcd775f1bad9a0f531cb6c50d8c7552 Mon Sep 17 00:00:00 2001
From: Jose Armesto
Date: Tue, 19 Nov 2024 22:25:31 +0100
Subject: [PATCH] Add alerts to monitor the HelmReleases for cilium and coredns

---
Adds a FluxHelmReleaseFailed alert to both the cilium and the coredns rule
files. Each alert fires when gotk_reconcile_condition reports Ready=False
for a matching HelmRelease (name regex differs per file) on the management
cluster, outside the flux-giantswarm namespace, for 20m.

NOTE(review): the leading indentation inside the YAML hunks, and the exact
whitespace of the single removed line in cilium.rules.yml, cannot be
recovered from the flattened text this patch was restored from. They are
laid out below using the usual PrometheusRule nesting (rules at 4 spaces,
rule keys at 6, label/annotation keys at 8, block-scalar content at 10).
Confirm against the target files before running `git am` / `git apply`.

 CHANGELOG.md                                 |  4 ++++
 .../cabbage/alerting-rules/cilium.rules.yml  | 18 +++++++++++++++++-
 .../cabbage/alerting-rules/coredns.rules.yml | 17 +++++++++++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e8e79e800..4186d567f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Add alerts to monitor the `HelmReleases` for `cilium` and `coredns`.
+
 ## [4.26.1] - 2024-11-19
 
 ### Changed
diff --git a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/cilium.rules.yml b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/cilium.rules.yml
index 261aed918..83cd05c9a 100644
--- a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/cilium.rules.yml
+++ b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/cilium.rules.yml
@@ -59,4 +59,20 @@ spec:
         severity: page
         team: cabbage
         topic: cilium
-
+    - alert: FluxHelmReleaseFailed
+      annotations:
+        description: |-
+          {{`Flux HelmRelease {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}/{{ $labels.cluster_id }} is stuck in Failed state.`}}
+        opsrecipe: fluxcd-failing-helmrelease/
+      expr: gotk_reconcile_condition{type="Ready", status="False", kind="HelmRelease", cluster_type="management_cluster", exported_namespace!="flux-giantswarm", name=~".*(cilium|network-policies)"} > 0
+      for: 20m
+      labels:
+        area: platform
+        cancel_if_outside_working_hours: "true"
+        cancel_if_kube_state_metrics_down: "true"
+        cancel_if_monitoring_agent_down: "true"
+        severity: page
+        team: cabbage
+        topic: cilium
+        namespace: |-
+          {{`{{ $labels.exported_namespace }}`}}
diff --git a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/coredns.rules.yml b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/coredns.rules.yml
index 419d67c78..0a9424235 100644
--- a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/coredns.rules.yml
+++ b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/coredns.rules.yml
@@ -25,6 +25,23 @@ spec:
         severity: page
         team: cabbage
         topic: dns
+    - alert: FluxHelmReleaseFailed
+      annotations:
+        description: |-
+          {{`Flux HelmRelease {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}/{{ $labels.cluster_id }} is stuck in Failed state.`}}
+        opsrecipe: fluxcd-failing-helmrelease/
+      expr: gotk_reconcile_condition{type="Ready", status="False", kind="HelmRelease", cluster_type="management_cluster", exported_namespace!="flux-giantswarm", name=~".*coredns"} > 0
+      for: 20m
+      labels:
+        area: platform
+        cancel_if_outside_working_hours: "true"
+        cancel_if_kube_state_metrics_down: "true"
+        cancel_if_monitoring_agent_down: "true"
+        severity: page
+        team: cabbage
+        topic: dns
+        namespace: |-
+          {{`{{ $labels.exported_namespace }}`}}
     - alert: CoreDNSMaxHPAReplicasReached
       expr: |
         (