diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a1604ea..a53f5567 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Add alert to monitor the HelmRelease for vertical-pod-autoscaler-crd app. +- Add alerts to monitor the `HelmRelease` resources for `cilium`, `network-policies` and `coredns`. +- Add alert to monitor the `HelmRelease` for the `vertical-pod-autoscaler-crd` app. ## [4.26.1] - 2024-11-19 diff --git a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/cilium.rules.yml b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/cilium.rules.yml index 261aed91..fb29bf7c 100644 --- a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/cilium.rules.yml +++ b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/cilium.rules.yml @@ -59,4 +59,22 @@ spec: severity: page team: cabbage topic: cilium - + {{- if eq .Values.managementCluster.provider.flavor "capi" }} + - alert: FluxHelmReleaseFailed + annotations: + description: |- + {{`Flux HelmRelease {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}-{{ $labels.cluster_id }} is stuck in Failed state.`}} + opsrecipe: fluxcd-failing-helmrelease/ + expr: gotk_reconcile_condition{type="Ready", status="False", kind="HelmRelease", cluster_type="management_cluster", exported_namespace!="flux-giantswarm", name=~".*(cilium|network-policies)"} > 0 + for: 20m + labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_kube_state_metrics_down: "true" + cancel_if_monitoring_agent_down: "true" + severity: page + team: cabbage + topic: cilium + namespace: |- + {{`{{ $labels.exported_namespace }}`}} + {{- end }} diff --git a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/coredns.rules.yml b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/coredns.rules.yml index 419d67c7..09ceb4da 100644 --- 
a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/coredns.rules.yml +++ b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/coredns.rules.yml @@ -25,6 +25,25 @@ spec: severity: page team: cabbage topic: dns + {{- if eq .Values.managementCluster.provider.flavor "capi" }} + - alert: FluxHelmReleaseFailed + annotations: + description: |- + {{`Flux HelmRelease {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}-{{ $labels.cluster_id }} is stuck in Failed state.`}} + opsrecipe: fluxcd-failing-helmrelease/ + expr: gotk_reconcile_condition{type="Ready", status="False", kind="HelmRelease", cluster_type="management_cluster", exported_namespace!="flux-giantswarm", name=~".*coredns"} > 0 + for: 20m + labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_kube_state_metrics_down: "true" + cancel_if_monitoring_agent_down: "true" + severity: page + team: cabbage + topic: dns + namespace: |- + {{`{{ $labels.exported_namespace }}`}} + {{- end }} - alert: CoreDNSMaxHPAReplicasReached expr: | (