From 5d9f2a819911311cb7b5fc6794e4278f364caf99 Mon Sep 17 00:00:00 2001 From: Jose Armesto Date: Tue, 19 Nov 2024 23:10:28 +0100 Subject: [PATCH] Add alerts for azure cloud components HelmReleases --- CHANGELOG.md | 1 + .../aws-cloud-components.rules.yml | 6 ++-- .../azure-cloud-components.rules.yml | 32 +++++++++++++++++++ 3 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/azure-cloud-components.rules.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index f6ac2d6a..aff36d22 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Add `aws-cloud-components.rules` to monitor the AWS cloud-controller and the ebs-csi-driver. +- Add `azure-cloud-components.rules` to monitor the Azure cloud-controller and the azure csi drivers. ## [4.26.1] - 2024-11-19 diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-cloud-components.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-cloud-components.rules.yml index a390737e..c77ea62d 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-cloud-components.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-cloud-components.rules.yml @@ -10,12 +10,12 @@ metadata: namespace: {{ .Values.namespace }} spec: groups: - - name: aws-cloud-controller-manager + - name: aws-cloud-components rules: - alert: FluxHelmReleaseFailed annotations: description: |- - {{`Flux HelmRelease {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}/{{ $labels.cluster_id }} is stuck in Failed state.`}} + {{`Flux HelmRelease {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}/{{ $labels.cluster_id }} is stuck in Failed state.`}} opsrecipe: fluxcd-failing-helmrelease/ expr: gotk_reconcile_condition{type="Ready", status="False", 
kind="HelmRelease", cluster_type="management_cluster", exported_namespace!="flux-giantswarm", name=~".*(aws-ebs-csi-driver|cloud-provider-aws)"} > 0 for: 20m @@ -28,5 +28,5 @@ spec: team: phoenix topic: managementcluster namespace: |- - {{`{{ $labels.exported_namespace }}`}} + {{`{{ $labels.exported_namespace }}`}} {{- end }} diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/azure-cloud-components.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/azure-cloud-components.rules.yml new file mode 100644 index 00000000..e17f0029 --- /dev/null +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/azure-cloud-components.rules.yml @@ -0,0 +1,32 @@ +{{- if eq .Values.managementCluster.provider.kind "capz" }} +# This rule applies to capz management clusters only +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + creationTimestamp: null + labels: + {{- include "labels.common" . | nindent 4 }} + name: azure-cloud-components.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: azure-cloud-components + rules: + - alert: FluxHelmReleaseFailed + annotations: + description: |- + {{`Flux HelmRelease {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}/{{ $labels.cluster_id }} is stuck in Failed state.`}} + opsrecipe: fluxcd-failing-helmrelease/ + expr: gotk_reconcile_condition{type="Ready", status="False", kind="HelmRelease", cluster_type="management_cluster", exported_namespace!="flux-giantswarm", name=~".*(azure-cloud-controller-manager|azure-cloud-node-manager|azuredisk-csi-driver|azurefile-csi-driver)"} > 0 + for: 20m + labels: + area: kaas + cancel_if_outside_working_hours: "true" + cancel_if_kube_state_metrics_down: "true" + cancel_if_monitoring_agent_down: "true" + severity: page + team: phoenix + topic: managementcluster + namespace: |- + {{`{{ $labels.exported_namespace }}`}} +{{- end }}