From 5ea26c7421413b543c485c043d8a999f090830ec Mon Sep 17 00:00:00 2001
From: Jose Armesto
Date: Wed, 20 Nov 2024 14:51:37 +0100
Subject: [PATCH] Use generic alert for all providers

---
 CHANGELOG.md                                  |  3 +-
 .../azure-cloud-components.rules.yml          | 32 -------------------
 ...ml => cloud-provider-controller.rules.yml} | 23 +++++++++----
 3 files changed, 18 insertions(+), 40 deletions(-)
 delete mode 100644 helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/azure-cloud-components.rules.yml
 rename helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/{aws-cloud-components.rules.yml => cloud-provider-controller.rules.yml} (54%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1a96c195..ec246a20 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,8 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
-- Add `aws-cloud-components.rules` to monitor the AWS cloud-controller and the ebs-csi-driver.
-- Add `azure-cloud-components.rules` to monitor the Azure cloud-controller and the azure csi drivers.
+- Add `cloud-provider-controller.rules` to monitor the cloud-provider-controller components across providers.
 - Add alert to monitor the HelmRelease for vertical-pod-autoscaler-crd app.
 
 ## [4.26.1] - 2024-11-19
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/azure-cloud-components.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/azure-cloud-components.rules.yml
deleted file mode 100644
index e17f0029..00000000
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/azure-cloud-components.rules.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-{{- if eq .Values.managementCluster.provider.kind "capz" }}
-# This rule applies to capa management clusters only
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  creationTimestamp: null
-  labels:
-    {{- include "labels.common" . | nindent 4 }}
-  name: azure-cloud-components.rules
-  namespace: {{ .Values.namespace }}
-spec:
-  groups:
-  - name: azure-cloud-components
-    rules:
-    - alert: FluxHelmReleaseFailed
-      annotations:
-        description: |-
-          {{`Flux HelmRelease {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}/{{ $labels.cluster_id }} is stuck in Failed state.`}}
-        opsrecipe: fluxcd-failing-helmrelease/
-      expr: gotk_reconcile_condition{type="Ready", status="False", kind="HelmRelease", cluster_type="management_cluster", exported_namespace!="flux-giantswarm", name=~".*(azure-cloud-controller-manager|azure-cloud-node-manager|azuredisk-csi-driver|azurefile-csi-driver)"} > 0
-      for: 20m
-      labels:
-        area: kaas
-        cancel_if_outside_working_hours: "true"
-        cancel_if_kube_state_metrics_down: "true"
-        cancel_if_monitoring_agent_down: "true"
-        severity: page
-        team: phoenix
-        topic: managementcluster
-        namespace: |-
-          {{`{{ $labels.exported_namespace }}`}}
-{{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-cloud-components.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cloud-provider-controller.rules.yml
similarity index 54%
rename from helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-cloud-components.rules.yml
rename to helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cloud-provider-controller.rules.yml
index c77ea62d..5a218b53 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-cloud-components.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cloud-provider-controller.rules.yml
@@ -1,23 +1,34 @@
-{{- if eq .Values.managementCluster.provider.kind "capa" }}
-# This rule applies to capa management clusters only
+{{- if eq .Values.managementCluster.provider.flavor "capi" }}
+# This rule applies to CAPI management clusters only
+{{- define "cloudProviderControllerComponents" -}}
+- aws-ebs-csi-driver
+- cloud-provider-aws
+- azure-cloud-controller-manager
+- azure-cloud-node-manager
+- azuredisk-csi-driver
+- azurefile-csi-driver
+- cloud-provider-vsphere
+- cloud-provider-cloud-director
+{{- end }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
   creationTimestamp: null
   labels:
     {{- include "labels.common" . | nindent 4 }}
-  name: aws-cloud-components.rules
+  name: cloud-provider-controller.rules
   namespace: {{ .Values.namespace }}
 spec:
   groups:
-  - name: aws-cloud-components
+  - name: cloud-provider-controller
     rules:
     - alert: FluxHelmReleaseFailed
       annotations:
         description: |-
           {{`Flux HelmRelease {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}/{{ $labels.cluster_id }} is stuck in Failed state.`}}
         opsrecipe: fluxcd-failing-helmrelease/
-      expr: gotk_reconcile_condition{type="Ready", status="False", kind="HelmRelease", cluster_type="management_cluster", exported_namespace!="flux-giantswarm", name=~".*(aws-ebs-csi-driver|cloud-provider-aws)"} > 0
+      # Render the YAML list from the cloudProviderControllerComponents template as a parenthesised, |-separated regex alternation: fromYamlArray parses the top-level list (fromYaml only handles maps), and the group makes the leading .* apply to every component.
+      expr: gotk_reconcile_condition{type="Ready", status="False", kind="HelmRelease", cluster_type="management_cluster", exported_namespace!="flux-giantswarm", name=~".*({{ include "cloudProviderControllerComponents" . | fromYamlArray | join "|" }})"} > 0
       for: 20m
       labels:
         area: kaas
@@ -25,7 +36,7 @@ spec:
         cancel_if_kube_state_metrics_down: "true"
         cancel_if_monitoring_agent_down: "true"
         severity: page
-        team: phoenix
+        team: {{ include "providerTeam" . }}
         topic: managementcluster
         namespace: |-
           {{`{{ $labels.exported_namespace }}`}}