From c3d9f2a1a5d909ce27bcbd3fc63a8d39bc536b02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Herv=C3=A9=20Nicol?=
Date: Tue, 22 Oct 2024 09:55:26 +0200
Subject: [PATCH] Add InhibitionClusterWithoutWorkerNodes for CAPA (#1397)

Co-authored-by: Herve Nicol <12008875+hervenicol@users.noreply.github.com>
Co-authored-by: Quentin Bisson
---
Reviewer notes (placed after the three-dash line, so they are not part of
the commit message or of the applied diff):

* The new PrometheusRule is only rendered when
  .Values.managementCluster.provider.kind equals "capa".
* The alert expression selects capi_cluster_status_condition series with
  type="ControlPlaneReady", status="True" and value 1, overwrites their
  cluster_id label with the value of the name label, and then drops every
  cluster_id for which capi_machinepool_spec_replicas has a series > 0.
* The alert carries the label has_worker_nodes="false". How Alertmanager
  consumes that label is not visible in this patch (presumably as an
  inhibition source; verify against the Alertmanager configuration).
* The unit test expects the alert at 30m and 90m; the 150m case lists no
  exp_alerts, i.e. it expects the alert not to fire.

 CHANGELOG.md                                  |  4 ++
 .../alerting-rules/capa.inhibition.rules.yml  | 34 ++++++++++++++
 .../capa.inhibition.rules.test.yml            | 47 +++++++++++++++++++
 3 files changed, 85 insertions(+)
 create mode 100644 helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.inhibition.rules.yml
 create mode 100644 test/tests/providers/capi/capa-mimir/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9d8c2850..6d5540b3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Added InhibitionClusterWithoutWorkerNodes for CAPA
+
 ### Changed
 
 - Modify `KyvernoWebhookHasNoAvailableReplicas` to check specifically for Kyverno resource webhook.
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.inhibition.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.inhibition.rules.yml
new file mode 100644
index 00000000..b5767549
--- /dev/null
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.inhibition.rules.yml
@@ -0,0 +1,34 @@
+{{- if eq .Values.managementCluster.provider.kind "capa" }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+    cluster_type: "management_cluster"
+  name: capa.inhibitions.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  - name: capa.inhibitions
+    rules:
+    - alert: InhibitionClusterWithoutWorkerNodes
+      annotations:
+        description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}'
+      expr: |-
+        label_replace(
+          capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
+          "cluster_id",
+          "$1",
+          "name",
+          "(.*)"
+        ) == 1
+        unless on (cluster_id) (
+          sum(capi_machinepool_spec_replicas{} > 0) by (cluster_id)
+        )
+      labels:
+        area: kaas
+        has_worker_nodes: "false"
+        team: phoenix
+        topic: status
+{{- end }}
diff --git a/test/tests/providers/capi/capa-mimir/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml b/test/tests/providers/capi/capa-mimir/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml
new file mode 100644
index 00000000..0615f71b
--- /dev/null
+++ b/test/tests/providers/capi/capa-mimir/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml
@@ -0,0 +1,47 @@
+---
+rule_files:
+- capa.inhibition.rules.yml
+
+tests:
+  # Tests for `InhibitionClusterWithoutWorkerNodes` inhibition alert
+  - interval: 1m
+    input_series:
+      - series: 'capi_cluster_status_condition{cluster_id="golem", cluster_type="management_cluster", name="golem", pipeline="testing", status="True", type="ControlPlaneReady"}'
+        values: "1+0x300"
+      - series: 'capi_machinepool_spec_replicas{cluster_id="golem", cluster_name="golem", cluster_type="management_cluster", customer="giantswarm", installation="golem", organization="giantswarm", pipeline="testing", provider="capa"}'
+        values: "_x60 0x60 3x60"
+    alert_rule_test:
+      - alertname: InhibitionClusterWithoutWorkerNodes
+        eval_time: 30m
+        exp_alerts:
+          - exp_labels:
+              area: kaas
+              cluster_id: "golem"
+              cluster_type: "management_cluster"
+              has_worker_nodes: "false"
+              name: "golem"
+              pipeline: "testing"
+              status: "True"
+              team: "phoenix"
+              topic: "status"
+              type: "ControlPlaneReady"
+            exp_annotations:
+              description: "Cluster (golem) has no worker nodes."
+      - alertname: InhibitionClusterWithoutWorkerNodes
+        eval_time: 90m
+        exp_alerts:
+          - exp_labels:
+              area: kaas
+              cluster_id: "golem"
+              cluster_type: "management_cluster"
+              has_worker_nodes: "false"
+              name: "golem"
+              pipeline: "testing"
+              status: "True"
+              team: "phoenix"
+              topic: "status"
+              type: "ControlPlaneReady"
+            exp_annotations:
+              description: "Cluster (golem) has no worker nodes."
+      - alertname: InhibitionClusterWithoutWorkerNodes
+        eval_time: 150m