From 08fe22119882d24ef23a454df5e34c05b1bcc7f6 Mon Sep 17 00:00:00 2001
From: Herve Nicol <12008875+hervenicol@users.noreply.github.com>
Date: Mon, 21 Oct 2024 17:24:57 +0200
Subject: [PATCH] Add InhibitionClusterWithoutWorkerNodes for CAPA

---
 CHANGELOG.md                                 |  4 ++
 .../alerting-rules/capa.inhibition.rules.yml | 34 ++++++++++++
 .../capa.inhibition.rules.test.yml           | 52 +++++++++++++++++++
 3 files changed, 90 insertions(+)
 create mode 100644 helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.inhibition.rules.yml
 create mode 100644 test/tests/providers/capi/capa-mimir/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9d8c2850..6d5540b3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Added InhibitionClusterWithoutWorkerNodes for CAPA
+
 ### Changed
 
 - Modify `KyvernoWebhookHasNoAvailableReplicas` to check specifically for Kyverno resource webhook.
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.inhibition.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.inhibition.rules.yml
new file mode 100644
index 00000000..b5767549
--- /dev/null
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.inhibition.rules.yml
@@ -0,0 +1,34 @@
+{{- if eq .Values.managementCluster.provider.kind "capa" }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+    cluster_type: "management_cluster"
+  name: capa.inhibitions.rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  - name: capa.inhibitions
+    rules:
+    - alert: InhibitionClusterWithoutWorkerNodes
+      annotations:
+        description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}'
+      expr: |-
+        label_replace(
+          capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
+          "cluster_id",
+          "$1",
+          "name",
+          "(.*)"
+        ) == 1
+        unless on (cluster_id) (
+          sum(capi_machinepool_spec_replicas{} > 0) by (cluster_id)
+        )
+      labels:
+        area: kaas
+        has_worker_nodes: "false"
+        team: phoenix
+        topic: status
+{{- end }}
diff --git a/test/tests/providers/capi/capa-mimir/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml b/test/tests/providers/capi/capa-mimir/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml
new file mode 100644
index 00000000..02a12a05
--- /dev/null
+++ b/test/tests/providers/capi/capa-mimir/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml
@@ -0,0 +1,52 @@
+---
+rule_files:
+- capa.inhibition.rules.yml
+
+tests:
+  # Tests for `InhibitionClusterWithoutWorkerNodes` inhibition alert
+  - interval: 1m
+    input_series:
+      # Control plane of cluster "golem" reports ready for the whole test.
+      - series: 'capi_cluster_status_condition{cluster_id="golem", cluster_type="management_cluster", name="golem", pipeline="testing", status="True", type="ControlPlaneReady"}'
+        values: "1+0x300"
+      # Machine pool replicas: no sample at first, then 0, then 3.
+      - series: 'capi_machinepool_spec_replicas{cluster_id="golem", cluster_name="golem", cluster_type="management_cluster", customer="giantswarm", installation="golem", organization="giantswarm", pipeline="testing", provider="capa"}'
+        values: "_x60 0x60 3x60"
+    alert_rule_test:
+      # No machine pool metric yet: nothing suppresses the alert, it fires.
+      - alertname: InhibitionClusterWithoutWorkerNodes
+        eval_time: 30m
+        exp_alerts:
+          - exp_labels:
+              area: kaas
+              cluster_id: golem
+              cluster_type: management_cluster
+              has_worker_nodes: "false"
+              name: golem
+              pipeline: testing
+              status: "True"
+              team: phoenix
+              topic: status
+              type: ControlPlaneReady
+            exp_annotations:
+              description: "Cluster (golem) has no worker nodes."
+      # Machine pool reports 0 replicas: filtered out by `> 0`, alert fires.
+      - alertname: InhibitionClusterWithoutWorkerNodes
+        eval_time: 90m
+        exp_alerts:
+          - exp_labels:
+              area: kaas
+              cluster_id: golem
+              cluster_type: management_cluster
+              has_worker_nodes: "false"
+              name: golem
+              pipeline: testing
+              status: "True"
+              team: phoenix
+              topic: status
+              type: ControlPlaneReady
+            exp_annotations:
+              description: "Cluster (golem) has no worker nodes."
+      # Machine pool reports 3 replicas: the inhibition must not fire.
+      - alertname: InhibitionClusterWithoutWorkerNodes
+        eval_time: 150m