From 04969c16d4fc6f11608c89fec5cf7396853563c3 Mon Sep 17 00:00:00 2001
From: Quentin Bisson <quentin@giantswarm.io>
Date: Thu, 6 Jun 2024 11:01:52 +0200
Subject: [PATCH] review-phoenix-inhibitions (#1212)

---
 CHANGELOG.md                                  |  5 ++
 .../alerting-rules/inhibit.all.rules.yml      | 60 -------------------
 ... inhibit.aws.management-cluster.rules.yml} | 17 +++---
 .../alerting-rules/inhibit.kiam.rules.yml     | 25 ++++++++
 .../cluster-autoscaler.rules.yml              | 15 +++--
 .../alerting-rules/inhibit.capi.rules.yml     | 30 ++++++++++
 .../alerting-rules/inhibit.kubelet.rules.yml  | 22 +++++++
 .../vertical-pod-autoscaler.rules.yml}        |  8 +--
 .../alerting-rules/inhibit.oncall.rules.yml   | 21 +++++++
 .../alerting-rules/network.all.rules.yml      |  1 -
 10 files changed, 123 insertions(+), 81 deletions(-)
 delete mode 100644 helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.all.rules.yml
 rename helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/{inhibit.management-cluster.rules.yml => inhibit.aws.management-cluster.rules.yml} (90%)
 create mode 100644 helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml
 rename helm/prometheus-rules/templates/kaas/{phoenix => turtles}/alerting-rules/cluster-autoscaler.rules.yml (78%)
 create mode 100644 helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.capi.rules.yml
 create mode 100644 helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.kubelet.rules.yml
 rename helm/prometheus-rules/templates/kaas/{phoenix/alerting-rules/vpa.all.rules.yml => turtles/alerting-rules/vertical-pod-autoscaler.rules.yml} (91%)
 create mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.oncall.rules.yml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 69ffe1932..a31386edc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added a new alerting rule to `falco.rules.yml` to fire an alert for XZ-backdoor.
 - Add `CiliumAPITooSlow`.
 
+### Changed
+
+- Review phoenix alerts towards Mimir.
+- Move cluster-autoscaler and vpa alerts to turtles.
+
 ### Fixed
 
 - Fix cabbage alerts for multi-provider wcs.
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.all.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.all.rules.yml
deleted file mode 100644
index 734d0121c..000000000
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.all.rules.yml
+++ /dev/null
@@ -1,60 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  creationTimestamp: null
-  labels:
-    {{- include "labels.common" . | nindent 4 }}
-  name: inhibit.all.rules
-  namespace: {{ .Values.namespace  }}
-spec:
-  groups:
-  - name: inhibit.all
-    rules:
-    - alert: InhibitionOutsideWorkingHours
-      annotations:
-        description: '{{`Fires outside working hours.`}}'
-      expr: (hour() <= 7 or hour() >= 16) or (day_of_week() > 5 or day_of_week() < 1)
-      labels:
-        area: empowerment
-        nodes_down: "true"
-        outside_working_hours: "true"
-        team: phoenix
-        topic: monitoring
-    - alert: InhibitionKubeletDown
-      expr: label_replace(up{app="kubelet"}, "ip", "$1", "instance", "(.+):\\d+") == 0
-      labels:
-        kubelet_down: "true"
-        area: kaas
-        topic: kubernetes
-      annotations:
-        description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}'
-    # TODO(@team-turtles): fix with real expr
-    - alert: ScrapeTimeout
-      annotations:
-        description: '{{`Never fires (dummy alert).`}}'
-      expr: vector(0) > 1
-      labels:
-        area: empowerment
-        scrape_timeout: "true"
-        team: phoenix
-        topic: monitoring
-    {{- if (eq .Values.managementCluster.provider.kind "aws") }}
-    - alert: InhibitionClusterWithoutWorkerNodes
-      annotations:
-        description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}'
-      expr: sum(aws_operator_asg_desired_count) by (cluster_id) - on(cluster_id) sum(aws_operator_asg_desired_count{asg=~".*-tccpn-.*"}) by (cluster_id) == 0
-      labels:
-        area: kaas
-        has_worker_nodes: "false"
-        team: phoenix
-        topic: status
-    - alert: InhibitionKiamErrors
-      annotations:
-        description: '{{`Kiam on cluster {{ $labels.cluster_id }} has increased error rate.`}}'
-      expr: increase(kiam_metadata_credential_fetch_errors_total[10m]) > 0 or increase(kiam_metadata_find_role_errors_total[10m]) > 0 or increase(kiam_sts_issuing_errors_total[10m]) > 0
-      labels:
-        area: kaas
-        kiam_has_errors: "true"
-        team: phoenix
-        topic: kiam
-    {{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.aws.management-cluster.rules.yml
similarity index 90%
rename from helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.management-cluster.rules.yml
rename to helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.aws.management-cluster.rules.yml
index 0bfc3fe2e..b29069f6e 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.management-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.aws.management-cluster.rules.yml
@@ -1,3 +1,5 @@
+# TODO: Remove this file once vintage is gone.
+# These inhibition rules apply to vintage AWS management clusters.
 {{- if eq .Values.managementCluster.provider.flavor "vintage" }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
@@ -5,14 +7,13 @@ metadata:
   creationTimestamp: null
   labels:
     {{- include "labels.common" . | nindent 4 }}
-{{- if not .Values.mimir.enabled }}
+    # Set unconditionally: no .Values.mimir.enabled guard is needed because this vintage-only file goes away with vintage.
     cluster_type: "management_cluster"
-{{- end }}
-  name: inhibit.management-cluster.rules
+  name: inhibit.aws.management-cluster.rules
   namespace: {{ .Values.namespace  }}
 spec:
   groups:
-  - name: inhibit.management-cluster
+  - name: inhibit.aws.management-cluster
     rules:
     - alert: InhibitionClusterStatusCreating
       annotations:
@@ -95,13 +96,13 @@ spec:
         instance_state_not_running: "true"
         team: phoenix
         topic: status
-    - alert: InhibitionControlPlaneUnhealthy
+    - alert: InhibitionClusterWithoutWorkerNodes  # per cluster_id: total ASG desired count minus that of the "-tccpn-" ASGs is 0
       annotations:
-        description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}'
-      expr: capi_kubeadmcontrolplane_status_condition{type="ControlPlaneComponentsHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="EtcdClusterHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="Available", status="False"} == 1
+        description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}'
+      expr: sum(aws_operator_asg_desired_count) by (cluster_id) - on(cluster_id) sum(aws_operator_asg_desired_count{asg=~".*-tccpn-.*"}) by (cluster_id) == 0
       labels:
         area: kaas
-        cluster_control_plane_unhealthy: "true"
+        has_worker_nodes: "false"  # presumably the label Alertmanager inhibit rules match on - confirm
         team: phoenix
         topic: status
 {{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml
new file mode 100644
index 000000000..fe8678e35
--- /dev/null
+++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml
@@ -0,0 +1,25 @@
+# TODO: Remove this file once vintage is gone.
+# This inhibition rule targets vintage AWS clusters; note the guard below checks only the "vintage" flavor.
+{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: inhibit.kiam.rules
+  namespace: {{ .Values.namespace  }}
+spec:
+  groups:
+  - name: inhibit.kiam
+    rules:
+    - alert: InhibitionKiamErrors  # fires when any of the three kiam error counters in expr increased over the last 10m
+      annotations:
+        description: '{{`Kiam on cluster {{ $labels.cluster_id }} has increased error rate.`}}'
+      expr: increase(kiam_metadata_credential_fetch_errors_total[10m]) > 0 or increase(kiam_metadata_find_role_errors_total[10m]) > 0 or increase(kiam_sts_issuing_errors_total[10m]) > 0
+      labels:
+        area: kaas
+        kiam_has_errors: "true"  # presumably the label Alertmanager inhibit rules match on - confirm
+        team: phoenix
+        topic: kiam
+{{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cluster-autoscaler.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml
similarity index 78%
rename from helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cluster-autoscaler.rules.yml
rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml
index 313950683..c47475cb5 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cluster-autoscaler.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml
@@ -1,4 +1,4 @@
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
+# These rules apply to workload clusters on cloud providers (aws, capa, capz, eks).
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -18,31 +18,30 @@ spec:
       annotations:
         description: '{{`Cluster-Autoscaler on {{ $labels.cluster_id }} has unneeded nodes.`}}'
         opsrecipe: cluster-autoscaler-scaling/
-      expr: cluster_autoscaler_unneeded_nodes_count > 0
+      expr: cluster_autoscaler_unneeded_nodes_count{cluster_type="workload_cluster", provider=~"aws|capa|capz|eks"} > 0
       for: 240m
       labels:
-        area: managedservices
+        area: kaas
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
         cancel_if_outside_working_hours: "true"
         cancel_if_cluster_has_no_workers: "true"
         severity: page
-        team: phoenix
+        team: turtles
         topic: cluster-autoscaler
     - alert: ClusterAutoscalerFailedScaling
       annotations:
         description: '{{`Cluster-Autoscaler on {{ $labels.cluster_id }} has failed scaling up.`}}'
         opsrecipe: cluster-autoscaler-scaling/
-      expr: increase(cluster_autoscaler_failed_scale_ups_total[5m]) > 1
+      expr: increase(cluster_autoscaler_failed_scale_ups_total{cluster_type="workload_cluster", provider=~"aws|capa|capz|eks"}[5m]) > 1
       for: 15m
       labels:
-        area: managedservices
+        area: kaas
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
         cancel_if_outside_working_hours: "true"
         severity: page
-        team: phoenix
+        team: turtles
         topic: cluster-autoscaler
-{{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.capi.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.capi.rules.yml
new file mode 100644
index 000000000..354db1a61
--- /dev/null
+++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.capi.rules.yml
@@ -0,0 +1,30 @@
+# This inhibition rule is rendered only when the management cluster flavor is "capi".
+{{- if eq .Values.managementCluster.provider.flavor "capi" }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+{{- if not .Values.mimir.enabled }}
+    cluster_type: "management_cluster"  # only set when Mimir is disabled
+{{- end }}
+  name: inhibit.capi.rules
+  namespace: {{ .Values.namespace  }}
+spec:
+  groups:
+  - name: inhibit.capi
+    rules:
+    - alert: InhibitionControlPlaneUnhealthy  # fires when any of the three KubeadmControlPlane conditions in expr is "False" on management_cluster series
+      annotations:
+        description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}'
+      expr: |-
+        capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="ControlPlaneComponentsHealthy", status="False"} == 1
+        or capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="EtcdClusterHealthy", status="False"} == 1
+        or capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="Available", status="False"} == 1
+      labels:
+        area: kaas
+        cluster_control_plane_unhealthy: "true"  # presumably the label Alertmanager inhibit rules match on - confirm
+        team: turtles
+        topic: status
+{{- end }}
diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.kubelet.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.kubelet.rules.yml
new file mode 100644
index 000000000..3a2653732
--- /dev/null
+++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.kubelet.rules.yml
@@ -0,0 +1,22 @@
+# This inhibition rule has no template guard, so it is rendered for all clusters.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: inhibit.kubelet.rules
+  namespace: {{ .Values.namespace  }}
+spec:
+  groups:
+  - name: inhibit.kubelet
+    rules:
+    - alert: InhibitionKubeletDown  # up{app="kubelet"} == 0, with the host part of `instance` copied into an `ip` label
+      annotations:
+        description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}'
+      expr: label_replace(up{app="kubelet"}, "ip", "$1", "instance", "(.+):\\d+") == 0
+      labels:
+        kubelet_down: "true"  # presumably the label Alertmanager inhibit rules match on - confirm
+        area: kaas
+        topic: kubernetes
+        team: turtles
diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/vpa.all.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/vertical-pod-autoscaler.rules.yml
similarity index 91%
rename from helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/vpa.all.rules.yml
rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/vertical-pod-autoscaler.rules.yml
index 23945ff8f..2bb8784d8 100644
--- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/vpa.all.rules.yml
+++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/vertical-pod-autoscaler.rules.yml
@@ -3,11 +3,11 @@ kind: PrometheusRule
 metadata:
   labels:
     {{- include "labels.common" . | nindent 4 }}
-  name: vpa.all.rules
+  name: vertical-pod-autoscaler.rules
   namespace: {{ .Values.namespace }}
 spec:
   groups:
-  - name: vpa
+  - name: vertical-pod-autoscaler
     rules:
     - alert: VpaComponentTooManyRestarts
       annotations:
@@ -27,5 +27,5 @@ spec:
         cancel_if_scrape_timeout: "true"
         cancel_if_outside_working_hours: "true"
         severity: page
-        team: phoenix
-        topic: observability
+        team: turtles
+        topic: autoscaling
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.oncall.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.oncall.rules.yml
new file mode 100644
index 000000000..02d15d9f1
--- /dev/null
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.oncall.rules.yml
@@ -0,0 +1,21 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  creationTimestamp: null
+  labels:
+    {{- include "labels.common" . | nindent 4 }}
+  name: inhibit.oncall.rules
+  namespace: {{ .Values.namespace  }}
+spec:
+  groups:
+  - name: inhibit.oncall
+    rules:
+    - alert: InhibitionOutsideWorkingHours  # fires when hour() <= 7 or >= 16, or day_of_week() is 6 or 0 (PromQL evaluates both in UTC; 0 is Sunday)
+      annotations:
+        description: '{{`Fires outside working hours.`}}'
+      expr: (hour() <= 7 or hour() >= 16) or (day_of_week() > 5 or day_of_week() < 1)
+      labels:
+        area: platform
+        outside_working_hours: "true"  # presumably paired with the cancel_if_outside_working_hours label on paging alerts - confirm
+        team: atlas
+        topic: monitoring
diff --git a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/network.all.rules.yml b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/network.all.rules.yml
index fbee4e53a..140673d5c 100644
--- a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/network.all.rules.yml
+++ b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/network.all.rules.yml
@@ -59,7 +59,6 @@ spec:
         cancel_if_cluster_with_scaling_nodepools: "true"
         cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
         cancel_if_cluster_has_no_workers: "true"
-        cancel_if_nodes_down: "true"
         severity: page
         team: {{ include "providerTeam" . }}
         topic: network