From 1b2316e3720ccfcc5278c92e21ea4b1494352148 Mon Sep 17 00:00:00 2001
From: Theo Brigitte <theo.brigitte@gmail.com>
Date: Fri, 29 Sep 2023 12:42:23 +0200
Subject: [PATCH 1/2] handover VPA alert to turtle/phoenix

---
 CHANGELOG.md                                              | 1 +
 .../templates/alerting-rules/vpa.all.rules.yml            | 4 ++--
 .../templates/recording-rules/service-level.rules.yml     | 8 ++++----
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 979164628..a4a500222 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
 ### Changed
 
+- Change ownership from Atlas to Turtles/Phoenix for all vertical pod autoscaler alerts.
 - Handover cert-manager alerts to BigMac
diff --git a/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml
index 7ec84de1f..8f02dcf68 100644
--- a/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml
@@ -19,7 +19,7 @@ spec:
         1 - sum(increase(kube_pod_container_status_restarts_total{container="vertical-pod-autoscaler-app"}[10m])) by (container, cluster_id, cluster_type, customer, installation, pipeline, provider, region)/100 < 0.98
       for: 10m
       labels:
-        area: managedservices
+        area: kaas
         cancel_if_apiserver_down: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
@@ -27,5 +27,5 @@ spec:
         cancel_if_scrape_timeout: "true"
         cancel_if_outside_working_hours: "true"
         severity: page
-        team: atlas
+        team: phoenix
         topic: observability
diff --git a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml
index 1accd0203..f289a5738 100644
--- a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml
+++ b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml
@@ -331,10 +331,10 @@ spec:
 
     # -- VPA
     # Amount of requests for VPA
-    - expr: label_replace(count(up{app=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id), "label_application_giantswarm_io_team", "atlas", "", "")
+    - expr: label_replace(count(up{app=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id), "label_application_giantswarm_io_team", "phoenix", "", "")
       labels:
         class: MEDIUM
-        area: platform
+        area: kaas
         service: vertical-pod-autoscaler
       record: raw_slo_requests
 
@@ -344,10 +344,10 @@ spec:
     # and summed with 1 so the final result is 0 : no error recorded.
     # If up was unsuccessful, there is an error. Up returns 0, multiplied by -1 and summed
     # with 1 so the final result is 1 : 1 error is recorded .
-    - expr: label_replace(sum((up{app=~'vertical-pod-autoscaler.*'} * -1) + 1) by (cluster_id, cluster_type), "label_application_giantswarm_io_team", "atlas", "", "")
+    - expr: label_replace(sum((up{app=~'vertical-pod-autoscaler.*'} * -1) + 1) by (cluster_id, cluster_type), "label_application_giantswarm_io_team", "phoenix", "", "")
       labels:
         class: MEDIUM
-        area: platform
+        area: kaas
         service: vertical-pod-autoscaler
       record: raw_slo_errors
 

From 6ef6e2d03900613018efefa53723ee32b744dfab Mon Sep 17 00:00:00 2001
From: Theo Brigitte <theo.brigitte@gmail.com>
Date: Fri, 29 Sep 2023 14:15:22 +0200
Subject: [PATCH 2/2] Add WorkloadClusterWebhookDurationExceedsTimeoutPhoenix
 for vpa webhook and remove it from atlas

---
 .../apiserver.workload-cluster.rules.yml         | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml
index 8964d3970..8f1ab06a9 100644
--- a/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml
@@ -84,12 +84,26 @@ spec:
         team: cabbage
         topic: kubernetes
 
+      # Webhooks owned by Phoenix
+    - alert: WorkloadClusterWebhookDurationExceedsTimeoutPhoenix
+      annotations:
+        description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}'
+        opsrecipe: apiserver-admission-webhook-errors/
+      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(vpa.k8s.io).*"}[5m])) by (cluster_id, name, app, le)) > 5
+      for: 15m
+      labels:
+        area: kaas
+        cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
+        severity: page
+        team: phoenix
+        topic: kubernetes
+
       # Webhooks owned by Atlas
     - alert: WorkloadClusterWebhookDurationExceedsTimeoutAtlas
       annotations:
         description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}'
         opsrecipe: apiserver-admission-webhook-errors/
-      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(prometheus|vpa.k8s.io).*"}[5m])) by (cluster_id, name, app, le)) > 5
+      expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(prometheus).*"}[5m])) by (cluster_id, name, app, le)) > 5
       for: 15m
       labels:
         area: kaas