From 1b2316e3720ccfcc5278c92e21ea4b1494352148 Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Fri, 29 Sep 2023 12:42:23 +0200 Subject: [PATCH 1/2] handover VPA alert to turtle/phoenix --- CHANGELOG.md | 2 ++ .../templates/alerting-rules/vpa.all.rules.yml | 4 ++-- .../templates/recording-rules/service-level.rules.yml | 8 ++++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 979164628..a4a500222 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Change ownership from Atlas to Turtles/Phoenix for all vertical pod autoscaler alerts. + ### Changed - Handover cert-manager alerts to BigMac diff --git a/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml index 7ec84de1f..8f02dcf68 100644 --- a/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml @@ -19,7 +19,7 @@ spec: 1 - sum(increase(kube_pod_container_status_restarts_total{container="vertical-pod-autoscaler-app"}[10m])) by (container, cluster_id, cluster_type, customer, installation, pipeline, provider, region)/100 < 0.98 for: 10m labels: - area: managedservices + area: kaas cancel_if_apiserver_down: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" @@ -27,5 +27,5 @@ spec: cancel_if_scrape_timeout: "true" cancel_if_outside_working_hours: "true" severity: page - team: atlas + team: phoenix topic: observability diff --git a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml index 1accd0203..f289a5738 100644 --- a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml @@ -331,10 +331,10 @@ spec: # -- VPA # Amount of requests for VPA - - expr: label_replace(count(up{app=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id), "label_application_giantswarm_io_team", "atlas", "", "") + - expr: label_replace(count(up{app=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id), "label_application_giantswarm_io_team", "phoenix", "", "") labels: class: MEDIUM - area: platform + area: kaas service: vertical-pod-autoscaler record: raw_slo_requests @@ -344,10 +344,10 @@ spec: # and summed with 1 so the final result is 0 : no error recorded. # If up was unsuccessful, there is an error. Up returns 0, multiplied by -1 and summed # with 1 so the final result is 1 : 1 error is recorded . - - expr: label_replace(sum((up{app=~'vertical-pod-autoscaler.*'} * -1) + 1) by (cluster_id, cluster_type), "label_application_giantswarm_io_team", "atlas", "", "") + - expr: label_replace(sum((up{app=~'vertical-pod-autoscaler.*'} * -1) + 1) by (cluster_id, cluster_type), "label_application_giantswarm_io_team", "phoenix", "", "") labels: class: MEDIUM - area: platform + area: kaas service: vertical-pod-autoscaler record: raw_slo_errors From 6ef6e2d03900613018efefa53723ee32b744dfab Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Fri, 29 Sep 2023 14:15:22 +0200 Subject: [PATCH 2/2] Add WorkloadClusterWebhookDurationExceedsTimeoutPhoenix for vpa webhook and remove it from atlas --- .../apiserver.workload-cluster.rules.yml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml index 8964d3970..8f1ab06a9 100644 --- a/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml @@ -84,12 +84,26 @@ spec: team: cabbage topic: kubernetes + # Webhooks owned by Phoenix + - alert: WorkloadClusterWebhookDurationExceedsTimeoutPhoenix + annotations: + description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' + opsrecipe: apiserver-admission-webhook-errors/ + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(vpa.k8s.io).*"}[5m])) by (cluster_id, name, app, le)) > 5 + for: 15m + labels: + area: kaas + cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} + severity: page + team: phoenix + topic: kubernetes + # Webhooks owned by Atlas - alert: WorkloadClusterWebhookDurationExceedsTimeoutAtlas annotations: description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(prometheus|vpa.k8s.io).*"}[5m])) by (cluster_id, name, app, le)) > 5 + expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="workload_cluster", name=~".*(prometheus).*"}[5m])) by (cluster_id, name, app, le)) > 5 for: 15m labels: area: kaas