From 16be638f80dcede8561b37f552d33fad993e029d Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Mon, 21 Aug 2023 17:22:42 +0100 Subject: [PATCH 1/8] move kube-state-metrics alerts from atlas to turtles --- .../kube-state-metrics.rules.yml | 22 +++++++++---------- .../templates/alerting-rules/up.all.rules.yml | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml index 39a2fd571..f5719d149 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml @@ -26,7 +26,7 @@ spec: cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: "true" severity: page - team: atlas + team: turtles topic: observability - alert: KubeConfigMapCreatedMetricMissing annotations: @@ -39,7 +39,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: atlas + team: turtles topic: kubernetes - alert: KubeDaemonSetCreatedMetricMissing annotations: @@ -52,7 +52,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: atlas + team: turtles topic: kubernetes - alert: KubeDeploymentCreatedMetricMissing annotations: @@ -65,7 +65,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: atlas + team: turtles topic: kubernetes - alert: KubeEndpointCreatedMetricMissing annotations: @@ -78,7 +78,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: atlas + team: turtles topic: kubernetes - alert: KubeNamespaceCreatedMetricMissing annotations: @@ -91,7 +91,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: atlas + team: turtles topic: kubernetes - alert: KubeNodeCreatedMetricMissing annotations: @@ -104,7 +104,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: atlas + team: turtles topic: kubernetes - alert: KubePodCreatedMetricMissing annotations: @@ -117,7 +117,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: atlas + team: turtles topic: kubernetes - alert: KubeReplicaSetCreatedMetricMissing annotations: @@ -130,7 +130,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: atlas + team: turtles topic: kubernetes - alert: KubeSecretCreatedMetricMissing annotations: @@ -143,7 +143,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: atlas + team: turtles topic: kubernetes - alert: KubeServiceCreatedMetricMissing annotations: @@ -156,5 +156,5 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: atlas + team: turtles topic: kubernetes diff --git a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml index afd099527..7dfddd54f 100644 --- a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml @@ -74,5 +74,5 @@ spec: cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page - team: atlas + team: turtles topic: observability From 946d53cd2e830a6681a36977fb5efad32a052295 Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Mon, 21 Aug 2023 17:23:55 +0100 Subject: [PATCH 2/8] move vertical-pod-autoscaler alerts from atlas to turtles --- .../templates/alerting-rules/vpa.all.rules.yml | 2 +- .../templates/recording-rules/service-level.rules.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml index 7ec84de1f..b60604f38 100644 --- a/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml @@ -27,5 +27,5 @@ spec: cancel_if_scrape_timeout: "true" cancel_if_outside_working_hours: "true" severity: page - team: atlas + team: turtles topic: observability diff --git a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml index 1accd0203..b95988fd7 100644 --- a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml @@ -331,7 +331,7 @@ spec: # -- VPA # Amount of requests for VPA - - expr: label_replace(count(up{app=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id), "label_application_giantswarm_io_team", "atlas", "", "") + - expr: label_replace(count(up{app=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id), "label_application_giantswarm_io_team", "turtles", "", "") labels: class: MEDIUM area: platform @@ -344,7 +344,7 @@ spec: # and summed with 1 so the final result is 0 : no error recorded. # If up was unsuccessful, there is an error. Up returns 0, multiplied by -1 and summed # with 1 so the final result is 1 : 1 error is recorded . - - expr: label_replace(sum((up{app=~'vertical-pod-autoscaler.*'} * -1) + 1) by (cluster_id, cluster_type), "label_application_giantswarm_io_team", "atlas", "", "") + - expr: label_replace(sum((up{app=~'vertical-pod-autoscaler.*'} * -1) + 1) by (cluster_id, cluster_type), "label_application_giantswarm_io_team", "turtles", "", "") labels: class: MEDIUM area: platform From bdc1b22b7058bab4e57a2edb4ca7c5dc5d785551 Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Mon, 21 Aug 2023 17:40:19 +0100 Subject: [PATCH 3/8] update CHANGELOG --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c20ce3fab..7a4e5bdee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Change ownership from Atlas to Turtles for all vertical pod autoscaler and kube state metrics related alerts. + ## [2.127.0] - 2023-08-21 ### Changed From 59a15fe3894c159ad606d032181aebd1303856af Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Tue, 29 Aug 2023 15:43:23 +0100 Subject: [PATCH 4/8] replace turtles with phoenix --- .../kube-state-metrics.rules.yml | 22 +++++++++---------- .../templates/alerting-rules/up.all.rules.yml | 2 +- .../alerting-rules/vpa.all.rules.yml | 2 +- .../recording-rules/service-level.rules.yml | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml index f5719d149..ca4b3dc67 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml @@ -26,7 +26,7 @@ spec: cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: "true" severity: page - team: turtles + team: phoenix topic: observability - alert: KubeConfigMapCreatedMetricMissing annotations: @@ -39,7 +39,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: turtles + team: phoenix topic: kubernetes - alert: KubeDaemonSetCreatedMetricMissing annotations: @@ -52,7 +52,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: turtles + team: phoenix topic: kubernetes - alert: KubeDeploymentCreatedMetricMissing annotations: @@ -65,7 +65,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: turtles + team: phoenix topic: kubernetes - alert: KubeEndpointCreatedMetricMissing annotations: @@ -78,7 +78,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: turtles + team: phoenix topic: kubernetes - alert: KubeNamespaceCreatedMetricMissing annotations: @@ -91,7 +91,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: turtles + team: phoenix topic: kubernetes - alert: KubeNodeCreatedMetricMissing annotations: @@ -104,7 +104,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: turtles + team: phoenix topic: kubernetes - alert: KubePodCreatedMetricMissing annotations: @@ -117,7 +117,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: turtles + team: phoenix topic: kubernetes - alert: KubeReplicaSetCreatedMetricMissing annotations: @@ -130,7 +130,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: turtles + team: phoenix topic: kubernetes - alert: KubeSecretCreatedMetricMissing annotations: @@ -143,7 +143,7 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: turtles + team: phoenix topic: kubernetes - alert: KubeServiceCreatedMetricMissing annotations: @@ -156,5 +156,5 @@ spec: cancel_if_outside_working_hours: "true" cancel_if_kube_state_metrics_down: "true" severity: page - team: turtles + team: phoenix topic: kubernetes diff --git a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml index 7dfddd54f..86dc4d335 100644 --- a/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/up.all.rules.yml @@ -74,5 +74,5 @@ spec: cancel_if_kubelet_down: "true" cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} severity: page - team: turtles + team: phoenix topic: observability diff --git a/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml index b60604f38..136570986 100644 --- a/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml @@ -27,5 +27,5 @@ spec: cancel_if_scrape_timeout: "true" cancel_if_outside_working_hours: "true" severity: page - team: turtles + team: phoenix topic: observability diff --git a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml index b95988fd7..be04def44 100644 --- a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml @@ -331,7 +331,7 @@ spec: # -- VPA # Amount of requests for VPA - - expr: label_replace(count(up{app=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id), "label_application_giantswarm_io_team", "turtles", "", "") + - expr: label_replace(count(up{app=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id), "label_application_giantswarm_io_team", "phoenix", "", "") labels: class: MEDIUM area: platform @@ -344,7 +344,7 @@ spec: # and summed with 1 so the final result is 0 : no error recorded. # If up was unsuccessful, there is an error. Up returns 0, multiplied by -1 and summed # with 1 so the final result is 1 : 1 error is recorded . - - expr: label_replace(sum((up{app=~'vertical-pod-autoscaler.*'} * -1) + 1) by (cluster_id, cluster_type), "label_application_giantswarm_io_team", "turtles", "", "") + - expr: label_replace(sum((up{app=~'vertical-pod-autoscaler.*'} * -1) + 1) by (cluster_id, cluster_type), "label_application_giantswarm_io_team", "phoenix", "", "") labels: class: MEDIUM area: platform From b15a610ab2b2bd872897f0a06813c795435aff1c Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Thu, 31 Aug 2023 11:15:38 +0200 Subject: [PATCH 5/8] fix KSMDown unit test --- test/tests/providers/global/up.all.rules.test.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/tests/providers/global/up.all.rules.test.yml b/test/tests/providers/global/up.all.rules.test.yml index 88ed88926..073363322 100644 --- a/test/tests/providers/global/up.all.rules.test.yml +++ b/test/tests/providers/global/up.all.rules.test.yml @@ -58,7 +58,7 @@ tests: cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" severity: "page" - team: "atlas" + team: "phoenix" topic: "observability" exp_annotations: description: "KubeStateMetrics () is down." @@ -80,7 +80,7 @@ tests: cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" severity: "page" - team: "atlas" + team: "phoenix" topic: "observability" exp_annotations: description: "KubeStateMetrics () is down." @@ -108,7 +108,7 @@ tests: cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" severity: "page" - team: "atlas" + team: "phoenix" topic: "observability" exp_annotations: description: "KubeStateMetrics () is down." @@ -161,7 +161,7 @@ tests: cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" severity: "page" - team: "atlas" + team: "phoenix" topic: "observability" exp_annotations: description: "KubeStateMetrics () is down." @@ -183,7 +183,7 @@ tests: cancel_if_outside_working_hours: "false" inhibit_kube_state_metrics_down: "true" severity: "page" - team: "atlas" + team: "phoenix" topic: "observability" exp_annotations: description: "KubeStateMetrics () is down." From 9a8c3590e5b00661e7da822cce830796457ba45c Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Mon, 11 Sep 2023 16:47:09 +0200 Subject: [PATCH 6/8] update area to kaas --- .../templates/alerting-rules/vpa.all.rules.yml | 2 +- .../templates/recording-rules/service-level.rules.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml index 136570986..8f02dcf68 100644 --- a/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/vpa.all.rules.yml @@ -19,7 +19,7 @@ spec: 1 - sum(increase(kube_pod_container_status_restarts_total{container="vertical-pod-autoscaler-app"}[10m])) by (container, cluster_id, cluster_type, customer, installation, pipeline, provider, region)/100 < 0.98 for: 10m labels: - area: managedservices + area: kaas cancel_if_apiserver_down: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" diff --git a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml index be04def44..f289a5738 100644 --- a/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml +++ b/helm/prometheus-rules/templates/recording-rules/service-level.rules.yml @@ -334,7 +334,7 @@ spec: - expr: label_replace(count(up{app=~'vertical-pod-autoscaler.*'}) by (cluster_type,cluster_id), "label_application_giantswarm_io_team", "phoenix", "", "") labels: class: MEDIUM - area: platform + area: kaas service: vertical-pod-autoscaler record: raw_slo_requests @@ -347,7 +347,7 @@ spec: - expr: label_replace(sum((up{app=~'vertical-pod-autoscaler.*'} * -1) + 1) by (cluster_id, cluster_type), "label_application_giantswarm_io_team", "phoenix", "", "") labels: class: MEDIUM - area: platform + area: kaas service: vertical-pod-autoscaler record: raw_slo_errors From 7d3fcafd5208922183a3b293f976ef2d4d7dacac Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Tue, 16 Apr 2024 13:16:48 +0200 Subject: [PATCH 7/8] update CHANGELOG --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d3fdc7ac..e7fd9ba6c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Change ownership from Atlas to Phoenix for all kube state metrics related alerts. + ## [3.11.0] - 2024-04-15 ### Added From 8e96b2eeb34111257e76a208e369fc2a3d722192 Mon Sep 17 00:00:00 2001 From: Theo Brigitte Date: Tue, 16 Apr 2024 13:17:25 +0200 Subject: [PATCH 8/8] update CHANGELOG --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e7fd9ba6c..97f1a2597 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -496,7 +496,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Change ownership from Atlas to Turtles/Phoenix for all vertical pod autoscaler and kube state metrics related alerts. - Loki alerts only during working hours - `PrometheusAgentFailing` does not rely on KSM metrics anymore - Prometheus-agent inhibition rework, run on the MC