From 04969c16d4fc6f11608c89fec5cf7396853563c3 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Thu, 6 Jun 2024 11:01:52 +0200 Subject: [PATCH] review-phoenix-inhibitions (#1212) --- CHANGELOG.md | 5 ++ .../alerting-rules/inhibit.all.rules.yml | 60 ------------------- ... inhibit.aws.management-cluster.rules.yml} | 17 +++--- .../alerting-rules/inhibit.kiam.rules.yml | 25 ++++++++ .../cluster-autoscaler.rules.yml | 15 +++-- .../alerting-rules/inhibit.capi.rules.yml | 30 ++++++++++ .../alerting-rules/inhibit.kubelet.rules.yml | 22 +++++++ .../vertical-pod-autoscaler.rules.yml} | 8 +-- .../alerting-rules/inhibit.oncall.rules.yml | 21 +++++++ .../alerting-rules/network.all.rules.yml | 1 - 10 files changed, 123 insertions(+), 81 deletions(-) delete mode 100644 helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.all.rules.yml rename helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/{inhibit.management-cluster.rules.yml => inhibit.aws.management-cluster.rules.yml} (90%) create mode 100644 helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml rename helm/prometheus-rules/templates/kaas/{phoenix => turtles}/alerting-rules/cluster-autoscaler.rules.yml (78%) create mode 100644 helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.capi.rules.yml create mode 100644 helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.kubelet.rules.yml rename helm/prometheus-rules/templates/kaas/{phoenix/alerting-rules/vpa.all.rules.yml => turtles/alerting-rules/vertical-pod-autoscaler.rules.yml} (91%) create mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.oncall.rules.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 69ffe1932..a31386edc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added a new alerting rule to `falco.rules.yml` to fire an alert for XZ-backdoor. - Add `CiliumAPITooSlow`. +### Changed + +- Review phoenix alerts towards Mimir. +- Moves cluster-autoscaler and vpa alerts to turtles. + ### Fixed - Fix cabbage alerts for multi-provider wcs. diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.all.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.all.rules.yml deleted file mode 100644 index 734d0121c..000000000 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.all.rules.yml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - creationTimestamp: null - labels: - {{- include "labels.common" . | nindent 4 }} - name: inhibit.all.rules - namespace: {{ .Values.namespace }} -spec: - groups: - - name: inhibit.all - rules: - - alert: InhibitionOutsideWorkingHours - annotations: - description: '{{`Fires outside working hours.`}}' - expr: (hour() <= 7 or hour() >= 16) or (day_of_week() > 5 or day_of_week() < 1) - labels: - area: empowerment - nodes_down: "true" - outside_working_hours: "true" - team: phoenix - topic: monitoring - - alert: InhibitionKubeletDown - expr: label_replace(up{app="kubelet"}, "ip", "$1", "instance", "(.+):\\d+") == 0 - labels: - kubelet_down: "true" - area: kaas - topic: kubernetes - annotations: - description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}' - # TODO(@team-turtles): fix with real expr - - alert: ScrapeTimeout - annotations: - description: '{{`Never fires (dummy alert).`}}' - expr: vector(0) > 1 - labels: - area: empowerment - scrape_timeout: "true" - team: phoenix - topic: monitoring - {{- if (eq .Values.managementCluster.provider.kind "aws") }} - - alert: InhibitionClusterWithoutWorkerNodes - annotations: - description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}' - expr: sum(aws_operator_asg_desired_count) by (cluster_id) - on(cluster_id) sum(aws_operator_asg_desired_count{asg=~".*-tccpn-.*"}) by (cluster_id) == 0 - labels: - area: kaas - has_worker_nodes: "false" - team: phoenix - topic: status - - alert: InhibitionKiamErrors - annotations: - description: '{{`Kiam on cluster {{ $labels.cluster_id }} has increased error rate.`}}' - expr: increase(kiam_metadata_credential_fetch_errors_total[10m]) > 0 or increase(kiam_metadata_find_role_errors_total[10m]) > 0 or increase(kiam_sts_issuing_errors_total[10m]) > 0 - labels: - area: kaas - kiam_has_errors: "true" - team: phoenix - topic: kiam - {{- end }} diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.aws.management-cluster.rules.yml similarity index 90% rename from helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.management-cluster.rules.yml rename to helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.aws.management-cluster.rules.yml index 0bfc3fe2e..b29069f6e 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.aws.management-cluster.rules.yml @@ -1,3 +1,5 @@ +## TODO Remove with vintage +# This rule applies to vintage aws management clusters {{- if eq .Values.managementCluster.provider.flavor "vintage" }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -5,14 +7,13 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + # No need for .Values.mimir.enabled condition - will be gone with Vintage cluster_type: "management_cluster" -{{- end }} - name: inhibit.management-cluster.rules + name: inhibit.aws.management-cluster.rules namespace: {{ .Values.namespace }} spec: groups: - - name: inhibit.management-cluster + - name: inhibit.aws.management-cluster rules: - alert: InhibitionClusterStatusCreating annotations: @@ -95,13 +96,13 @@ spec: instance_state_not_running: "true" team: phoenix topic: status - - alert: InhibitionControlPlaneUnhealthy + - alert: InhibitionClusterWithoutWorkerNodes annotations: - description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}' - expr: capi_kubeadmcontrolplane_status_condition{type="ControlPlaneComponentsHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="EtcdClusterHealthy", status="False"} == 1 or capi_kubeadmcontrolplane_status_condition{type="Available", status="False"} == 1 + description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}' + expr: sum(aws_operator_asg_desired_count) by (cluster_id) - on(cluster_id) sum(aws_operator_asg_desired_count{asg=~".*-tccpn-.*"}) by (cluster_id) == 0 labels: area: kaas - cluster_control_plane_unhealthy: "true" + has_worker_nodes: "false" team: phoenix topic: status {{- end }} diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml new file mode 100644 index 000000000..fe8678e35 --- /dev/null +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/inhibit.kiam.rules.yml @@ -0,0 +1,25 @@ +## TODO Remove with vintage +# This rule applies to vintage aws clusters +{{- if eq .Values.managementCluster.provider.flavor "vintage" }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + creationTimestamp: null + labels: + {{- include "labels.common" . | nindent 4 }} + name: inhibit.kiam.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: inhibit.kiam + rules: + - alert: InhibitionKiamErrors + annotations: + description: '{{`Kiam on cluster {{ $labels.cluster_id }} has increased error rate.`}}' + expr: increase(kiam_metadata_credential_fetch_errors_total[10m]) > 0 or increase(kiam_metadata_find_role_errors_total[10m]) > 0 or increase(kiam_sts_issuing_errors_total[10m]) > 0 + labels: + area: kaas + kiam_has_errors: "true" + team: phoenix + topic: kiam +{{- end }} diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cluster-autoscaler.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml similarity index 78% rename from helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cluster-autoscaler.rules.yml rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml index 313950683..c47475cb5 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cluster-autoscaler.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml @@ -1,4 +1,4 @@ -{{- if eq .Values.managementCluster.provider.kind "aws" }} +# This rule applies to all cloud workload clusters apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -18,31 +18,30 @@ spec: annotations: description: '{{`Cluster-Autoscaler on {{ $labels.cluster_id }} has unneeded nodes.`}}' opsrecipe: cluster-autoscaler-scaling/ - expr: cluster_autoscaler_unneeded_nodes_count > 0 + expr: cluster_autoscaler_unneeded_nodes_count{cluster_type="workload_cluster", provider=~"aws|capa|capz|eks"} > 0 for: 240m labels: - area: managedservices + area: kaas cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" cancel_if_cluster_has_no_workers: "true" severity: page - team: phoenix + team: turtles topic: cluster-autoscaler - alert: ClusterAutoscalerFailedScaling annotations: description: '{{`Cluster-Autoscaler on {{ $labels.cluster_id }} has failed scaling up.`}}' opsrecipe: cluster-autoscaler-scaling/ - expr: increase(cluster_autoscaler_failed_scale_ups_total[5m]) > 1 + expr: increase(cluster_autoscaler_failed_scale_ups_total{cluster_type="workload_cluster", provider=~"aws|capa|capz|eks"}[5m]) > 1 for: 15m labels: - area: managedservices + area: kaas cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" severity: page - team: phoenix + team: turtles topic: cluster-autoscaler -{{- end }} diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.capi.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.capi.rules.yml new file mode 100644 index 000000000..354db1a61 --- /dev/null +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.capi.rules.yml @@ -0,0 +1,30 @@ +# This rule applies to all capi management clusters +{{- if eq .Values.managementCluster.provider.flavor "capi" }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + creationTimestamp: null + labels: + {{- include "labels.common" . | nindent 4 }} +{{- if not .Values.mimir.enabled }} + cluster_type: "management_cluster" +{{- end }} + name: inhibit.capi.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: inhibit.capi + rules: + - alert: InhibitionControlPlaneUnhealthy + annotations: + description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}' + expr: |- + capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="ControlPlaneComponentsHealthy", status="False"} == 1 + or capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="EtcdClusterHealthy", status="False"} == 1 + or capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="Available", status="False"} == 1 + labels: + area: kaas + cluster_control_plane_unhealthy: "true" + team: turtles + topic: status +{{- end }} diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.kubelet.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.kubelet.rules.yml new file mode 100644 index 000000000..3a2653732 --- /dev/null +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.kubelet.rules.yml @@ -0,0 +1,22 @@ +# This rule applies to all clusters +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + creationTimestamp: null + labels: + {{- include "labels.common" . | nindent 4 }} + name: inhibit.kubelet.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: inhibit.kubelet + rules: + - alert: InhibitionKubeletDown + annotations: + description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}' + expr: label_replace(up{app="kubelet"}, "ip", "$1", "instance", "(.+):\\d+") == 0 + labels: + kubelet_down: "true" + area: kaas + topic: kubernetes + team: turtles diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/vpa.all.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/vertical-pod-autoscaler.rules.yml similarity index 91% rename from helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/vpa.all.rules.yml rename to helm/prometheus-rules/templates/kaas/turtles/alerting-rules/vertical-pod-autoscaler.rules.yml index 23945ff8f..2bb8784d8 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/vpa.all.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/vertical-pod-autoscaler.rules.yml @@ -3,11 +3,11 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} - name: vpa.all.rules + name: vertical-pod-autoscaler.rules namespace: {{ .Values.namespace }} spec: groups: - - name: vpa + - name: vertical-pod-autoscaler rules: - alert: VpaComponentTooManyRestarts annotations: @@ -27,5 +27,5 @@ spec: cancel_if_scrape_timeout: "true" cancel_if_outside_working_hours: "true" severity: page - team: phoenix - topic: observability + team: turtles + topic: autoscaling diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.oncall.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.oncall.rules.yml new file mode 100644 index 000000000..02d15d9f1 --- /dev/null +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.oncall.rules.yml @@ -0,0 +1,21 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + creationTimestamp: null + labels: + {{- include "labels.common" . | nindent 4 }} + name: inhibit.oncall.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: inhibit.oncall + rules: + - alert: InhibitionOutsideWorkingHours + annotations: + description: '{{`Fires outside working hours.`}}' + expr: (hour() <= 7 or hour() >= 16) or (day_of_week() > 5 or day_of_week() < 1) + labels: + area: platform + outside_working_hours: "true" + team: atlas + topic: monitoring diff --git a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/network.all.rules.yml b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/network.all.rules.yml index fbee4e53a..140673d5c 100644 --- a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/network.all.rules.yml +++ b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/network.all.rules.yml @@ -59,7 +59,6 @@ spec: cancel_if_cluster_with_scaling_nodepools: "true" cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} cancel_if_cluster_has_no_workers: "true" - cancel_if_nodes_down: "true" severity: page team: {{ include "providerTeam" . }} topic: network