Commit 9eddb45: Review phoenix alerts

Signed-off-by: QuentinBisson <[email protected]>
QuentinBisson committed Jun 12, 2024
1 parent 21b71f1 commit 9eddb45
Showing 9 changed files with 139 additions and 72 deletions.
5 changes: 2 additions & 3 deletions CHANGELOG.md
@@ -17,15 +17,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Restrict `grafana-agent-rules` CiliumNetworkPolicy.
- Update team bigmac rules based on the label changes
- Move the management cluster certificate alerts into the shared alerts because it is provider independent
- Reviewed turtles alerts labels.
- Use `ready` replicas for Kyverno webhooks alert.
- Sort out shared alert ownership by distributing them all to teams.
- Review and fix phoenix alerts towards Mimir and multi-provider MCs.
- Move cluster-autoscaler and vpa alerts to turtles.
- Move core components alerts from phoenix to turtles (`cluster-autoscaler`, `vertical-pod-autoscaler`, `kubelet`, `etcd-kubernetes-resources-count-exporter`, `certificates`)
- Split the phoenix job alert into 2:
- Add the aws specific job alerts in the `vintage.aws.management-cluster.rules` file.
- Move the rest of job.rules to turtles because it is provider independent
- Move the rest of `job.rules` to turtles because it is provider independent
- Prefix all vintage alerts with `vintage` to facilitate maintenance.
- Merge `kiam` and `inhibit.kiam` into one file.
- Support any AWS WC in the aws-load-balancer-controller alerts.
@@ -1,19 +1,20 @@
{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
## TODO Remove when all vintage installations are gone
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
creationTimestamp: null
labels:
{{- include "labels.common" . | nindent 4 }}
# No need for .Values.mimir.enabled condition - will be gone with Vintage
{{- if not .Values.mimir.enabled }}
cluster_type: "workload_cluster"
name: aws.node.workload-cluster.rules
{{- end }}
name: node.aws.workload-cluster.rules
namespace: {{ .Values.namespace }}
spec:
groups:
- name: aws.node
- name: node.aws
rules:
{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
## TODO Remove when all vintage installations are gone
- alert: AWSWorkloadClusterNodeTooManyAutoTermination
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} has too many nodes terminated by node auto termination feature in a short time.`}}'
@@ -28,15 +29,16 @@ spec:
severity: page
team: phoenix
topic: kubernetes
{{- end }}
- alert: WorkloadClusterNodeUnexpectedTaintNodeWithImpairedVolumes
annotations:
description: '{{`Node {{ $labels.node }} has unexpected taint NodeWithImpairedVolumes`}}'
opsrecipe: aws-node-taint-NodeWithImpairedVolumes/
expr: kube_node_spec_taint{key="NodeWithImpairedVolumes"}
expr: kube_node_spec_taint{key="NodeWithImpairedVolumes"} > 0
for: 30m
labels:
area: kaas
severity: notify
team: {{ include "providerTeam" . }}
team: phoenix
topic: kubernetes
{{- end }}
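The appended `> 0` turns a bare existence check into an explicit value check: `kube_node_spec_taint` reports 1 while the taint is present, and without the comparison the alert would stay pending for any returned sample regardless of its value. A sketch of the two forms (illustration only, not part of the rendered rule file):

# bare selector: matches whenever the series exists, whatever its value
kube_node_spec_taint{key="NodeWithImpairedVolumes"}

# explicit comparison: matches only while the sample value is positive
kube_node_spec_taint{key="NodeWithImpairedVolumes"} > 0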

@@ -1,23 +1,25 @@
{{- if eq .Values.managementCluster.provider.kind "aws" }}
# This rule applies to vintage aws and capa workload clusters
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
creationTimestamp: null
labels:
{{- include "labels.common" . | nindent 4 }}
# No need for .Values.mimir.enabled condition - will be gone with Vintage
{{- if not .Values.mimir.enabled }}
cluster_type: "workload_cluster"
{{- end }}
name: aws.workload-cluster.rules
namespace: {{ .Values.namespace }}
spec:
groups:
- name: aws
- name: aws.workload-cluster
rules:
- alert: WorkloadClusterContainerIsRestartingTooFrequentlyAWS
annotations:
description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}'
opsrecipe: container-is-restarting-too-often/
expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-plugin.*|aws-pod-identity-webhook.*|etcd-kubernetes-resources-count-exporter.*"}[1h]),"service","/","namespace","pod") > 10
## TODO Review this list once all vintage installations are gone
expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"aws-node.*|kiam-agent.*|kiam-server.*|ebs-(plugin|csi).*|aws-pod-identity-webhook.*"}[1h]), "service", "/", "namespace", "pod") > 10
for: 10m
labels:
area: kaas
@@ -29,68 +31,42 @@ spec:
severity: page
team: phoenix
topic: kubernetes
- alert: WorkloadClusterCriticalPodNotRunningAWS
- alert: WorkloadClusterPodPendingAWS
annotations:
description: '{{`Critical pod {{ $labels.namespace }}/{{ $labels.pod }} is not running.`}}'
opsrecipe: critical-pod-is-not-running/
expr: kube_pod_container_status_running{namespace="kube-system",container=~"(k8s-api-server|k8s-controller-manager|k8s-scheduler)"} != 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-api-server"}), "pod", "$1", "container", "(.+)") == 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-controller-manager"}), "pod", "$1", "container", "(.+)") == 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system",container="k8s-scheduler"}), "pod", "$1", "container", "(.+)") == 1
for: 20m
description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending.`}}'
opsrecipe: pod-stuck-in-pending/
## TODO Review this list once all vintage installations are gone
expr: kube_pod_status_phase{namespace="kube-system",pod=~"(aws-node.*|kiam-agent.*|kiam-server.*|ebs-(plugin|csi).*)", phase="Pending"} == 1
for: 15m
labels:
area: kaas
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_kube_state_metrics_down: "true"
cancel_if_cluster_has_no_workers: "true"
severity: page
team: phoenix
topic: kubernetes
- alert: WorkloadClusterControlPlaneNodeMissingAWS
annotations:
description: '{{`Control plane node is missing.`}}'
opsrecipe: master-node-missing/
expr: count by (cluster_id) (kubernetes_build_info{app="kubelet"} unless on (node) kube_node_role{role!~"control-plane|master"}) == 0
for: 30m
labels:
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
control_plane_node_down: "true"
severity: page
team: phoenix
topic: kubernetes
- alert: WorkloadClusterHAControlPlaneDownForTooLong
annotations:
description: '{{`Control plane node in HA setup is down for a long time.`}}'
opsrecipe: master-node-missing/
expr: sum by (cluster_id) (kubernetes_build_info{app="kubelet"} * on (node) kube_node_role{role="control-plane"}) == 2 or sum by (cluster_id) (kubernetes_build_info{app="kubelet"} * on (node) kube_node_role{role="master"}) == 2
for: 30m
labels:
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: "true"
control_plane_node_down: "true"
severity: page
team: phoenix
topic: kubernetes
- alert: WorkloadClusterPodPendingAWS
{{- if eq .Values.managementCluster.provider.kind "aws" }}
## TODO Remove when all vintage installations are gone
- alert: WorkloadClusterCriticalPodNotRunningAWS
annotations:
description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending.`}}'
opsrecipe: pod-stuck-in-pending/
expr: kube_pod_status_phase{namespace="kube-system",pod=~"(aws-node.*|kiam-agent.*|kiam-server.*|cluster-autoscaler.*|ebs-csi-.*)",phase="Pending"} == 1
for: 15m
description: '{{`Critical pod {{ $labels.namespace }}/{{ $labels.pod }} is not running.`}}'
opsrecipe: critical-pod-is-not-running/
expr: kube_pod_container_status_running{namespace="kube-system", container=~"(k8s-api-server|k8s-controller-manager|k8s-scheduler)"} != 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system", container="k8s-api-server"}), "pod", "$1", "container", "(.+)") == 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system", container="k8s-controller-manager"}), "pod", "$1", "container", "(.+)") == 1 or label_replace(absent(kube_pod_container_status_running{namespace="kube-system", container="k8s-scheduler"}), "pod", "$1", "container", "(.+)") == 1
for: 20m
labels:
area: kaas
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_kube_state_metrics_down: "true"
cancel_if_cluster_has_no_workers: "true"
severity: page
team: phoenix
topic: kubernetes
- alert: WorkloadClusterAWSCNIIpAlmostExhausted
annotations:
description: '{{`IPs exhausted for aws-cni subnet {{ $labels.id }} in AZ {{ $labels.availabvility_zone }}.`}}'
description: '{{`IPs exhausted for aws-cni subnet {{ $labels.id }} in AZ {{ $labels.availability_zone }}.`}}'
opsrecipe: aws-ips-exhausted/
expr: min(aws_operator_subnet_available_ips_percentage{subnet_type="aws-cni"}) by (account, availability_zone, cluster_id, id) < 0.1
for: 5m
@@ -111,4 +87,4 @@ spec:
severity: page
team: phoenix
topic: workloadcluster
{{- end }}
{{- end }}
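As a sketch of how the reworked pending-pod alert could be unit-tested with promtool (not part of this commit; the rule file name and pod name are hypothetical):

rule_files:
  - aws.workload-cluster.rules.yml

tests:
  - interval: 1m
    input_series:
      # an ebs-csi pod reported as Pending for the whole window
      - series: 'kube_pod_status_phase{namespace="kube-system", pod="ebs-csi-node-x1z", phase="Pending"}'
        values: '1x20'
    alert_rule_test:
      - eval_time: 16m # just past the 15m "for" duration
        alertname: WorkloadClusterPodPendingAWS
        exp_alerts:
          - exp_labels:
              namespace: kube-system
              pod: ebs-csi-node-x1z
              phase: Pending
              area: kaas
              severity: page
              team: phoenix
              topic: kubernetes
              cancel_if_outside_working_hours: "true"
              cancel_if_cluster_status_creating: "true"
              cancel_if_cluster_status_deleting: "true"
              cancel_if_cluster_status_updating: "true"
              cancel_if_kube_state_metrics_down: "true"
              cancel_if_cluster_has_no_workers: "true"
            exp_annotations:
              description: 'Pod kube-system/ebs-csi-node-x1z is stuck in Pending.'
              opsrecipe: pod-stuck-in-pending/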
@@ -1,4 +1,5 @@
{{- if eq .Values.managementCluster.provider.kind "capa" }}
# This rule applies to capa management clusters only
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
@@ -12,13 +13,13 @@ metadata:
namespace: {{ .Values.namespace }}
spec:
groups:
- name: capa
- name: capa.management-cluster
rules:
- alert: ManagementClusterPodPendingCAPA
annotations:
description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending.`}}'
opsrecipe: pod-stuck-in-pending/
expr: kube_pod_status_phase{namespace="giantswarm", pod=~"(aws.*|capa.*|irsa-operator.*)",phase="Pending", cluster_type="management_cluster"} == 1
expr: kube_pod_status_phase{namespace="giantswarm", provider="capa", pod=~"(aws.*|capa.*|irsa-operator.*)", phase="Pending", cluster_type="management_cluster"} == 1
for: 15m
labels:
area: kaas
@@ -48,11 +49,11 @@ spec:
description: '{{`Deployment {{ $labels.deployment }} is missing.`}}'
opsrecipe: management-cluster-deployment-is-missing/
expr: |
absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="aws-resolver-rules-operator", cluster_type="management_cluster"})
or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="aws-vpc-operator", cluster_type="management_cluster"})
or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="capa-controller-manager", cluster_type="management_cluster"})
or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="capa-iam-operator", cluster_type="management_cluster"})
or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="irsa-operator", cluster_type="management_cluster"})
absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="aws-resolver-rules-operator", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="aws-vpc-operator", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="capa-controller-manager", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="capa-iam-operator", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
or absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="irsa-operator", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"})
for: 15m
labels:
area: kaas
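Scoping each `absent()` clause with `cluster_id`, `installation`, `provider` and `pipeline` matters once these rules run in Mimir, where a single store holds series from several installations: an unscoped `absent()` would be satisfied by any installation's series and could mask a deployment missing on this one. For a hypothetical management cluster named `golem` on the `testing` pipeline, the first clause renders to:

absent(kube_deployment_status_condition{namespace="giantswarm", condition="Available", deployment="aws-resolver-rules-operator", cluster_id="golem", installation="golem", provider="capa", pipeline="testing"})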
@@ -1,4 +1,3 @@
{{- if (eq .Values.managementCluster.provider.kind "capz") }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
@@ -23,20 +22,19 @@ spec:
area: kaas
cancel_if_outside_working_hours: {{include "workingHoursOnly" .}}
severity: notify
team: {{include "providerTeam" .}}
team: phoenix
topic: managementcluster
- alert: AzureDNSOperatorAPIErrorRate
annotations:
description: |-
{{`Error rate for {{ $labels.method }} is high. Check dns-operator-azure logs in installation/{{ $labels.installation }}.`}}
opsrecipe: dns-operator-azure/
expr: |-
sum by (cluster_id, method, installation) (rate(dns_operator_azure_api_request_errors_total[5m])) > 0
sum by (cluster_id, installation, method, pipeline, provider) (rate(dns_operator_azure_api_request_errors_total[5m])) > 0
for: 15m
labels:
area: kaas
cancel_if_outside_working_hours: {{include "workingHoursOnly" .}}
severity: notify
team: {{include "providerTeam" .}}
team: phoenix
topic: managementcluster
{{- end }}
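Widening the aggregation to `sum by (cluster_id, installation, method, pipeline, provider)` keeps the labels that alert routing and inhibition depend on from being summed away under Mimir. A sketch of the kind of series the expression then yields (label values are hypothetical):

{cluster_id="mycapz", installation="mycapz", method="RecordSetsClient.CreateOrUpdate", pipeline="testing", provider="capz"} 0.25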
@@ -162,4 +162,17 @@ spec:
severity: page
team: phoenix
topic: kubernetes
- name: aws-jobs
rules:
- alert: JobHasNotBeenScheduledForTooLong
annotations:
description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 2 hours.`}}'
opsrecipe: job-has-not-been-scheduled-for-too-long/
expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="route53-manager"}) > 7200
for: 15m
labels:
area: kaas
severity: page
team: phoenix
topic: managementcluster
{{- end }}
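A quick worked example of the 7200-second threshold (hypothetical timestamps): if route53-manager last scheduled at 10:00:00 UTC and the rule is evaluated at 12:35:00 UTC, then

# time() - kube_cronjob_status_last_schedule_time
#   = 1718195700 - 1718186400 = 9300s > 7200s (2 hours)
# so the alert goes pending, and pages once this has held for 15m
(time() - kube_cronjob_status_last_schedule_time{cronjob="route53-manager"}) > 7200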
@@ -27,6 +27,35 @@ spec:
severity: notify
team: {{ include "providerTeam" . }}
topic: kubernetes
- alert: WorkloadClusterControlPlaneNodeMissing
annotations:
description: '{{`Control plane node is missing.`}}'
opsrecipe: master-node-missing/
expr: count by (cluster_id, installation, pipeline, provider) (kubernetes_build_info{app="kubelet"} unless on (node) kube_node_role{role!~"control-plane|master"}) == 0
for: 30m
labels:
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
control_plane_node_down: "true"
severity: page
team: {{ include "providerTeam" . }}
topic: kubernetes
- alert: WorkloadClusterHAControlPlaneDownForTooLong
annotations:
description: '{{`Control plane node in HA setup is down for a long time.`}}'
opsrecipe: master-node-missing/
expr: sum by (cluster_id, installation, pipeline, provider) (kubernetes_build_info{app="kubelet"} * on (node) kube_node_role{role="control-plane"}) == 2 or sum by (cluster_id, installation, pipeline, provider) (kubernetes_build_info{app="kubelet"} * on (node) kube_node_role{role="master"}) == 2
for: 30m
labels:
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: "true"
control_plane_node_down: "true"
severity: page
team: {{ include "providerTeam" . }}
topic: kubernetes
- alert: NodeStateFlappingUnderLoad
# Check if the kubelet status is flapping, unless the node is under load.
# It helps to read this rule from the bottom upwards.
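A concrete reading of the HA expression above (hypothetical three-node control plane, for illustration only): `kubernetes_build_info{app="kubelet"} * on (node) kube_node_role{role="control-plane"}` yields one series per control-plane node whose kubelet is still reporting, so a per-cluster sum of exactly 2 that holds for 30m means one of the three nodes has been down the whole time:

# nodes m1 and m3 report kubelet build info, m2 does not:
sum by (cluster_id, installation, pipeline, provider) (
  kubernetes_build_info{app="kubelet"} * on (node) kube_node_role{role="control-plane"}
) # == 2 for 30m -> WorkloadClusterHAControlPlaneDownForTooLong pages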
@@ -0,0 +1,48 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
creationTimestamp: null
labels:
{{- include "labels.common" . | nindent 4 }}
{{- if not .Values.mimir.enabled }}
cluster_type: "workload_cluster"
{{- end }}
name: pods.core.rules
namespace: {{ .Values.namespace }}
spec:
groups:
- name: pods.core
rules:
- alert: ContainerIsRestartingTooFrequently
annotations:
description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}'
opsrecipe: container-is-restarting-too-often/
expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"cluster-autoscaler.*|etcd-kubernetes-resources-count-exporter.*"}[1h]), "service", "/", "namespace", "pod") > 10
for: 10m
labels:
area: kaas
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
cancel_if_cluster_has_no_workers: "true"
severity: page
team: {{ include "providerTeam" . }}
topic: kubernetes
- alert: PodPending
annotations:
description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending.`}}'
opsrecipe: pod-stuck-in-pending/
expr: kube_pod_status_phase{namespace="kube-system",pod=~"(cluster-autoscaler.*)",phase="Pending"} == 1
for: 15m
labels:
area: kaas
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_kube_state_metrics_down: "true"
cancel_if_cluster_has_no_workers: "true"
severity: page
team: {{ include "providerTeam" . }}
topic: kubernetes
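
Note that `label_join` here manufactures the `service` label rather than copying an existing one: it concatenates `namespace` and `pod` with a `/`. A sketch with a hypothetical series (illustration only):

# 1h increase for a restarting container:
#   increase(kube_pod_container_status_restarts_total{container="cluster-autoscaler", namespace="kube-system", pod="cluster-autoscaler-7d9"}[1h]) = 12
# after label_join(..., "service", "/", "namespace", "pod"):
#   {container="cluster-autoscaler", namespace="kube-system", pod="cluster-autoscaler-7d9", service="kube-system/cluster-autoscaler-7d9"} = 12
# 12 > 10 starts the 10m "for" timer on ContainerIsRestartingTooFrequently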
1 change: 1 addition & 0 deletions test/conf/promtool_ignore
@@ -42,6 +42,7 @@ kaas/turtles/alerting-rules/net-exporter.rules.yml
kaas/turtles/alerting-rules/node-exporter.rules.yml
kaas/turtles/alerting-rules/node.management-cluster.rules.yml
kaas/turtles/alerting-rules/node.workload-cluster.rules.yml
kaas/turtles/alerting-rules/pods.core.rules.yml
kaas/turtles/alerting-rules/release.rules.yml
kaas/turtles/alerting-rules/storage.management-cluster.rules.yml
kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml
