Commit
Merge branch 'main' into start-reviewing-phoenix-alerts
QuentinBisson authored Jun 6, 2024
2 parents ba55ada + e583b80 commit c1999e2
Showing 29 changed files with 90 additions and 77 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -10,6 +10,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- Added a new alerting rule to `falco.rules.yml` to fire an alert for XZ-backdoor.
- Added the `CiliumAPITooSlow` alert.

### Changed

- Reviewed phoenix alerts in preparation for Mimir.
- Moved cluster-autoscaler and VPA alerts to team turtles.

### Fixed

- Fixed cabbage alerts for multi-provider workload clusters (WCs).

### Removed

- Removed scrape timeout inhibition leftovers (documentation and labels).

## [4.1.2] - 2024-05-31

4 changes: 2 additions & 2 deletions README.md
@@ -302,9 +302,9 @@ In order for Alertmanager inhibition to work we need 3 elements:
- an Inhibition definition mapping source labels to target labels in the alertmanager config file
- an Alert rule with some target labels
An alert carrying a target label is inhibited whenever the condition named by that label is fulfilled. This is why target label names are usually prefixed with "cancel_if_" (e.g. "cancel_if_scrape_timeout").
An alert carrying a target label is inhibited whenever the condition named by that label is fulfilled. This is why target label names are usually prefixed with "cancel_if_" (e.g. "cancel_if_outside_working_hours").
An alert with a source label defines the conditions under which the target label takes effect. For example, if an alert with the "scrape_timeout" label fires, all other alerts carrying the corresponding target label, i.e. "cancel_if_scrape_timeout", are inhibited.
An alert with a source label defines the conditions under which the target label takes effect. For example, if an alert with the "outside_working_hours" label fires, all other alerts carrying the corresponding target label, i.e. "cancel_if_outside_working_hours", are inhibited.
This is possible thanks to the alertmanager config file stored in the Prometheus-Meta-operator which defines the target/source labels coupling.
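For illustration, a minimal Alertmanager inhibition rule coupling these labels could look like the sketch below. This is an assumption-laden example, not the actual operator-generated configuration: the syntax follows Alertmanager's `inhibit_rules`, and the `equal` clause on `cluster_id` is a guess at how alerts are scoped.

```yaml
# Sketch only: couples the source label "outside_working_hours" to the
# target label "cancel_if_outside_working_hours". The real rules are
# generated by the Prometheus-Meta-operator and may differ.
inhibit_rules:
  - source_matchers:
      - outside_working_hours="true"
    target_matchers:
      - cancel_if_outside_working_hours="true"
    # Assumption: only inhibit alerts from the same cluster as the source alert.
    equal:
      - cluster_id
```

While an alert carrying the `outside_working_hours="true"` label is firing, every alert labelled `cancel_if_outside_working_hours="true"` (and sharing the same `cluster_id`) is inhibited rather than routed to receivers.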
8 changes: 0 additions & 8 deletions helm/prometheus-rules/templates/_helpers.tpl
@@ -53,14 +53,6 @@ true
{{- end -}}
{{- end -}}

{{- define "isClusterServiceInstalled" -}}
{{ not (eq .Values.managementCluster.provider.flavor "capi") }}
{{- end -}}

{{- define "isVaultBeingMonitored" -}}
{{ not (eq .Values.managementCluster.provider.flavor "capi") }}
{{- end -}}

{{- define "isBastionBeingMonitored" -}}
{{ not (eq .Values.managementCluster.provider.flavor "capi") }}
{{- end -}}
@@ -1,3 +1,4 @@
## TODO Remove with vintage
{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
@@ -21,7 +22,7 @@ spec:
area: kaas
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
severity: notify
team: {{ include "providerTeam" . }}
team: phoenix
topic: kubernetes
- alert: CalicoNodeMemoryHighUtilization
annotations:
@@ -36,6 +37,6 @@ spec:
area: kaas
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
severity: notify
team: {{ include "providerTeam" . }}
team: phoenix
topic: kubernetes
{{- end }}
@@ -1,4 +1,5 @@
{{- if eq (include "isClusterServiceInstalled" .) "true" }}
## TODO Remove with vintage
{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
@@ -23,7 +24,7 @@ spec:
labels:
area: storage
severity: page
team: {{ include "providerTeam" . }}
team: phoenix
topic: managementcluster
{{- if eq .Values.managementCluster.pipeline "testing" }}
- alert: TestClusterTooOld
@@ -33,5 +34,6 @@ spec:
for: 5m
labels:
severity: notify
team: phoenix
{{- end }}
{{- end }}
@@ -1,3 +1,4 @@
## TODO Remove with vintage
{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
# newer clusters don't use docker anymore
apiVersion: monitoring.coreos.com/v1
@@ -22,6 +23,6 @@ spec:
area: kaas
cancel_if_outside_working_hours: "true"
severity: page
team: {{ include "providerTeam" . }}
team: phoenix
topic: observability
{{- end }}
@@ -1,3 +1,5 @@
## TODO Remove with vintage
# This rule applies to vintage aws clusters
{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
@@ -1,4 +1,5 @@
{{- if eq (include "isVaultBeingMonitored" .) "true" }}
## TODO Remove with vintage
{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
@@ -23,7 +24,7 @@ spec:
area: kaas
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
severity: page
team: {{ include "providerTeam" . }}
team: phoenix
topic: vault
- alert: VaultIsSealed
annotations:
@@ -35,7 +36,7 @@ spec:
area: kaas
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
severity: page
team: {{ include "providerTeam" . }}
team: phoenix
topic: vault
- alert: ClusterServiceVaultTokenAlmostExpired
annotations:
@@ -47,7 +48,7 @@ spec:
area: kaas
cancel_if_outside_working_hours: "true"
severity: page
team: {{ include "providerTeam" . }}
team: phoenix
topic: vault
- alert: ClusterServiceVaultTokenAlmostExpiredMissing
annotations:
@@ -60,7 +61,7 @@ spec:
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
cancel_if_prometheus_agent_down: "true"
severity: page
team: {{ include "providerTeam" . }}
team: phoenix
topic: vault
- alert: CertOperatorVaultTokenAlmostExpired
annotations:
@@ -72,7 +73,7 @@ spec:
area: kaas
cancel_if_outside_working_hours: "true"
severity: page
team: {{ include "providerTeam" . }}
team: phoenix
topic: vault
- alert: CertOperatorVaultTokenAlmostExpiredMissing
annotations:
@@ -84,7 +85,7 @@ spec:
area: kaas
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
severity: page
team: {{ include "providerTeam" . }}
team: phoenix
topic: vault
- alert: VaultLatestEtcdBackupTooOld
annotations:
@@ -96,7 +97,7 @@ spec:
area: kaas
cancel_if_outside_working_hours: "true"
severity: page
team: {{ include "providerTeam" . }}
team: phoenix
topic: vault
- alert: VaultLatestEtcdBackupMetricsMissing
annotations:
@@ -108,7 +109,6 @@ spec:
area: kaas
cancel_if_outside_working_hours: "true"
severity: page
team: {{ include "providerTeam" . }}
team: phoenix
topic: vault

{{- end }}
@@ -1,4 +1,5 @@
# This rule applies to all capi management clusters
{{- if eq .Values.managementCluster.provider.flavor "capi" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
@@ -19,10 +20,11 @@ spec:
description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}'
expr: |-
capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="ControlPlaneComponentsHealthy", status="False"} == 1
or capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="EtcdClusterHealthy", status="False"} == 1
or capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="Available", status="False"} == 1
or capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="EtcdClusterHealthy", status="False"} == 1
or capi_kubeadmcontrolplane_status_condition{cluster_type="management_cluster", type="Available", status="False"} == 1
labels:
area: kaas
cluster_control_plane_unhealthy: "true"
team: turtles
topic: status
{{- end }}
@@ -1,3 +1,4 @@
# This rule applies to all clusters
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
@@ -8,15 +9,14 @@ metadata:
namespace: {{ .Values.namespace }}
spec:
groups:
- name: inhibit.all
- name: inhibit.kubelet
rules:
- alert: InhibitionKubeletDown
annotations:
description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}'
expr: label_replace(up{app="kubelet"}, "ip", "$1", "instance", "(.+):\\d+") == 0
labels:
kubelet_down: "true"
area: kaas
topic: kubernetes
team: turtles
annotations:
description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}'

@@ -24,8 +24,7 @@ spec:
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: turtles
topic: observability
topic: autoscaling
@@ -46,7 +46,6 @@ spec:
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
@@ -63,7 +62,6 @@ spec:
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
@@ -81,7 +79,6 @@ spec:
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
@@ -43,7 +43,6 @@ spec:
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
cancel_if_cluster_has_no_workers: "true"
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
severity: page
@@ -21,7 +21,6 @@ spec:
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
@@ -17,7 +17,7 @@ spec:
expr: avg(cilium_bpf_map_pressure) by (cluster_id, installation, pipeline, provider, map_name) * 100 > 80
for: 15m
labels:
area: managedservices
area: platform
cancel_if_outside_working_hours: "true"
severity: page
team: cabbage
@@ -29,7 +29,19 @@ spec:
expr: avg(cilium_bpf_map_pressure) by (cluster_id, installation, pipeline, provider, map_name) * 100 > 95
for: 15m
labels:
area: managedservices
area: platform
severity: page
team: cabbage
topic: cilium
- alert: CiliumAPITooSlow
annotations:
description: '{{`Cilium API processing time is >50s pod="{{ $labels.pod }}" node="{{ $labels.node }}" method="{{ $labels.method }}" path="{{ $labels.path }}"`}}'
opsrecipe: cilium-performance-issues/#slow-cilium-api
expr: avg(rate(cilium_agent_api_process_time_seconds_sum{}[5m])/rate(cilium_agent_api_process_time_seconds_count{}[5m]) > 50) by (cluster_id, node, pod, method, path, installation, pipeline, provider)
for: 20m
labels:
area: platform
cancel_if_outside_working_hours: "true"
severity: page
team: cabbage
topic: cilium
@@ -42,7 +54,7 @@ spec:
expr: max(rate(cilium_policy_change_total{outcome=~"fail.*"}[20m]) OR rate(cilium_policy_import_errors_total[20m])) by (cluster_id, installation, pipeline, provider) > 0
for: 10m
labels:
area: managedservices
area: platform
cancel_if_outside_working_hours: "true"
severity: page
team: cabbage
@@ -18,7 +18,7 @@ spec:
sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) by (cluster_id, deployment, installation, namespace, pipeline, provider) / (sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) by (cluster_id, deployment, installation, namespace, pipeline, provider) + sum(kube_deployment_status_replicas_unavailable{deployment=~"coredns.*"}) by (cluster_id, deployment, installation, namespace, pipeline, provider))* 100 < 51
for: 10m
labels:
area: empowerment
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
@@ -41,7 +41,7 @@ spec:
)
for: 120m
labels:
area: empowerment
area: platform
cancel_if_outside_working_hours: "true"
severity: page
team: cabbage
@@ -1,4 +1,3 @@
{{- if (eq .Values.managementCluster.provider.kind "aws") }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
@@ -18,10 +17,10 @@ spec:
annotations:
description: '{{`external-dns in namespace {{ $labels.namespace }} can''t access registry (cloud service provider DNS service).`}}'
opsrecipe: external-dns-cant-access-registry/
expr: rate(external_dns_registry_errors_total[2m]) > 0
expr: rate(external_dns_registry_errors_total{provider=~"aws|capa|capz|eks"}[2m]) > 0
for: 15m
labels:
area: managedservices
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
@@ -33,10 +32,10 @@ spec:
annotations:
description: '{{`external-dns in namespace {{ $labels.namespace }} can''t access source (Service or Ingress resource).`}}'
opsrecipe: external-dns-cant-access-source/
expr: rate(external_dns_source_errors_total[2m]) > 0
expr: rate(external_dns_source_errors_total{provider=~"aws|capa|capz|eks"}[2m]) > 0
for: 15m
labels:
area: managedservices
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
@@ -48,10 +47,10 @@ spec:
annotations:
description: '{{`external-dns in namespace {{ $labels.namespace }} is down.`}}'
opsrecipe: external-dns-down/
expr: label_replace(up{app=~"external-dns-(app|monitoring)"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0
expr: label_replace(up{app=~"external-dns-(app|monitoring)", provider=~"aws|capa|capz|eks"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0
for: 15m
labels:
area: managedservices
area: platform
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -60,4 +59,3 @@ spec:
severity: page
team: cabbage
topic: external-dns
{{- end }}