Prepare alerts for mimir migration (#1060)
* Prepare alerts for mimir migration

* Reduce grouping to cluster-id only

* Update helm/prometheus-rules/templates/alerting-rules/dex.rules.yml

Co-authored-by: Hervé Nicol <[email protected]>

* Update helm/prometheus-rules/templates/alerting-rules/dns-operator-azure.rules.yml

Co-authored-by: Hervé Nicol <[email protected]>

* add missing cluster_ids

* add missing cluster_ids again

* add missing cluster_ids again

* Fix KSM and prometheus-agent alerts

* Fix failing tests

---------

Co-authored-by: Hervé Nicol <[email protected]>
QuentinBisson and hervenicol authored Mar 11, 2024
1 parent a81bd4c commit 4da1fd3
Showing 37 changed files with 369 additions and 93 deletions.
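Most of the hunks below follow one pattern: every aggregation (sum by, count by, avg by, min by) and every vector-matching clause (on, group_left) gains a cluster_id label, so that once all clusters ship their metrics into a shared Mimir backend, each cluster keeps alerting on its own series instead of being collapsed into a single cross-cluster result. A few hunks additionally adjust Helm conditionals around provider-specific alerts. A minimal before/after sketch of the main pattern, based on the first apiserver rule below with its label selectors trimmed for brevity:

# Before: one latency series per verb, aggregated across every cluster
histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket[1h])) by (verb, le)) > 1

# After: one series per (cluster_id, verb), so each cluster is evaluated on its own
histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket[1h])) by (cluster_id, verb, le)) > 1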

@@ -20,7 +20,7 @@ spec:
annotations:
description: '{{`Kubernetes API Server {{ $labels.verb }} request latency is too high.`}}'
opsrecipe: apiserver-overloaded/
-expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{cluster_type="management_cluster", verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by (verb, le)) > 1
+expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{cluster_type="management_cluster", verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by (cluster_id, verb, le)) > 1
for: 1h
labels:
area: kaas

@@ -19,7 +19,7 @@ spec:
annotations:
description: '{{`Kubernetes API Server {{ $labels.verb }} request latency is too high.`}}'
opsrecipe: apiserver-overloaded/
-expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by (verb, le)) > 1
+expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{cluster_type="workload_cluster", verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by (cluster_id, verb, le)) > 1
for: 1h
labels:
area: kaas
@@ -35,7 +35,7 @@ spec:
annotations:
description: '{{`Kubernetes API Server {{ $labels.cluster_id }} having admission webhook errors.`}}'
opsrecipe: apiserver-admission-webhook-errors/
-expr: rate(apiserver_admission_webhook_rejection_count{error_type=~"calling_webhook_error|apiserver_internal_error"}[5m]) > 0
+expr: rate(apiserver_admission_webhook_rejection_count{cluster_type="workload_cluster", error_type=~"calling_webhook_error|apiserver_internal_error"}[5m]) > 0
for: 5m
labels:
area: kaas

@@ -1,3 +1,4 @@
+{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
@@ -37,3 +38,4 @@ spec:
severity: notify
team: {{ include "providerTeam" . }}
topic: kubernetes
+{{- end }}

@@ -17,7 +17,7 @@ spec:
If memory usage value is equal to memory limit value then it is likely the pod will be evicted.
If no limits are set then the pod will burst.
`}}
-expr: sum by (pod, namespace, container) (container_memory_working_set_bytes{container=~"(cert-manager)"}) / sum by (pod, namespace, container) (kube_pod_container_resource_requests_memory_bytes{container=~"(cert-manager)"}) >= 0.85
+expr: sum by (cluster_id, pod, namespace, container) (container_memory_working_set_bytes{container=~"(cert-manager|cert-manager-app-controller)"}) / sum by (cluster_id, pod, namespace, container) (kube_pod_container_resource_requests{resource="memory", unit="byte",container=~"(cert-manager|cert-manager-app-controller)"}) >= 0.85
for: 10m
labels:
area: kaas

@@ -10,6 +10,7 @@ spec:
groups:
- name: certificate.all
rules:
+{{- if eq .Values.managementCluster.provider.flavor "vintage" }}
- alert: KiamCertificateSecretWillExpireInLessThanTwoWeeks
annotations:
description: '{{`Kiam Certificate stored in Secret {{ $labels.namespace }}/{{ $labels.name }} on {{ $labels.cluster_id }} will expire in less than two weeks.`}}'
@@ -22,6 +23,7 @@ spec:
severity: page
team: phoenix
topic: cert-manager
+{{- end }}
- alert: IRSACertificateSecretWillExpireInLessThanTwoWeeks
annotations:
description: '{{`IRSA Pod Identity Webhook Certificate stored in Secret {{ $labels.namespace }}/{{ $labels.name }} on {{ $labels.cluster_id }} will expire in less than two weeks.`}}'

@@ -17,7 +17,7 @@ spec:
annotations:
description: '{{`Cannot renew Certificate for Secret {{ $labels.namespace }}/{{ $labels.certificatename }} because it is missing.`}}'
opsrecipe: managed-app-cert-manager/missing-certificate-for-secret/
-expr: count(cert_exporter_secret_not_after{cluster_type="management_cluster", secretkey="tls.crt", certificatename!=""}) by (certificatename,namespace) unless count(label_replace(cert_exporter_certificate_cr_not_after{cluster_type="management_cluster"}, "certificatename", "$1", "name", "(.*)")) by (certificatename,namespace)
+expr: count(cert_exporter_secret_not_after{cluster_type="management_cluster", secretkey="tls.crt", certificatename!=""}) by (cluster_id, certificatename, namespace) unless count(label_replace(cert_exporter_certificate_cr_not_after{cluster_type="management_cluster"}, "certificatename", "$1", "name", "(.*)")) by (cluster_id, certificatename,namespace)
for: 5m
labels:
area: kaas

@@ -14,7 +14,7 @@ spec:
annotations:
description: '{{`Cilium BPF map is about to fill up.`}}'
opsrecipe: cilium-bpf-map/
-expr: avg(cilium_bpf_map_pressure) by (map_name) * 100 > 80
+expr: avg(cilium_bpf_map_pressure) by (cluster_id, map_name) * 100 > 80
for: 15m
labels:
area: kaas
@@ -26,7 +26,7 @@ spec:
annotations:
description: '{{`Cilium BPF map is about filled up.`}}'
opsrecipe: cilium-bpf-map/
-expr: avg(cilium_bpf_map_pressure) by (map_name) * 100 > 95
+expr: avg(cilium_bpf_map_pressure) by (cluster_id, map_name) * 100 > 95
for: 15m
labels:
area: kaas

17 changes: 15 additions & 2 deletions helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml
@@ -14,7 +14,8 @@ spec:
annotations:
description: '{{`CoreDNS Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
opsrecipe: core-dns-deployment-not-satisfied/
-expr: sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) / (sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) + sum(kube_deployment_status_replicas_unavailable{deployment=~"coredns.*"}))* 100 < 51
+expr: |
+sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) by (cluster_id) / (sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) by (cluster_id) + sum(kube_deployment_status_replicas_unavailable{deployment=~"coredns.*"}) by (cluster_id))* 100 < 51
for: 10m
labels:
area: empowerment
@@ -25,7 +26,19 @@ spec:
team: cabbage
topic: dns
- alert: CoreDNSMaxHPAReplicasReached
-expr: kube_hpa_status_current_replicas{hpa="coredns"} == kube_hpa_spec_max_replicas{hpa="coredns"} AND kube_hpa_spec_min_replicas{hpa="coredns"} != kube_hpa_spec_max_replicas{hpa="coredns"}
+expr: |
+(
+# This is using the deprecated HPA metric names https://github.com/kubernetes/kube-state-metrics/commit/eb01334f2d03ebc3ab25cd7b29d0ff28f6ca5ee0
+# TODO(@team-cabbage) remove once kube-state-metrics is updated to use the new metric names everywhere
+kube_hpa_status_current_replicas{hpa="coredns"} == kube_hpa_spec_max_replicas{hpa="coredns"}
+and
+kube_hpa_spec_min_replicas{hpa="coredns"} != kube_hpa_spec_max_replicas{hpa="coredns"}
+) or (
+# This is using the new HPA metric names
+kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="coredns"} == kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="coredns"}
+and
+kube_horizontalpodautoscaler_spec_min_replicas{horizontalpodautoscaler="coredns"} != kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="coredns"}
+)
for: 120m
labels:
area: empowerment
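A note on the HPA expression above: kube-state-metrics v2 renamed the kube_hpa_* series to kube_horizontalpodautoscaler_*, which is why the rule matches both families until every cluster runs the newer version. One way to check which family a given cluster still exposes is a query along these lines (illustrative only, not part of this change):

# Lists, per cluster, which of the two HPA spec metrics is currently scraped
count by (cluster_id, __name__) ({__name__=~"kube_hpa_spec_max_replicas|kube_horizontalpodautoscaler_spec_max_replicas"})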

@@ -32,7 +32,7 @@ spec:
annotations:
description: '{{`Too many tags are not synchronised to registry mirrors.`}}'
opsrecipe: crsync-too-many-tags-missing/
-expr: crsync_sync_tags_total{registry="quay.io"} - on (repository,app) group_left sum by(repository,app) (crsync_sync_tags_total{registry!="quay.io"}) > 0
+expr: crsync_sync_tags_total{registry="quay.io"} - on (cluster_id, repository, app) group_left sum by(cluster_id, repository, app) (crsync_sync_tags_total{registry!="quay.io"}) > 0
for: 1h
labels:
area: kaas

@@ -17,22 +17,7 @@ spec:
annotations:
description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
opsrecipe: deployment-not-satisfied/
-expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*", cluster_id!~"argali|giraffe"} > 0
-for: 30m
-labels:
-area: kaas
-cancel_if_cluster_status_creating: "true"
-cancel_if_cluster_status_deleting: "true"
-cancel_if_cluster_status_updating: "true"
-cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }}
-severity: page
-team: atlas
-topic: managementcluster
-- alert: DeploymentNotSatisfiedChinaAtlas
-annotations:
-description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}'
-opsrecipe: deployment-not-satisfied/
-expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*", cluster_id=~"argali|giraffe"} > 0
+expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*"} > 0
for: 30m
labels:
area: kaas

@@ -29,7 +29,7 @@ spec:
annotations:
description: '{{`dex-operator failed to renew secret of {{ $labels.app_registration_name }} for {{ $labels.app_owner }} on provider {{ $labels.provider_type }}.`}}'
opsrecipe: dex-operator/
-expr: min by(app_registration_name, app_owner, app_namespace, provider_name, provider_type, installation) (aggregation:dex_operator_idp_secret_expiry_time{cluster_type="management_cluster"}) - time() < 60*60*12
+expr: min by(app_registration_name, app_owner, app_namespace, provider_name, provider_type, installation, cluster_id) (aggregation:dex_operator_idp_secret_expiry_time{cluster_type="management_cluster"}) - time() < 60*60*12
for: 30m
labels:
area: kaas

@@ -16,7 +16,7 @@ spec:
opsrecipe: dns-operator-azure/
expr: |-
capi_cluster_status_phase{phase="Provisioned"}
-unless on (name)
+unless on (cluster_id, name)
label_replace(dns_operator_azure_zone_info{type="public"}, "name", "$1", "resource_group", "(.+)")
for: 30m
labels:
@@ -31,7 +31,7 @@ spec:
{{`Error rate for {{ $labels.method }} is high. Check dns-operator-azure logs in installation/{{ $labels.installation }}.`}}
opsrecipe: dns-operator-azure/
expr: |-
-sum by (method,installation) (rate(dns_operator_azure_api_request_errors_total[5m])) > 0
+sum by (cluster_id, method, installation) (rate(dns_operator_azure_api_request_errors_total[5m])) > 0
for: 15m
labels:
area: kaas
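The same concern applies to binary operations such as the unless expression above: on (...) compares only the listed labels, so without cluster_id a matching series from any other cluster could suppress the alert once all clusters share one datastore. A toy illustration with made-up metrics left_series and right_series (purely hypothetical, for intuition only):

# Keeps the left-hand series that have no right-hand match with the same
# cluster_id and name; dropping cluster_id from on (...) would let a match
# in a different cluster silence the alert for this one.
left_series unless on (cluster_id, name) right_series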

@@ -25,7 +25,7 @@ spec:
annotations:
description: '{{`Cluster {{ $labels.installation }}/{{ $labels.cluster_id }}: there are too many API requests for flow schema {{ $labels.flow_schema }}.`}}'
opsrecipe: flowcontrol-rejected-requests/
-expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (priority_level) > (min by(priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(priority_level) (apiserver_flowcontrol_request_concurrency_limit))
+expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (cluster_id, priority_level) > (min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit))
for: 15m
labels:
area: kaas

@@ -46,7 +46,7 @@ spec:
annotations:
description: '{{`Fluentbit is down on node ({{ $labels.node }}).`}}'
opsrecipe: fluentbit-down/
-expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, cluster_type, installation, job, namespace, provider, node) == 0
+expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, job, namespace, node) == 0
for: 15m
labels:
area: empowerment

@@ -107,7 +107,7 @@ spec:
cancel_if_outside_working_hours: "true"
team: honeybadger
topic: releng
-# this alert checks average reconciliation times in 10 min windows, then calculates monthly error budget usage for it
+# this alert checks average reconciliation times in 10 min windows, then calculates monthly error budget usage for it
- alert: FluxReconciliationLongErrorBudgetLow
annotations:
description: |-

@@ -52,16 +52,12 @@ spec:
- alert: InhibitionClusterWithoutWorkerNodes
annotations:
description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}'
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
expr: sum(aws_operator_asg_desired_count) by (cluster_id) - on(cluster_id) sum(aws_operator_asg_desired_count{asg=~".*-tccpn-.*"}) by (cluster_id) == 0
-{{- end }}
labels:
area: kaas
has_worker_nodes: "false"
team: phoenix
topic: status
-{{- end }}
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
- alert: InhibitionKiamErrors
annotations:
description: '{{`Kiam on cluster {{ $labels.cluster_id }} has increased error rate.`}}'

@@ -59,7 +59,6 @@ spec:
cluster_status_deleting: "true"
team: phoenix
topic: status
-{{- if eq .Values.managementCluster.provider.kind "aws" }}
- alert: InhibitionClusterWithNoNodePools
annotations:
description: '{{`Cluster {{ $labels.cluster_id }} doesn''t have any node pools.`}}'
@@ -96,5 +95,4 @@ spec:
instance_state_not_running: "true"
team: phoenix
topic: status
-{{- end }}
{{- end }}