From 4da1fd343f5e2c77f9ba6b91b57233634103320b Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Mon, 11 Mar 2024 15:47:46 +0100 Subject: [PATCH] Prepare alerts for mimir migration (#1060) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Prepare alerts for mimir migration * Reduce grouping to cluster-id only * Update helm/prometheus-rules/templates/alerting-rules/dex.rules.yml Co-authored-by: Hervé Nicol * Update helm/prometheus-rules/templates/alerting-rules/dns-operator-azure.rules.yml Co-authored-by: Hervé Nicol * add missing cluster_ids * add missing cluster_ids again * add missing cluster_ids again * Fix KSM and prometheus-agent alerts * Fix failing tests --------- Co-authored-by: Hervé Nicol --- .../apiserver.management-cluster.rules.yml | 2 +- .../apiserver.workload-cluster.rules.yml | 4 +- .../templates/alerting-rules/calico.rules.yml | 2 + .../alerting-rules/cert-manager.rules.yml | 2 +- .../alerting-rules/certificate.all.rules.yml | 2 + .../certificate.management-cluster.rules.yml | 2 +- .../templates/alerting-rules/cilium.rules.yml | 4 +- .../alerting-rules/coredns.rules.yml | 17 +- .../templates/alerting-rules/crsync.rules.yml | 2 +- .../deployment.management-cluster.rules.yml | 17 +- .../templates/alerting-rules/dex.rules.yml | 2 +- .../dns-operator-azure.rules.yml | 4 +- .../alerting-rules/fairness.rules.yml | 2 +- .../alerting-rules/fluentbit.rules.yml | 2 +- .../templates/alerting-rules/flux.rules.yml | 2 +- .../alerting-rules/inhibit.all.rules.yml | 4 - .../inhibit.management-cluster.rules.yml | 2 - .../kube-state-metrics.rules.yml | 153 +++++++++++++++++- .../alerting-rules/kyverno.all.rules.yml | 6 +- .../alerting-rules/loki.all.rules.yml | 6 +- .../management-cluster.rules.yml | 4 +- .../templates/alerting-rules/mimir.rules.yml | 2 +- .../node.management_cluster.rules.yml | 4 +- .../node.workload_cluster.rules.yml | 14 +- .../alerting-rules/operatorkit.rules.yml | 4 +- .../alerting-rules/prometheus-agent.rules.yml | 62 ++++--- .../prometheus-operator.rules.yml | 4 +- .../alerting-rules/promtail.rules.yml | 4 +- .../alerting-rules/service-level.rules.yml | 8 +- .../alerting-rules/silence-operator.rules.yml | 2 +- .../templates/alerting-rules/sloth.rules.yml | 2 +- .../global/kyverno.all.rules.test.yml | 9 +- .../providers/global/loki.all.rules.test.yml | 3 +- .../providers/global/mimir.rules.test.yml | 1 + .../global/prometheus-agent.rules.test.yml | 96 +++++++++++ .../providers/global/promtail.rules.test.yml | 5 + .../providers/global/sloth.rules.test.yml | 1 + 37 files changed, 369 insertions(+), 93 deletions(-) diff --git a/helm/prometheus-rules/templates/alerting-rules/apiserver.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/apiserver.management-cluster.rules.yml index 8951cc721..f60a3f8d7 100644 --- a/helm/prometheus-rules/templates/alerting-rules/apiserver.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/apiserver.management-cluster.rules.yml @@ -20,7 +20,7 @@ spec: annotations: description: '{{`Kubernetes API Server {{ $labels.verb }} request latency is too high.`}}' opsrecipe: apiserver-overloaded/ - expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{cluster_type="management_cluster", verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by (verb, le)) > 1 + expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{cluster_type="management_cluster", verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by 
(cluster_id, verb, le)) > 1 for: 1h labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml index edd96b89a..d4b00066b 100644 --- a/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/apiserver.workload-cluster.rules.yml @@ -19,7 +19,7 @@ spec: annotations: description: '{{`Kubernetes API Server {{ $labels.verb }} request latency is too high.`}}' opsrecipe: apiserver-overloaded/ - expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by (verb, le)) > 1 + expr: histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{cluster_type="workload_cluster", verb=~"CONNECT|DELETE|GET|PATCH|POST|PUT"}[1h])) by (cluster_id, verb, le)) > 1 for: 1h labels: area: kaas @@ -35,7 +35,7 @@ spec: annotations: description: '{{`Kubernetes API Server {{ $labels.cluster_id }} having admission webhook errors.`}}' opsrecipe: apiserver-admission-webhook-errors/ - expr: rate(apiserver_admission_webhook_rejection_count{error_type=~"calling_webhook_error|apiserver_internal_error"}[5m]) > 0 + expr: rate(apiserver_admission_webhook_rejection_count{cluster_type="workload_cluster", error_type=~"calling_webhook_error|apiserver_internal_error"}[5m]) > 0 for: 5m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/calico.rules.yml b/helm/prometheus-rules/templates/alerting-rules/calico.rules.yml index a96838e9b..9adbc20e6 100644 --- a/helm/prometheus-rules/templates/alerting-rules/calico.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/calico.rules.yml @@ -1,3 +1,4 @@ +{{- if eq .Values.managementCluster.provider.flavor "vintage" }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -37,3 +38,4 @@ spec: severity: notify team: {{ include "providerTeam" . }} topic: kubernetes +{{- end }} diff --git a/helm/prometheus-rules/templates/alerting-rules/cert-manager.rules.yml b/helm/prometheus-rules/templates/alerting-rules/cert-manager.rules.yml index f7ab66b53..7404af6b5 100644 --- a/helm/prometheus-rules/templates/alerting-rules/cert-manager.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/cert-manager.rules.yml @@ -17,7 +17,7 @@ spec: If memory usage value is equal to memory limit value then it is likely the pod will be evicted. If no limits are set then the pod will burst. 
`}} - expr: sum by (pod, namespace, container) (container_memory_working_set_bytes{container=~"(cert-manager)"}) / sum by (pod, namespace, container) (kube_pod_container_resource_requests_memory_bytes{container=~"(cert-manager)"}) >= 0.85 + expr: sum by (cluster_id, pod, namespace, container) (container_memory_working_set_bytes{container=~"(cert-manager|cert-manager-app-controller)"}) / sum by (cluster_id, pod, namespace, container) (kube_pod_container_resource_requests{resource="memory", unit="byte",container=~"(cert-manager|cert-manager-app-controller)"}) >= 0.85 for: 10m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/certificate.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/certificate.all.rules.yml index c1c497f09..a04fda923 100644 --- a/helm/prometheus-rules/templates/alerting-rules/certificate.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/certificate.all.rules.yml @@ -10,6 +10,7 @@ spec: groups: - name: certificate.all rules: + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} - alert: KiamCertificateSecretWillExpireInLessThanTwoWeeks annotations: description: '{{`Kiam Certificate stored in Secret {{ $labels.namespace }}/{{ $labels.name }} on {{ $labels.cluster_id }} will expire in less than two weeks.`}}' @@ -22,6 +23,7 @@ spec: severity: page team: phoenix topic: cert-manager + {{- end }} - alert: IRSACertificateSecretWillExpireInLessThanTwoWeeks annotations: description: '{{`IRSA Pod Identity Webhook Certificate stored in Secret {{ $labels.namespace }}/{{ $labels.name }} on {{ $labels.cluster_id }} will expire in less than two weeks.`}}' diff --git a/helm/prometheus-rules/templates/alerting-rules/certificate.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/certificate.management-cluster.rules.yml index b37457eab..c83fb2e78 100644 --- a/helm/prometheus-rules/templates/alerting-rules/certificate.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/certificate.management-cluster.rules.yml @@ -17,7 +17,7 @@ spec: annotations: description: '{{`Cannot renew Certificate for Secret {{ $labels.namespace }}/{{ $labels.certificatename }} because it is missing.`}}' opsrecipe: managed-app-cert-manager/missing-certificate-for-secret/ - expr: count(cert_exporter_secret_not_after{cluster_type="management_cluster", secretkey="tls.crt", certificatename!=""}) by (certificatename,namespace) unless count(label_replace(cert_exporter_certificate_cr_not_after{cluster_type="management_cluster"}, "certificatename", "$1", "name", "(.*)")) by (certificatename,namespace) + expr: count(cert_exporter_secret_not_after{cluster_type="management_cluster", secretkey="tls.crt", certificatename!=""}) by (cluster_id, certificatename, namespace) unless count(label_replace(cert_exporter_certificate_cr_not_after{cluster_type="management_cluster"}, "certificatename", "$1", "name", "(.*)")) by (cluster_id, certificatename,namespace) for: 5m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml b/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml index 6c1ef0a75..49aced72d 100644 --- a/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/cilium.rules.yml @@ -14,7 +14,7 @@ spec: annotations: description: '{{`Cilium BPF map is about to fill up.`}}' opsrecipe: cilium-bpf-map/ - expr: avg(cilium_bpf_map_pressure) by (map_name) * 100 > 80 + expr: 
avg(cilium_bpf_map_pressure) by (cluster_id, map_name) * 100 > 80 for: 15m labels: area: kaas @@ -26,7 +26,7 @@ spec: annotations: description: '{{`Cilium BPF map is about filled up.`}}' opsrecipe: cilium-bpf-map/ - expr: avg(cilium_bpf_map_pressure) by (map_name) * 100 > 95 + expr: avg(cilium_bpf_map_pressure) by (cluster_id, map_name) * 100 > 95 for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml b/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml index af5454208..e1ff1cdd2 100644 --- a/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/coredns.rules.yml @@ -14,7 +14,8 @@ spec: annotations: description: '{{`CoreDNS Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' opsrecipe: core-dns-deployment-not-satisfied/ - expr: sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) / (sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) + sum(kube_deployment_status_replicas_unavailable{deployment=~"coredns.*"}))* 100 < 51 + expr: | + sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) by (cluster_id) / (sum(kube_deployment_status_replicas_available{deployment=~"coredns.*"}) by (cluster_id) + sum(kube_deployment_status_replicas_unavailable{deployment=~"coredns.*"}) by (cluster_id))* 100 < 51 for: 10m labels: area: empowerment @@ -25,7 +26,19 @@ spec: team: cabbage topic: dns - alert: CoreDNSMaxHPAReplicasReached - expr: kube_hpa_status_current_replicas{hpa="coredns"} == kube_hpa_spec_max_replicas{hpa="coredns"} AND kube_hpa_spec_min_replicas{hpa="coredns"} != kube_hpa_spec_max_replicas{hpa="coredns"} + expr: | + ( + # This is using the deprecated HPA metric names https://github.com/kubernetes/kube-state-metrics/commit/eb01334f2d03ebc3ab25cd7b29d0ff28f6ca5ee0 + # TODO(@team-cabbage) remove once kube-state-metrics is updated to use the new metric names everywhere + kube_hpa_status_current_replicas{hpa="coredns"} == kube_hpa_spec_max_replicas{hpa="coredns"} + and + kube_hpa_spec_min_replicas{hpa="coredns"} != kube_hpa_spec_max_replicas{hpa="coredns"} + ) or ( + # This is using the new HPA metric names + kube_horizontalpodautoscaler_status_current_replicas{horizontalpodautoscaler="coredns"} == kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="coredns"} + and + kube_horizontalpodautoscaler_spec_min_replicas{horizontalpodautoscaler="coredns"} != kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="coredns"} + ) for: 120m labels: area: empowerment diff --git a/helm/prometheus-rules/templates/alerting-rules/crsync.rules.yml b/helm/prometheus-rules/templates/alerting-rules/crsync.rules.yml index f4995ef68..d5be1b1e6 100644 --- a/helm/prometheus-rules/templates/alerting-rules/crsync.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/crsync.rules.yml @@ -32,7 +32,7 @@ spec: annotations: description: '{{`Too many tags are not synchronised to registry mirrors.`}}' opsrecipe: crsync-too-many-tags-missing/ - expr: crsync_sync_tags_total{registry="quay.io"} - on (repository,app) group_left sum by(repository,app) (crsync_sync_tags_total{registry!="quay.io"}) > 0 + expr: crsync_sync_tags_total{registry="quay.io"} - on (cluster_id, repository, app) group_left sum by(cluster_id, repository, app) (crsync_sync_tags_total{registry!="quay.io"}) > 0 for: 1h labels: area: kaas diff --git 
a/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml index 3e518f828..b2df1ee3e 100644 --- a/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/deployment.management-cluster.rules.yml @@ -17,22 +17,7 @@ spec: annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' opsrecipe: deployment-not-satisfied/ - expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*", cluster_id!~"argali|giraffe"} > 0 - for: 30m - labels: - area: kaas - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} - severity: page - team: atlas - topic: managementcluster - - alert: DeploymentNotSatisfiedChinaAtlas - annotations: - description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' - opsrecipe: deployment-not-satisfied/ - expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*", cluster_id=~"argali|giraffe"} > 0 + expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*"} > 0 for: 30m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/dex.rules.yml b/helm/prometheus-rules/templates/alerting-rules/dex.rules.yml index db8014999..03e224a89 100644 --- a/helm/prometheus-rules/templates/alerting-rules/dex.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/dex.rules.yml @@ -29,7 +29,7 @@ spec: annotations: description: '{{`dex-operator failed to renew secret of {{ $labels.app_registration_name }} for {{ $labels.app_owner }} on provider {{ $labels.provider_type }}.`}}' opsrecipe: dex-operator/ - expr: min by(app_registration_name, app_owner, app_namespace, provider_name, provider_type, installation) (aggregation:dex_operator_idp_secret_expiry_time{cluster_type="management_cluster"}) - time() < 60*60*12 + expr: min by(app_registration_name, app_owner, app_namespace, provider_name, provider_type, installation, cluster_id) (aggregation:dex_operator_idp_secret_expiry_time{cluster_type="management_cluster"}) - time() < 60*60*12 for: 30m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/dns-operator-azure.rules.yml b/helm/prometheus-rules/templates/alerting-rules/dns-operator-azure.rules.yml index 58e15632c..2e8db77c7 100644 --- a/helm/prometheus-rules/templates/alerting-rules/dns-operator-azure.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/dns-operator-azure.rules.yml @@ -16,7 +16,7 @@ spec: opsrecipe: dns-operator-azure/ expr: |- capi_cluster_status_phase{phase="Provisioned"} - unless on (name) + unless on (cluster_id, name) label_replace(dns_operator_azure_zone_info{type="public"}, "name", "$1", "resource_group", "(.+)") for: 30m labels: @@ -31,7 +31,7 @@ spec: {{`Error rate for {{ $labels.method }} is high. 
Check dns-operator-azure logs in installation/{{ $labels.installation }}.`}} opsrecipe: dns-operator-azure/ expr: |- - sum by (method,installation) (rate(dns_operator_azure_api_request_errors_total[5m])) > 0 + sum by (cluster_id, method, installation) (rate(dns_operator_azure_api_request_errors_total[5m])) > 0 for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml b/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml index 4200addff..1b29fb629 100644 --- a/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/fairness.rules.yml @@ -25,7 +25,7 @@ spec: annotations: description: '{{`Cluster {{ $labels.installation }}/{{ $labels.cluster_id }}: there are too many API requests for flow schema {{ $labels.flow_schema }}.`}}' opsrecipe: flowcontrol-rejected-requests/ - expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (priority_level) > (min by(priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(priority_level) (apiserver_flowcontrol_request_concurrency_limit)) + expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (cluster_id, priority_level) > (min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit_overwrite) OR min by(cluster_id, priority_level) (apiserver_flowcontrol_request_concurrency_limit)) for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml b/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml index c3cdbc932..93b9d51d8 100644 --- a/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/fluentbit.rules.yml @@ -46,7 +46,7 @@ spec: annotations: description: '{{`Fluentbit is down on node ({{ $labels.node }}).`}}' opsrecipe: fluentbit-down/ - expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, cluster_type, installation, job, namespace, provider, node) == 0 + expr: sum(up{app="fluent-logshipping-app"}) by (app, cluster_id, job, namespace, node) == 0 for: 15m labels: area: empowerment diff --git a/helm/prometheus-rules/templates/alerting-rules/flux.rules.yml b/helm/prometheus-rules/templates/alerting-rules/flux.rules.yml index 2873e7df4..d972b5a06 100644 --- a/helm/prometheus-rules/templates/alerting-rules/flux.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/flux.rules.yml @@ -107,7 +107,7 @@ spec: cancel_if_outside_working_hours: "true" team: honeybadger topic: releng -# this alert checks average reconciliation times in 10 min windows, then calculates monthly error budget usage for it + # this alert checks average reconciliation times in 10 min windows, then calculates monthly error budget usage for it - alert: FluxReconciliationLongErrorBudgetLow annotations: description: |- diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml index 1c6fdf3a0..318ae2c44 100644 --- a/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml @@ -52,16 +52,12 @@ spec: - alert: InhibitionClusterWithoutWorkerNodes annotations: description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}' - {{- if eq .Values.managementCluster.provider.kind "aws" }} expr: sum(aws_operator_asg_desired_count) by (cluster_id) - on(cluster_id) 
sum(aws_operator_asg_desired_count{asg=~".*-tccpn-.*"}) by (cluster_id) == 0 - {{- end }} labels: area: kaas has_worker_nodes: "false" team: phoenix topic: status - {{- end }} - {{- if eq .Values.managementCluster.provider.kind "aws" }} - alert: InhibitionKiamErrors annotations: description: '{{`Kiam on cluster {{ $labels.cluster_id }} has increased error rate.`}}' diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml index 59adfed5e..d23d0f156 100644 --- a/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.management-cluster.rules.yml @@ -59,7 +59,6 @@ spec: cluster_status_deleting: "true" team: phoenix topic: status - {{- if eq .Values.managementCluster.provider.kind "aws" }} - alert: InhibitionClusterWithNoNodePools annotations: description: '{{`Cluster {{ $labels.cluster_id }} doesn''t have any node pools.`}}' @@ -96,5 +95,4 @@ spec: instance_state_not_running: "true" team: phoenix topic: status - {{- end }} {{- end }} diff --git a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml index 8b15812f2..88fa3264a 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kube-state-metrics.rules.yml @@ -62,7 +62,7 @@ spec: opsrecipe: kube-state-metrics-down/ expr: |- # When it looks up but we don't have metrics - count({app="kube-state-metrics"}) < 10 + count({app="kube-state-metrics"}) by (cluster_id) < 10 for: 20m labels: area: kaas @@ -77,11 +77,27 @@ spec: severity: page team: atlas topic: observability + - alert: KubeConfigMapCreatedMetricMissing annotations: description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_configmap_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_configmap_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -94,7 +110,22 @@ spec: annotations: description: '{{`kube_daemonset_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_daemonset_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_daemonset_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -107,7 +138,22 @@ spec: annotations: description: '{{`kube_deployment_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_deployment_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + 
) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_deployment_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -120,7 +166,22 @@ spec: annotations: description: '{{`kube_endpoint_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_endpoint_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_endpoint_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -133,7 +194,22 @@ spec: annotations: description: '{{`kube_namespace_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_namespace_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_namespace_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -146,7 +222,22 @@ spec: annotations: description: '{{`kube_node_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_node_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_node_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -159,7 +250,22 @@ spec: annotations: description: '{{`kube_pod_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_pod_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_pod_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -172,7 +278,22 @@ spec: annotations: description: '{{`kube_replicaset_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_replicaset_created{}) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(kube_replicaset_created{}) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 30m labels: area: kaas @@ -185,7 +306,22 @@ spec: annotations: description: '{{`kube_secret_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: 
kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_secret_created{}) + {{- else }} expr: |- ( label_replace( capi_cluster_status_condition{type="Ready", status="True"}, "cluster_id", "$1", "name", "(.*)" ) == 1 ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( count(kube_secret_created{}) by (cluster_id, customer, installation, pipeline, provider, region) ) + {{- end }} for: 30m labels: area: kaas @@ -198,7 +334,22 @@ spec: annotations: description: '{{`kube_service_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ + {{- if not .Values.mimir.enabled }} expr: absent(kube_service_created{}) + {{- else }} expr: |- ( label_replace( capi_cluster_status_condition{type="Ready", status="True"}, "cluster_id", "$1", "name", "(.*)" ) == 1 ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( count(kube_service_created{}) by (cluster_id, customer, installation, pipeline, provider, region) ) + {{- end }} for: 30m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml index 9cfab2cbb..7557182c4 100644 --- a/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/kyverno.all.rules.yml @@ -14,7 +14,7 @@ spec: annotations: description: 'Kyverno has no available replicas but webhooks are present.' opsrecipe: kyverno-webhooks/ - expr: sum(kube_validatingwebhookconfiguration_info{validatingwebhookconfiguration=~"kyverno-.*"}) > 0 and sum(kube_deployment_status_replicas{deployment=~"kyverno|kyverno-admission-controller"}) == 0 + expr: sum(kube_validatingwebhookconfiguration_info{validatingwebhookconfiguration=~"kyverno-.*"}) by (cluster_id) > 0 and sum(kube_deployment_status_replicas{deployment=~"kyverno|kyverno-admission-controller"}) by (cluster_id) == 0 for: 15m labels: area: managedservices @@ -63,7 +63,7 @@ spec: annotations: description: 'Kyverno has been scaled down for too long.' opsrecipe: kyverno-scaled-down/ - expr: sum(kube_deployment_spec_replicas{deployment=~"kyverno|kyverno-kyverno-plugin|kyverno-policy-reporter"}) == 0 + expr: sum(kube_deployment_spec_replicas{deployment=~"kyverno|kyverno-kyverno-plugin|kyverno-policy-reporter"}) by (cluster_id) == 0 for: 4h labels: area: managedservices @@ -78,7 +78,7 @@ spec: annotations: description: "Kyverno's admission controller deployment must use at least 3 replicas, or be scaled to 0." opsrecipe: KyvernoWronglyScaled/ - expr: sum(kube_deployment_spec_replicas{deployment="kyverno"}) != 0 and sum(kube_deployment_spec_replicas{deployment="kyverno"}) < 3 + expr: sum(kube_deployment_spec_replicas{deployment="kyverno"}) by (cluster_id) != 0 and sum(kube_deployment_spec_replicas{deployment="kyverno"}) by (cluster_id) < 3 for: 1h labels: area: managedservices diff --git a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml index 6767fb0d7..9eb724803 100644 --- a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml @@ -15,9 +15,9 @@ spec: description: This alert checks that we have less than 10% errors on Loki requests. 
opsrecipe: loki/ expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (cluster_id, namespace, job, route) / - sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) + sum(rate(loki_request_duration_seconds_count[1m])) by (cluster_id, namespace, job, route) > 10 for: 120m labels: @@ -36,7 +36,7 @@ spec: description: This alert checks that we have no panic errors on Loki. opsrecipe: loki/ expr: | - sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 + sum(increase(loki_panic_total[10m])) by (cluster_id, namespace, job) > 0 labels: area: managedservices cancel_if_apiserver_down: "true" diff --git a/helm/prometheus-rules/templates/alerting-rules/management-cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/management-cluster.rules.yml index f9903f286..e4bc26f81 100644 --- a/helm/prometheus-rules/templates/alerting-rules/management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/management-cluster.rules.yml @@ -15,7 +15,7 @@ spec: annotations: description: '{{`Management cluster {{ $labels.cluster_id }} has less than 3 nodes.`}}' opsrecipe: management-cluster-less-than-three-workers/ - expr: sum(kubelet_node_name{cluster_type="management_cluster"} * on (node) kube_node_role{role="worker", cluster_type="management_cluster"}) < 3 + expr: sum(kubelet_node_name{cluster_type="management_cluster"} * on (cluster_id, node) kube_node_role{role="worker", cluster_type="management_cluster"}) by (cluster_id) < 3 for: 1h labels: area: kaas @@ -26,7 +26,7 @@ spec: - alert: ManagementClusterMissingNodes annotations: description: '{{`Management cluster {{ $labels.cluster_id }} has less than 4 minimum nodes.`}}' - expr: sum(kube_node_status_condition{cluster_type="management_cluster", condition="Ready", status="true"}) < 4 + expr: sum(kube_node_status_condition{cluster_type="management_cluster", condition="Ready", status="true"}) by (cluster_id) < 4 for: 15m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml index b22ab1d5c..850221c15 100644 --- a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml @@ -12,7 +12,7 @@ spec: - alert: MimirComponentDown annotations: description: '{{`Mimir component : {{ $labels.service }} is down.`}}' - expr: count(up{app="mimir"} == 0) by (service) > 0 + expr: count(up{app="mimir"} == 0) by (cluster_id, service) > 0 for: 5m labels: area: managedservices diff --git a/helm/prometheus-rules/templates/alerting-rules/node.management_cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/node.management_cluster.rules.yml index 104b73d5d..84f43b47b 100644 --- a/helm/prometheus-rules/templates/alerting-rules/node.management_cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/node.management_cluster.rules.yml @@ -25,7 +25,7 @@ spec: # relabelling 'ip' to 'label_ip' to match against 'kube_node_labels'. 
annotations: description: '{{`Node {{ $labels.label_ip }} status is flapping under load.`}}' - expr: label_replace(node_load15{cluster_type="management_cluster"} / count(count(node_cpu{cluster_type="management_cluster"}) without (mode)) without (cpu) >= 2, "label_ip", "$1", "ip", "(.*)" ) unless on (label_ip) kube_node_labels{cluster_type="management_cluster"} and on (ip) changes(kube_node_status_condition{cluster_type="management_cluster", condition="Ready", status="true"}[30m]) >= 6 + expr: label_replace(node_load15{cluster_type="management_cluster"} / count(count(node_cpu_seconds_total{cluster_type="management_cluster"}) without (mode)) without (cpu) >= 2, "label_ip", "$1", "ip", "(.*)" ) unless on (label_ip) kube_node_labels{cluster_type="management_cluster"} and on (ip) changes(kube_node_status_condition{cluster_type="management_cluster", condition="Ready", status="true"}[30m]) >= 6 for: 10m labels: area: kaas @@ -90,7 +90,7 @@ spec: - alert: MachineLoadTooHigh annotations: description: '{{`Machine {{ $labels.instance }} CPU load is too high.`}}' - expr: node_load5{cluster_type="management_cluster"} > 2 * count(node_cpu{cluster_type="management_cluster", mode="idle"}) without (cpu,mode) + expr: node_load5{cluster_type="management_cluster"} > 2 * count(node_cpu_seconds_total{cluster_type="management_cluster", mode="idle"}) without (cpu,mode) for: 3m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/node.workload_cluster.rules.yml b/helm/prometheus-rules/templates/alerting-rules/node.workload_cluster.rules.yml index aa3dd0d9d..949c00ece 100644 --- a/helm/prometheus-rules/templates/alerting-rules/node.workload_cluster.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/node.workload_cluster.rules.yml @@ -27,6 +27,7 @@ spec: severity: notify team: {{ include "providerTeam" . }} topic: kubernetes + {{- if eq .Values.managementCluster.provider.kind "aws" }} - alert: AWSWorkloadClusterNodeTooManyAutoTermination annotations: description: '{{`Cluster {{ $labels.cluster_id }} has too many nodes terminated by node auto termination feature in a short time.`}}' @@ -41,6 +42,7 @@ spec: severity: page team: phoenix topic: kubernetes + {{- end }} - alert: NodeStateFlappingUnderLoad # Check if the kubelet status is flapping, unless the node is under load. # It helps to read this rule from the bottom upwards. @@ -53,7 +55,7 @@ spec: # relabelling 'ip' to 'label_ip' to match against 'kube_node_labels'. annotations: description: '{{`Node {{ $labels.label_ip }} status is flapping under load.`}}' - expr: label_replace( node_load15 / count(count(node_cpu) without (mode)) without (cpu) >= 2, "label_ip", "$1", "ip", "(.*)" ) unless on (label_ip) kube_node_labels and on (ip) changes(kube_node_status_condition{condition="Ready", status="true"}[30m]) >= 6 + expr: label_replace(node_load15{cluster_type="workload_cluster"} / count(count(node_cpu_seconds_total{cluster_type="workload_cluster"}) without (mode)) without (cpu) >= 2, "label_ip", "$1", "ip", "(.*)" ) unless on (cluster_id, label_ip) kube_node_labels{cluster_type="workload_cluster"} and on (cluster_id, ip) changes(kube_node_status_condition{cluster_type="workload_cluster", condition="Ready", status="true"}[30m]) >= 6 for: 10m labels: area: kaas @@ -68,7 +70,7 @@ spec: # in the last hour. 
annotations: description: '{{`Node {{ $labels.ip }} has constant OOM kills.`}}' - expr: kube_pod_container_status_restarts_total{namespace=~"(giantswarm|kube-system)"} - kube_pod_container_status_restarts_total offset 1h >= 1 AND ignoring(reason) kube_pod_container_status_last_terminated_reason{reason='OOMKilled'} > 0 + expr: kube_pod_container_status_restarts_total{cluster_type="workload_cluster", namespace=~"(giantswarm|kube-system)"} - kube_pod_container_status_restarts_total{cluster_type="workload_cluster"} offset 1h >= 1 AND ignoring(reason) kube_pod_container_status_last_terminated_reason{cluster_type="workload_cluster", reason="OOMKilled"} > 0 for: 10m labels: area: kaas @@ -82,7 +84,7 @@ spec: annotations: description: '{{`Node {{ $labels.node }} reports a connection usage above 85% for the last 15 minutes.`}}' opsrecipe: node-conntrack-limits/ - expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit >= 0.85 + expr: node_nf_conntrack_entries{cluster_type="workload_cluster"} / node_nf_conntrack_entries_limit{cluster_type="workload_cluster"} >= 0.85 for: 15m labels: area: kaas @@ -94,7 +96,7 @@ spec: annotations: description: '{{`Machine {{ $labels.instance }} entropy is too low.`}}' opsrecipe: low-entropy/ - expr: node_entropy_available_bits < 250 + expr: node_entropy_available_bits{cluster_type="workload_cluster"} < 250 for: 10m labels: area: kaas @@ -106,7 +108,7 @@ spec: annotations: description: '{{`Machine {{ $labels.instance }} has too many allocated file descriptors.`}}' opsrecipe: high-number-file-descriptors/ - expr: node_filefd_allocated / node_filefd_maximum * 100 > 80 + expr: node_filefd_allocated{cluster_type="workload_cluster"} / node_filefd_maximum{cluster_type="workload_cluster"} * 100 > 80 for: 15m labels: area: kaas @@ -144,7 +146,7 @@ spec: ( node_memory_MemFree_bytes{cluster_type="workload_cluster"} + node_memory_Cached_bytes{cluster_type="workload_cluster"} ) < 2147483648) - and on (node) kube_node_role{role=~"control-plane|master"} + and on (cluster_id, node) kube_node_role{cluster_type="workload_cluster", role=~"control-plane|master"} for: 60m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/operatorkit.rules.yml b/helm/prometheus-rules/templates/alerting-rules/operatorkit.rules.yml index 09f73a963..d7f02f364 100644 --- a/helm/prometheus-rules/templates/alerting-rules/operatorkit.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/operatorkit.rules.yml @@ -77,7 +77,7 @@ spec: annotations: description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has stopped the reconciliation. Please check logs.`}}' opsrecipe: operator-not-reconciling/ - expr: (sum by (instance, app, app_version, namespace)(increase(operatorkit_controller_event_count{app=~"aws-operator.+|cluster-operator.+"}[10m])) == 0 and on (instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp)) + expr: (sum by (cluster_id, instance, app, app_version, namespace)(increase(operatorkit_controller_event_count{app=~"aws-operator.+|cluster-operator.+"}[10m])) == 0 and on (cluster_id, instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp)) for: 20m labels: area: kaas @@ -105,7 +105,7 @@ spec: annotations: description: '{{`{{ $labels.namespace }}/{{ $labels.app }}@{{ $labels.app_version }} has stopped the reconciliation. 
Please check logs.`}}' opsrecipe: operator-not-reconciling/ - expr: (sum by (instance, app, app_version, namespace)(increase(operatorkit_controller_event_count{app=~"node-operator"}[10m])) == 0 and on (instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp)) + expr: (sum by (cluster_id, instance, app, app_version, namespace)(increase(operatorkit_controller_event_count{app=~"node-operator"}[10m])) == 0 and on (cluster_id, instance) (operatorkit_controller_deletion_timestamp or operatorkit_controller_creation_timestamp)) for: 20m labels: area: kaas diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml index 3f6cf73c5..256711de2 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-agent.rules.yml @@ -17,7 +17,7 @@ spec: summary: Prometheus agent fails to send samples to remote write endpoint. opsrecipe: prometheus-agent/ dashboard: promRW001/prometheus-remote-write - # expr: count(absent_over_time(up{instance="prometheus-agent"}[10m])) + {{- if not .Values.mimir.enabled }} expr: |- max_over_time( sum by (cluster_type, cluster_id, installation, instance, service) @@ -27,6 +27,20 @@ spec: absent(up{instance="prometheus-agent"}) == 1 )[5m:] ) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(up{job="prometheus-agent"} > 0) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 20m labels: area: empowerment @@ -44,7 +58,7 @@ spec: summary: Prometheus agent fails to send samples to remote write endpoint. opsrecipe: prometheus-agent/ dashboard: promRW001/prometheus-remote-write - # expr: count(absent_over_time(up{instance="prometheus-agent"}[10m])) + {{- if not .Values.mimir.enabled }} expr: |- max_over_time( sum by (cluster_type, cluster_id, installation, instance, service) @@ -54,6 +68,20 @@ spec: absent(up{instance="prometheus-agent"}) == 1 )[5m:] ) + {{- else }} + expr: |- + ( + label_replace( + capi_cluster_status_condition{type="Ready", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) unless on (cluster_id, customer, installation, pipeline, provider, region) ( + count(up{job="prometheus-agent"} > 0) by (cluster_id, customer, installation, pipeline, provider, region) + ) + {{- end }} for: 1m labels: area: empowerment @@ -71,23 +99,18 @@ spec: summary: Prometheus agent is missing shards. 
opsrecipe: prometheus-agent/ expr: |- - max_over_time(sum( + max_over_time(sum by (cluster_id)( count( ## number of remotes that are not mimir or grafana-cloud prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) + ) by (cluster_id) != sum( ## number of shards defined in the Prometheus CR prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - or - ( - # if there is only 1 shard, there is no shard metric so we use the replicas metric - absent(prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}) - and on(controller, name) - prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) - ) + # if there is only 1 shard, there is no shard metric so we use the replicas metric + or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) by (cluster_id) )[5m:]) for: 20m labels: @@ -107,23 +130,18 @@ spec: summary: Prometheus agent is missing shards. opsrecipe: prometheus-agent/ expr: |- - max_over_time(sum( + max_over_time(sum by (cluster_id)( count( ## number of remotes that are not mimir or grafana-cloud prometheus_remote_storage_metadata_total{remote_name!~"grafana-cloud|mimir.*"} - ) + ) by (cluster_id) != sum( ## number of shards defined in the Prometheus CR prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"} - or - ( - # if there is only 1 shard, there is no shard metric so we use the replicas metric - absent(prometheus_operator_spec_shards{controller="prometheus",name="prometheus-agent"}) - and on(controller, name) - prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} - ) - ) + # if there is only 1 shard, there is no shard metric so we use the replicas metric + or prometheus_operator_spec_replicas{controller="prometheus",name="prometheus-agent"} + ) by (cluster_id) )[5m:]) for: 1m labels: diff --git a/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml index 90f6e4359..842d5aac0 100644 --- a/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/prometheus-operator.rules.yml @@ -33,7 +33,7 @@ spec: - alert: PrometheusOperatorListErrors annotations: description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace. - expr: (sum by (cluster_id, controller,namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4 + expr: (sum by (cluster_id, controller, namespace) (rate(prometheus_operator_list_operations_failed_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m])) / sum by (cluster_id, controller,namespace) (rate(prometheus_operator_list_operations_total{app=~"prometheus-operator.*|kube-prometheus-.*"}[10m]))) > 0.4 for: 15m labels: area: empowerment @@ -88,7 +88,7 @@ spec: - alert: PrometheusOperatorNotReady annotations: description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources. 
- expr: min by(cluster_id, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0) + expr: min by (cluster_id, namespace, controller) (max_over_time(prometheus_operator_ready{app=~"prometheus-operator.*|kube-prometheus-.*"}[5m]) == 0) for: 5m labels: area: empowerment diff --git a/helm/prometheus-rules/templates/alerting-rules/promtail.rules.yml b/helm/prometheus-rules/templates/alerting-rules/promtail.rules.yml index 63b6d2097..f11abe93b 100644 --- a/helm/prometheus-rules/templates/alerting-rules/promtail.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/promtail.rules.yml @@ -13,7 +13,7 @@ spec: annotations: description: '{{`Scraping of all promtail pods to check if one failed every 5 minutes.`}}' opsrecipe: promtail-is-not-running/ - expr: count(up{container="promtail"} == 0) > 0 + expr: count(up{container="promtail"} == 0) by (cluster_id) > 0 for: 5m labels: area: "empowerment" @@ -29,7 +29,7 @@ spec: description: This alert checks if that the amount of failed requests is below 10% for promtail opsrecipe: promtail-requests-are-failing/ expr: | - 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10 + 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (cluster_id, namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (cluster_id, namespace, job, route, instance) > 10 for: 15m labels: area: "empowerment" diff --git a/helm/prometheus-rules/templates/alerting-rules/service-level.rules.yml b/helm/prometheus-rules/templates/alerting-rules/service-level.rules.yml index 3e5e5c195..e422c0c2c 100644 --- a/helm/prometheus-rules/templates/alerting-rules/service-level.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/service-level.rules.yml @@ -19,21 +19,21 @@ spec: label_replace( ( slo_errors_per_request:ratio_rate1h{service!~"efk-.*|.*external-dns.*|kong-.*|.*(ingress-nginx|nginx-ingress-controller).*"} - > on (service) group_left () + > on (cluster_id, service) group_left () slo_threshold_high and slo_errors_per_request:ratio_rate5m{service!~"efk-.*|.*external-dns.*|kong-.*|.*(ingress-nginx|nginx-ingress-controller).*"} - > on (service) group_left () + > on (cluster_id, service) group_left () slo_threshold_high ) or ( slo_errors_per_request:ratio_rate6h{service!~"efk-.*|.*external-dns.*|kong-.*|.*(ingress-nginx|nginx-ingress-controller).*"} - > on (service) group_left () + > on (cluster_id, service) group_left () slo_threshold_low and slo_errors_per_request:ratio_rate30m{service!~"efk-.*|.*external-dns.*|kong-.*|.*(ingress-nginx|nginx-ingress-controller).*"} - > on (service) group_left () + > on (cluster_id, service) group_left () slo_threshold_low ), "team", diff --git a/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml b/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml index ce09c1305..6756a633f 100644 --- a/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/silence-operator.rules.yml @@ -30,7 +30,7 @@ spec: # This alert triggers when the silence operator sync job did not schedule for more than 1 day # or if the job did not run successfully at least once in the last day expr: (time() - 
kube_cronjob_status_last_schedule_time{cronjob="silence-operator-sync", cluster_type="management_cluster"}) > 86400 - or count(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync.+", cluster_type="management_cluster"}[1d]) == 1) == 0 + or count(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync.+", cluster_type="management_cluster"}[1d]) == 1) by (cluster_id) == 0 labels: area: empowerment severity: page diff --git a/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml b/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml index 0daee50a3..93a5a1257 100644 --- a/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml +++ b/helm/prometheus-rules/templates/alerting-rules/sloth.rules.yml @@ -12,7 +12,7 @@ spec: - alert: SlothDown annotations: description: 'Sloth is down.' - expr: count(up{app="sloth"} == 0) > 0 + expr: count(up{app="sloth"} == 0) by (cluster_id) > 0 for: 5m labels: area: managedservices diff --git a/test/tests/providers/global/kyverno.all.rules.test.yml b/test/tests/providers/global/kyverno.all.rules.test.yml index 6b89a5071..b5044d4a2 100644 --- a/test/tests/providers/global/kyverno.all.rules.test.yml +++ b/test/tests/providers/global/kyverno.all.rules.test.yml @@ -18,10 +18,10 @@ tests: - series: 'kube_deployment_spec_replicas{app="kube-state-metrics", cluster_id="gremlin", cluster_type="management_cluster", container="kube-state-metrics", customer="giantswarm", deployment="kyverno", endpoint="http", installation="gremlin", instance="10.0.135.241:8080", job="kube-state-metrics", namespace="kyverno", node="master-00000y", organization="giantswarm", pipeline="testing", pod="prometheus-operator-app-kube-state-metrics-d7f4ff68d-qn6sb", prometheus="kube-system/prometheus-agent", prometheus_replica="prometheus-prometheus-agent-0", provider="aws", region="germanywestcentral", service="prometheus-operator-app-kube-state-metrics", service_priority="highest"}' values: "0+0x240 1+0x70" # Kyverno admission reports - - series: 'aggregation:kyverno_resource_counts{kind="admissionreports.kyverno.io"}' + - series: 'aggregation:kyverno_resource_counts{cluster_id="gremlin", kind="admissionreports.kyverno.io"}' values: "0+1000x30 30000+1500x30" # Kyverno updaterequests - - series: 'aggregation:kyverno_resource_counts{kind="updaterequests.kyverno.io"}' + - series: 'aggregation:kyverno_resource_counts{cluster_id="gremlin", kind="updaterequests.kyverno.io"}' values: "0+100x15 5000+1500x30" alert_rule_test: # Webhooks alert @@ -30,6 +30,7 @@ tests: exp_alerts: - exp_labels: area: managedservices + cluster_id: gremlin severity: page team: shield topic: kyverno @@ -46,6 +47,7 @@ tests: exp_alerts: - exp_labels: area: managedservices + cluster_id: gremlin severity: page team: shield topic: kyverno @@ -63,6 +65,7 @@ tests: exp_alerts: - exp_labels: area: managedservices + cluster_id: gremlin severity: notify team: shield topic: kyverno @@ -80,6 +83,7 @@ tests: exp_alerts: - exp_labels: area: managedservices + cluster_id: gremlin severity: notify team: shield topic: kyverno @@ -96,6 +100,7 @@ tests: exp_alerts: - exp_labels: area: managedservices + cluster_id: gremlin severity: notify team: shield topic: kyverno diff --git a/test/tests/providers/global/loki.all.rules.test.yml b/test/tests/providers/global/loki.all.rules.test.yml index 3aa69fea1..03bb95fe6 100644 --- a/test/tests/providers/global/loki.all.rules.test.yml +++ b/test/tests/providers/global/loki.all.rules.test.yml @@ -14,7 +14,6 @@ tests: - series: 
'loki_request_duration_seconds_count{app="loki-distributor", cluster_id="zj88t", cluster_type="workload_cluster", container="distributor", customer="giantswarm", installation="gorilla", instance="10.7.75.90:3100", job="zj88t-prometheus/workload-zj88t/0", method="POST", namespace="loki", node="ip-10-6-2-141.eu-central-1.compute.internal", organization="giantswarm-production", pod="loki-distributor-74b78f5559-tz6zs", provider="aws", route="loki_api_v1_push", service_priority="highest", status_code="503", ws="false"}' values: "0+0x20 0+30x160" # After 20 minutes, we also have 0.5 rq/s failing alert_rule_test: - - alertname: LokiRequestPanics eval_time: 15m # should be OK after 15 minutes exp_alerts: @@ -29,6 +28,7 @@ tests: cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" cancel_if_scrape_timeout: "true" + cluster_id: zj88t job: zj88t-prometheus/workload-zj88t/0 namespace: loki severity: page @@ -55,6 +55,7 @@ tests: cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" cancel_if_scrape_timeout: "true" + cluster_id: zj88t job: zj88t-prometheus/workload-zj88t/0 namespace: loki route: loki_api_v1_push diff --git a/test/tests/providers/global/mimir.rules.test.yml b/test/tests/providers/global/mimir.rules.test.yml index 2c4787806..0067276ad 100644 --- a/test/tests/providers/global/mimir.rules.test.yml +++ b/test/tests/providers/global/mimir.rules.test.yml @@ -28,5 +28,6 @@ tests: cancel_if_cluster_status_updating: "true" cancel_if_scrape_timeout: "true" cancel_if_outside_working_hours: "true" + cluster_id: gauss exp_annotations: description: "Mimir component : mimir-ingester is down." diff --git a/test/tests/providers/global/prometheus-agent.rules.test.yml b/test/tests/providers/global/prometheus-agent.rules.test.yml index 437313d5e..c4602f313 100644 --- a/test/tests/providers/global/prometheus-agent.rules.test.yml +++ b/test/tests/providers/global/prometheus-agent.rules.test.yml @@ -114,6 +114,7 @@ tests: exp_alerts: - exp_labels: area: empowerment + cluster_id: test01 severity: page team: atlas topic: observability @@ -131,6 +132,7 @@ tests: exp_alerts: - exp_labels: area: empowerment + cluster_id: test01 severity: none team: atlas topic: observability @@ -148,6 +150,7 @@ tests: exp_alerts: - exp_labels: area: empowerment + cluster_id: test01 severity: page team: atlas topic: observability @@ -165,6 +168,99 @@ tests: exp_alerts: - exp_labels: area: empowerment + cluster_id: test01 + severity: none + team: atlas + topic: observability + inhibit_prometheus_agent_down: "true" + cancel_if_cluster_is_not_running_prometheus_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + description: "Prometheus agent is missing shards." + opsrecipe: "prometheus-agent/" + summary: "Prometheus agent is missing shards." 
+ - alertname: PrometheusAgentShardsMissing + eval_time: 130m + - alertname: PrometheusAgentShardsMissingInhibition + eval_time: 130m + # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric + - interval: 1m + input_series: + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + values: "10000+0x180" + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + values: "10000+0x180" + - series: 'prometheus_remote_storage_metadata_total{app="prometheus", cluster_id="test01", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' + values: "10000+0x180" + - series: 'prometheus_operator_spec_replicas{cluster_id="test01", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' + values: '3+0x60 5+0x60 3+0x60' + alert_rule_test: + - alertname: PrometheusAgentShardsMissing + eval_time: 40m + - alertname: PrometheusAgentShardsMissingInhibition + eval_time: 40m + - alertname: PrometheusAgentShardsMissing + eval_time: 100m + exp_alerts: + - exp_labels: + area: empowerment + cluster_id: test01 + severity: page + team: atlas + topic: observability + inhibit_prometheus_agent_down: "true" + cancel_if_cluster_is_not_running_prometheus_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + description: "Prometheus agent is missing shards." + opsrecipe: "prometheus-agent/" + summary: "Prometheus agent is missing shards." + - alertname: PrometheusAgentShardsMissingInhibition + eval_time: 100m + exp_alerts: + - exp_labels: + area: empowerment + cluster_id: test01 + severity: none + team: atlas + topic: observability + inhibit_prometheus_agent_down: "true" + cancel_if_cluster_is_not_running_prometheus_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + description: "Prometheus agent is missing shards." + opsrecipe: "prometheus-agent/" + summary: "Prometheus agent is missing shards." + - alertname: PrometheusAgentShardsMissing + eval_time: 125m + exp_alerts: + - exp_labels: + area: empowerment + cluster_id: test01 + severity: page + team: atlas + topic: observability + inhibit_prometheus_agent_down: "true" + cancel_if_cluster_is_not_running_prometheus_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + exp_annotations: + description: "Prometheus agent is missing shards." 
+ opsrecipe: "prometheus-agent/" + summary: "Prometheus agent is missing shards." + - alertname: PrometheusAgentShardsMissingInhibition + eval_time: 125m + exp_alerts: + - exp_labels: + area: empowerment + cluster_id: test01 severity: none team: atlas topic: observability diff --git a/test/tests/providers/global/promtail.rules.test.yml b/test/tests/providers/global/promtail.rules.test.yml index 0bf7ca3be..724a4b6a7 100644 --- a/test/tests/providers/global/promtail.rules.test.yml +++ b/test/tests/providers/global/promtail.rules.test.yml @@ -26,6 +26,7 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" + cluster_id: gauss severity: page team: atlas topic: observability @@ -41,6 +42,7 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" + cluster_id: gauss severity: page team: atlas topic: observability @@ -57,6 +59,7 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" + cluster_id: gauss severity: page team: atlas topic: observability @@ -87,6 +90,7 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" + cluster_id: gauss severity: page team: atlas topic: observability @@ -101,6 +105,7 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" + cluster_id: gauss severity: page team: atlas topic: observability diff --git a/test/tests/providers/global/sloth.rules.test.yml b/test/tests/providers/global/sloth.rules.test.yml index c3b3f518c..05915b9fb 100644 --- a/test/tests/providers/global/sloth.rules.test.yml +++ b/test/tests/providers/global/sloth.rules.test.yml @@ -18,6 +18,7 @@ tests: exp_alerts: - exp_labels: area: managedservices + cluster_id: gauss severity: page team: atlas topic: observability