From c919d0b20899a6cb33c53b486909e260fbab328f Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Tue, 5 Nov 2024 16:39:01 +0100 Subject: [PATCH 01/10] fix(deps): update module github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring to v0.78.1 (#1416) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- test/hack/checkLabels/go.mod | 2 +- test/hack/checkLabels/go.sum | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/hack/checkLabels/go.mod b/test/hack/checkLabels/go.mod index 24415136..af30ba78 100644 --- a/test/hack/checkLabels/go.mod +++ b/test/hack/checkLabels/go.mod @@ -7,7 +7,7 @@ toolchain go1.23.2 require ( // Try to keep version in sync with our prometheus rule CRD version. // see https://github.com/giantswarm/prometheus-operator-crd/blob/master/helm/prometheus-operator-crd/Chart.yaml#L11 - github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.0 + github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.1 sigs.k8s.io/yaml v1.4.0 ) diff --git a/test/hack/checkLabels/go.sum b/test/hack/checkLabels/go.sum index 9cca146d..ce45a0cf 100644 --- a/test/hack/checkLabels/go.sum +++ b/test/hack/checkLabels/go.sum @@ -557,6 +557,8 @@ github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.77.2 h github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.77.2/go.mod h1:D0KY8md81DQKdaR/cXwnhoWB3MYYyc/UjvqE8GFkIvA= github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.0 h1:b2L36QF60oB8Ty97UOCOnN2VnRbT6eaxzYda9kmk9zE= github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.0/go.mod h1:SvsRXw4m1F2vk7HquU5h475bFpke27mIUswfyw9u3ug= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.1 h1:Fm9Z+FabnB+6EoGq15j+pyLmaK6hYrYOpBlTzOLTQ+E= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.1/go.mod h1:SvsRXw4m1F2vk7HquU5h475bFpke27mIUswfyw9u3ug= github.com/prometheus/alertmanager v0.22.2 h1:JrDZalSEMb2/2bqGAhls6ZnvOxbC5jMIu29JV+uWTC0= github.com/prometheus/alertmanager v0.22.2/go.mod h1:rYinOWxFuCnNssc3iOjn2oMTlhLaPcUuqV5yk5JKUAE= github.com/prometheus/alertmanager v0.25.0 h1:vbXKUR6PYRiZPRIKfmXaG+dmCKG52RtPL4Btl8hQGvg= From 15ad23146226cba7dfeac8464ac5b89ce2a7f74f Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Thu, 7 Nov 2024 01:15:03 +0100 Subject: [PATCH 02/10] chore(deps): update dependency go to v1.23.3 (#1419) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- test/hack/checkLabels/go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/hack/checkLabels/go.mod b/test/hack/checkLabels/go.mod index af30ba78..d953a5d8 100644 --- a/test/hack/checkLabels/go.mod +++ b/test/hack/checkLabels/go.mod @@ -2,7 +2,7 @@ module checkLabels go 1.23 -toolchain go1.23.2 +toolchain go1.23.3 require ( // Try to keep version in sync with our prometheus rule CRD version. From 6b2b154d445bbd0bb36b3a9b3a8a4f666261e1c5 Mon Sep 17 00:00:00 2001 From: Lukasz Jakimczuk <39192420+ljakimczuk@users.noreply.github.com> Date: Thu, 7 Nov 2024 10:37:20 +0100 Subject: [PATCH 03/10] Update app.rules.yml (#1418) * Update app.rules.yml * Fix tests? 
* Changelog --- CHANGELOG.md | 1 + .../platform/honeybadger/alerting-rules/app.rules.yml | 3 +++ .../platform/honeybadger/alerting-rules/app.rules.test.yml | 1 + 3 files changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 10e04151..292dd978 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Fixes the statefulset.rules name as it is currently replacing the deployment.rules alerts. +- Extends AppCR-related alerts with cancelation for CAPI clusters with unavailable control plane. ## [4.22.0] - 2024-10-29 diff --git a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml index 33c535c1..fac50490 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml @@ -21,6 +21,7 @@ spec: for: 30m labels: area: platform + cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -55,6 +56,7 @@ spec: for: 30m labels: area: platform + cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" @@ -75,6 +77,7 @@ spec: for: 30m labels: area: platform + cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" diff --git a/test/tests/providers/global/platform/honeybadger/alerting-rules/app.rules.test.yml b/test/tests/providers/global/platform/honeybadger/alerting-rules/app.rules.test.yml index 0c97be96..86aeb005 100644 --- a/test/tests/providers/global/platform/honeybadger/alerting-rules/app.rules.test.yml +++ b/test/tests/providers/global/platform/honeybadger/alerting-rules/app.rules.test.yml @@ -17,6 +17,7 @@ tests: app: cilium app_version: 1.11.2 area: platform + cancel_if_cluster_control_plane_unhealthy: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" From 85a89298a162470c956fd9f0db8e2cbb8cd29520 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Thu, 7 Nov 2024 13:06:11 +0100 Subject: [PATCH 04/10] Alloy monitoring (#1410) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add sensible alerts for alloy * wip - add ongoing alerts * add dashboard annotation * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml * Update prometheus.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml * add missing tests * change based on ops-recipes * Update CHANGELOG.md * Update helm-operations.rules.yml * Update systemd.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml * Update helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml * Update 
helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml * Update test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml Co-authored-by: Hervé Nicol * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml Co-authored-by: Hervé Nicol * Update alloy.rules.yml Remove component_path * Update test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml * Update test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml * Update test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml * move and fix logging-agent-down alert tests * Update test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml Co-authored-by: Hervé Nicol * Update test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml Co-authored-by: Hervé Nicol --------- Co-authored-by: Hervé Nicol --- CHANGELOG.md | 15 ++ .../atlas/alerting-rules/alloy.rules.yml | 76 +++++++ .../deployment.management-cluster.rules.yml | 4 +- .../logging-pipeline.rules.yaml | 88 ++++++++ .../alerting-rules/prometheus-agent.rules.yml | 5 +- .../atlas/alerting-rules/promtail.rules.yml | 5 +- .../atlas/alerting-rules/alloy.rules.test.yml | 196 ++++++++++++++++++ .../logging-pipeline.rules.test.yml | 111 ++++++++++ 8 files changed, 493 insertions(+), 7 deletions(-) create mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml create mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml create mode 100644 test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml create mode 100644 test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 292dd978..90ce1614 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Add a set of sensible alerts to monitor alloy. + - `AlloySlowComponentEvaluations` and `AlloyUnhealthyComponents` to report about alloy component state. + - `LoggingAgentDown` to be alerted when the logging agent is down. + - `LogForwardingErrors` to be alerted when the `loki.write` component is failing. + - `LogReceivingErrors` to be alerted when the `loki.source.api` components of the gateway is failing. + +### Changed + +- Update `DeploymentNotSatisfiedAtlas` to take into account the following components: + - `observability-operator` + - `alloy-rules` + - `observability-gateway` + ## [4.23.0] - 2024-10-30 ### Changed diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml new file mode 100644 index 00000000..086622fe --- /dev/null +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -0,0 +1,76 @@ +# This files describe common alloy alerting rules +# For alerts regarding monitoring and logging agents, please go to the respective files (logging.rules.yml and monitoring.rules.yml). +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + {{- include "labels.common" . 
| nindent 4 }} + name: alloy.rules + namespace: {{ .Values.namespace }} +spec: + groups: + # List of alerts on the state of the alloy components. + # Alerts are coming from https://github.com/grafana/alloy/blob/ed52746567d2469a6a97a592ac5aec807646b327/operations/alloy-mixin/alerts/controller.libsonnet + # We added the aggregations and our internal labels. + - name: alloy.controller + rules: + - alert: AlloySlowComponentEvaluations + annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller + description: '{{`Component evaluations are taking too long under job {{ $labels.job }}, component_id {{ $labels.component_id }}.`}}' + opsrecipe: alloy/ + summary: Component evaluations are taking too long. + expr: sum by (cluster_id, installation, provider, pipeline, namespace, job, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 + for: 15m + labels: + area: platform + severity: notify + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + - alert: AlloyUnhealthyComponents + annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller + description: '{{`Unhealthy components detected under job {{ $labels.job }}`}}' + opsrecipe: alloy/ + summary: Unhealthy components detected. + expr: sum by (cluster_id, installation, provider, pipeline, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 + for: 15m + labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + - name: logging-agent + rules: + # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) + # and join the pods with the not running containers + - alert: LoggingAgentDown + annotations: + dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview + description: '{{`Scraping of all logging-agent pods to check if one failed every 30 minutes.`}}' + opsrecipe: alloy/ + expr: |- + kube_pod_info{pod=~"alloy-logs.*"} + * on(cluster_id, pod) + group_left () + up{job="alloy-logs", container="alloy"} == 0 + for: 30m + labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml index 6d62a35b..1f98fe45 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml @@ -17,7 +17,7 @@ spec: annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' opsrecipe: deployment-not-satisfied/ - expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", 
deployment=~"alertmanager.*|grafana.*|prometheus.*|promxy.*|mimir.*|loki.*|tempo.*|pyroscope.*|object-storage.*|logging-operator.*|silence-operator.*|sloth.*"} > 0 + expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", deployment=~"alloy-rules.*|alertmanager.*|grafana.*|logging-operator.*|loki.*|mimir.*|oauth2-proxy.*|object-storage.*|observability-gateway.*|observability-operator.*|prometheus.*|promxy.*|tempo.*|pyroscope.*|silence-operator.*|sloth.*"} > 0 for: 30m labels: area: platform @@ -95,7 +95,7 @@ spec: team: phoenix topic: managementcluster {{- if eq .Values.managementCluster.provider.flavor "vintage" }} - ## TODO Remove when all vintage clusters are gone + ## TODO(@giantswarm/team-atlas) Remove when all vintage clusters are gone - alert: AWSManagementClusterDeploymentScaledDownToZero annotations: description: '{{`Deployment {{ $labels.namespace}}/{{ $labels.deployment }} on AWS has been scaled down to zero for prolonged period of time.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml new file mode 100644 index 00000000..c45f70f4 --- /dev/null +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yaml @@ -0,0 +1,88 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + {{- include "labels.common" . | nindent 4 }} + name: logging-pipeline.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: logging-pipeline + rules: + # Any alloy component that uses the loki.write component can throw such errors. + # This includes alloy-logs and the observability-gateway + - alert: LogForwardingErrors + annotations: + dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview + description: '{{`More that 10% of the requests to Loki are failing.`}}' + opsrecipe: logging-pipeline/ + expr: |- + ( + 100 + * + ( + ( + sum by (cluster_id, installation, provider, pipeline, namespace, job, instance) ( + rate ( + loki_write_request_duration_seconds_count{status_code!~"2.."}[5m:] + ) + ) + ) + / + ( + sum by (cluster_id, installation, provider, pipeline, namespace, job, instance) ( + rate ( + loki_write_request_duration_seconds_count[5m:] + ) + ) + ) + ) + ) + > 10 + for: 15m + labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + # This alert pages when the loki source api component of the observability gateway is throwing errors + - alert: LogReceivingErrors + annotations: + dashboard: 53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview + description: '{{`More that 10% of the loki requests to the observability gateway are failing.`}}' + opsrecipe: logging-pipeline/ + expr: |- + ( + 100 + * + ( + ( + sum by (cluster_id, installation, provider, pipeline, namespace, job, instance, route) ( + rate ( + loki_source_api_request_duration_seconds_count{route=~"(loki_)?api_v1_push", status_code!~"2.."}[5m:] + ) + ) + ) + / + ( + sum by (cluster_id, installation, provider, pipeline, namespace, job, instance, route) ( + rate ( + loki_source_api_request_duration_seconds_count{route=~"(loki_)?api_v1_push"}[5m:] + ) + ) + ) + ) + ) + > 10 + for: 15m + labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + 
cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml index af4c7d43..b0c8e218 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml @@ -1,7 +1,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} name: prometheus-agent.rules @@ -10,7 +9,7 @@ spec: groups: - name: prometheus-agent rules: - ## Page Atlas if prometheus agent fails to send samples to MC prometheus. + ## This alert pages if prometheus-agent fails to send samples to its remote write endpoint. - alert: PrometheusAgentFailing annotations: description: '{{`Prometheus agent remote write is failing.`}}' @@ -93,7 +92,7 @@ spec: cancel_if_cluster_is_not_running_monitoring_agent: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" - ## Page Atlas if prometheus agent is missing shards to send samples to MC prometheus. + ## This alert pages if one of the prometheus-agent shard is not running. - alert: PrometheusAgentShardsMissing annotations: description: '{{`Prometheus agent is missing shards.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml index f48d135a..422a9c9b 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/promtail.rules.yml @@ -9,16 +9,17 @@ spec: groups: - name: promtail rules: + # This alert lists the existing promtail pods (to extract the node label and inhibit if the node is not ready) + # and join the pods with the not running containers - alert: PromtailDown annotations: description: '{{`Scraping of all promtail pods to check if one failed every 30 minutes.`}}' opsrecipe: promtail/ expr: |- - # List promtail pods to be able to get the node label and join with the node status to not alert if the node is not ready kube_pod_info{pod=~"promtail.*"} * on(cluster_id, pod) group_left () - up{container="promtail"} == 0 # List promtail containers that are not running + up{container="promtail"} == 0 for: 30m labels: area: platform diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml new file mode 100644 index 00000000..5ae0ba2f --- /dev/null +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -0,0 +1,196 @@ +--- +rule_files: + - alloy.rules.yml + +tests: + # Test AlloySlowComponentEvaluations + - interval: 1m + input_series: + - series: 'alloy_component_evaluation_slow_seconds{cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", namespace="default", job="alloy-controller", component_id="comp1"}' + values: "0+0x10 0+1x50 0x50" + alert_rule_test: + - alertname: AlloySlowComponentEvaluations + eval_time: 10m + - alertname: AlloySlowComponentEvaluations + eval_time: 50m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + 
cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + namespace: default + job: alloy-controller + component_id: comp1 + severity: notify + team: atlas + topic: observability + exp_annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller + description: "Component evaluations are taking too long under job alloy-controller, component_id comp1." + opsrecipe: "alloy/" + summary: "Component evaluations are taking too long." + - alertname: AlloySlowComponentEvaluations + eval_time: 80m + + # Test AlloyUnhealthyComponents + - interval: 1m + input_series: + - series: 'alloy_component_controller_running_components{health_type="unhealthy", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", namespace="default", job="alloy-controller"}' + values: "0+0x10 1+0x50 0x50" + alert_rule_test: + - alertname: AlloyUnhealthyComponents + eval_time: 10m + - alertname: AlloyUnhealthyComponents + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + namespace: default + job: alloy-controller + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: bf9f456aad7108b2c808dbd9973e386f/alloy-controller + description: "Unhealthy components detected under job alloy-controller" + opsrecipe: "alloy/" + summary: "Unhealthy components detected." + - alertname: AlloyUnhealthyComponents + eval_time: 80m + + # Test LoggingAgentDown + - interval: 1m + input_series: + # For the first 80min: test with 1 pod: none, up, down + - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-1xxxx", provider="aws", pipeline="testing"}' + values: "_x20 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-1xxxx", node="ip-10-0-5-1.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + # From 80min: test with 2 pods: 1 up and 1 down, 2 up, 2 down. 
+ - series: 'up{container="alloy", cluster_id="gauss", cluster_type="management_cluster", installation="gauss", job="alloy-logs", pod="alloy-logs-2xxxx", provider="aws", pipeline="testing"}' + values: "_x80 1+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-2xxxx", node="ip-10-0-5-2.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + - series: 'up{container="alloy", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", job="alloy-logs", pod="alloy-logs-3xxxx", provider="aws", pipeline="testing"}' + values: "_x80 0+0x40 1+0x20 0+0x40" + - series: kube_pod_info{cluster_id="gauss", cluster_type="management_cluster", installation="gauss", pod="alloy-logs-3xxxx", node="ip-10-0-5-3.eu-west-1.compute.internal", provider="aws", pipeline="testing"} + values: "1x180" + alert_rule_test: + - alertname: LoggingAgentDown + eval_time: 10m + - alertname: LoggingAgentDown + eval_time: 30m + - alertname: LoggingAgentDown + eval_time: 71m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-1.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-1xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" + # Tests with 2 pods + - alertname: LoggingAgentDown + eval_time: 111m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" + - alertname: LoggingAgentDown + eval_time: 121m + - alertname: LoggingAgentDown + eval_time: 180m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-2.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-2xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." 
+ opsrecipe: "alloy/" + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_node_unschedulable: "true" + cancel_if_node_not_ready: "true" + cluster_id: gauss + cluster_type: management_cluster + installation: gauss + node: ip-10-0-5-3.eu-west-1.compute.internal + pipeline: testing + pod: alloy-logs-3xxxx + provider: aws + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" + description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." + opsrecipe: "alloy/" diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml new file mode 100644 index 00000000..fccbfa5a --- /dev/null +++ b/test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml @@ -0,0 +1,111 @@ +--- +rule_files: + - logging-pipeline.rules.yml + +tests: + # Test LogForwardingErrors + - interval: 1m + input_series: + # Tests with multiple cases: no metrics, no requests, only status_code 204 ones, 204 ones and 500 that are less than 10% of the the total, 500 request that represent more than 10% of the total, only 500 ones + - series: 'loki_write_request_duration_seconds_count{status_code="500", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+0x60 0+50x60 3000+100x60 9000+600x60" + - series: 'loki_write_request_duration_seconds_count{status_code="204", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+600x60 36000+600x60 72000+600x60 108000+0x60" + alert_rule_test: + - alertname: LogForwardingErrors + eval_time: 30m + - alertname: LogForwardingErrors + eval_time: 90m + - alertname: LogForwardingErrors + eval_time: 150m + - alertname: LogForwardingErrors + eval_time: 210m + - alertname: LogForwardingErrors + eval_time: 270m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the requests to Loki are failing." + opsrecipe: "logging-pipeline/" + - alertname: LogForwardingErrors + eval_time: 330m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the requests to Loki are failing." 
+ opsrecipe: "logging-pipeline/" + # Test LogReceivingErrors + - interval: 1m + input_series: + # Tests with multiple cases: no metrics, no requests, only status_code 204 ones, 204 ones and 500 that are less than 10% of the the total, 500 request that represent more than 10% of the total, only 500 ones + - series: 'loki_source_api_request_duration_seconds_count{status_code="500", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", route="api_v1_push", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+0x60 0+50x60 3000+100x60 9000+600x60" + - series: 'loki_source_api_request_duration_seconds_count{status_code="204", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", route="api_v1_push", pod="alloy-2j7z7"}' + values: "_x60 0+0x60 0+600x60 36000+600x60 72000+600x60 108000+0x60" + alert_rule_test: + - alertname: LogReceivingErrors + eval_time: 30m + - alertname: LogReceivingErrors + eval_time: 90m + - alertname: LogReceivingErrors + eval_time: 150m + - alertname: LogReceivingErrors + eval_time: 210m + - alertname: LogReceivingErrors + eval_time: 270m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the loki requests to the observability gateway are failing." + opsrecipe: "logging-pipeline/" + - alertname: LogReceivingErrors + eval_time: 330m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "More that 10% of the loki requests to the observability gateway are failing." 
+ opsrecipe: "logging-pipeline/" From 076a26ff74c242f045cbc959844f85f0199b11cc Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Thu, 7 Nov 2024 13:12:36 +0100 Subject: [PATCH 05/10] Do some little clean up on grafana cloud and alloy alerts before adding the alloy-metrics related alerts (#1415) * add sensible alerts for alloy * wip - add ongoing alerts * add dashboard annotation * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml * Update prometheus.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml * add missing tests * change based on ops-recipes * Clean up some rules a bit * Update CHANGELOG.md * Update helm-operations.rules.yml * Update systemd.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml * Update helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml * Update helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml * Update test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml --- CHANGELOG.md | 2 ++ .../atlas/alerting-rules/alloy.rules.yml | 19 +++++++++++- ...rter.rules.yml => grafana-cloud.rules.yml} | 30 +++++++++++++++--- .../atlas/alerting-rules/grafana.rules.yml | 4 +-- .../kube-state-metrics.rules.yml | 1 - .../atlas/alerting-rules/mimir.rules.yml | 15 --------- .../atlas/alerting-rules/prometheus.rules.yml | 18 ----------- ....rules.test.yml => grafana-cloud.test.yml} | 2 +- .../atlas/alerting-rules/mimir.rules.test.yml | 29 ----------------- .../atlas/alerting-rules/alloy.rules.test.yml | 31 +++++++++++++++++++ 10 files changed, 80 insertions(+), 71 deletions(-) rename helm/prometheus-rules/templates/platform/atlas/alerting-rules/{mimir-to-grafana-cloud-exporter.rules.yml => grafana-cloud.rules.yml} (74%) rename test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/{mimir-to-grafana-cloud-exporter.rules.test.yml => grafana-cloud.test.yml} (99%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 90ce1614..1039fd1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `observability-operator` - `alloy-rules` - `observability-gateway` +- Move all `grafana-cloud` related alerts to their own file. +- Move all alloy related alerts to the alloy alert file. ## [4.23.0] - 2024-10-30 diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index 086622fe..e2d8345f 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -48,7 +48,24 @@ spec: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" - - name: logging-agent + - name: alloy.rules + rules: + - alert: AlloyForPrometheusRulesDown + annotations: + description: 'Alloy sending PrometheusRules to Loki and Mimir ruler is down.' 
+ opsrecipe: prometheus-rules/ + expr: count(up{job="alloy-rules", namespace="monitoring"} == 0) by (cluster_id, installation, provider, pipeline) > 0 + for: 1h + labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + - name: alloy.logs rules: # This alert lists the existing logging-agent pods (to extract the node label and inhibit if the node is not ready) # and join the pods with the not running containers diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml similarity index 74% rename from helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml rename to helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml index 40d76d3d..9560570e 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml @@ -1,13 +1,35 @@ -{{- if .Values.mimir.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} - name: mimir-to-grafana-cloud-exporter.rules - namespace: {{ .Values.namespace }} + {{- if not .Values.mimir.enabled }} + cluster_type: "management_cluster" + {{- end }} + name: grafana-cloud.rules + namespace: {{ .Values.namespace }} spec: groups: + - name: grafana-cloud + rules: + ## Pages Atlas when prometheus fails to send samples to cortex + - alert: PrometheusMissingGrafanaCloud + annotations: + description: 'Prometheus is not sending data to Grafana Cloud.' + opsrecipe: prometheus-grafanacloud/ + {{- if .Values.mimir.enabled }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) + {{- else }} + expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) + {{- end }} + for: 1h + labels: + area: platform + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + {{- if .Values.mimir.enabled }} - name: mimir-to-grafana-cloud-exporter rules: - alert: MimirToGrafanaCloudExporterDown @@ -73,4 +95,4 @@ spec: severity: page team: atlas topic: observability -{{- end }} + {{- end }} diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml index 39fb4a0a..97a10780 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml @@ -3,9 +3,9 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if not .Values.mimir.enabled }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: grafana.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml index 6c90a4e2..83089fc3 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml @@ -85,7 +85,6 @@ spec: severity: page team: atlas topic: observability - - alert: KubeConfigMapCreatedMetricMissing annotations: description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index cd47324a..6dc13788 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -61,21 +61,6 @@ spec: severity: page team: atlas topic: observability - - alert: AlloyForPrometheusRulesDown - annotations: - description: 'Alloy sending PrometheusRules to Mimir ruler is down.' - opsrecipe: prometheus-rules/ - expr: count(up{job="alloy-rules", namespace="mimir"} == 0) by (cluster_id, installation, provider, pipeline) > 0 - for: 1h - labels: - area: platform - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: MimirRulerEventsFailed annotations: dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml index b31713f9..a0bd48fe 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml @@ -1,7 +1,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} name: prometheus.rules @@ -27,23 +26,6 @@ spec: severity: page team: atlas topic: observability - ## Pages Atlas when prometheus fails to send samples to cortex - - alert: PrometheusMissingGrafanaCloud - annotations: - description: 'Prometheus is not sending data to Grafana Cloud.' 
- opsrecipe: prometheus-grafanacloud/ - {{- if .Values.mimir.enabled }} - expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) - {{- else }} - expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) - {{- end }} - for: 1h - labels: - area: platform - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: PrometheusFailsToCommunicateWithRemoteStorageAPI annotations: description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml similarity index 99% rename from test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml rename to test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml index ee5645cf..79c5aa0f 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir-to-grafana-cloud-exporter.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml @@ -1,6 +1,6 @@ --- rule_files: -- mimir-to-grafana-cloud-exporter.rules.yml +- grafana-cloud.rules.yml tests: # Tests for `MimirToGrafanaCloudExporterDown` alert diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml index 37d40af1..6bdfeaea 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -86,35 +86,6 @@ tests: dashboard: ffcd83628d7d4b5a03d1cafd159e6c9c/mimir-overview description: "Mimir component : mimir-ingester is down." opsrecipe: "mimir/" - - interval: 1m - input_series: - # test with 1 pod: none, up, down - - series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="mimir"}' - values: "_x20 1+0x70 0+0x70" - alert_rule_test: - - alertname: AlloyForPrometheusRulesDown - eval_time: 10m - - alertname: AlloyForPrometheusRulesDown - eval_time: 80m - - alertname: AlloyForPrometheusRulesDown - eval_time: 160m - exp_alerts: - - exp_labels: - area: platform - cancel_if_outside_working_hours: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cluster_id: golem - installation: golem - provider: capa - pipeline: testing - severity: page - team: atlas - topic: observability - exp_annotations: - description: "Alloy sending PrometheusRules to Mimir ruler is down." 
- opsrecipe: "prometheus-rules/" - interval: 1m input_series: # test: none, rate > 0, rate = 0 diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 5ae0ba2f..2764cba7 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -72,6 +72,37 @@ tests: - alertname: AlloyUnhealthyComponents eval_time: 80m + # Test AlloyForPrometheusRulesDown + - interval: 1m + input_series: + # test with 1 pod: none, up, down + - series: 'up{job="alloy-rules", cluster_type="management_cluster", cluster_id="golem", provider="capa", pipeline="testing", installation="golem", namespace="monitoring"}' + values: "_x20 1+0x70 0+0x70" + alert_rule_test: + - alertname: AlloyForPrometheusRulesDown + eval_time: 10m + - alertname: AlloyForPrometheusRulesDown + eval_time: 80m + - alertname: AlloyForPrometheusRulesDown + eval_time: 160m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + installation: golem + provider: capa + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Alloy sending PrometheusRules to Loki and Mimir ruler is down." + opsrecipe: "prometheus-rules/" + # Test LoggingAgentDown - interval: 1m input_series: From 4abf0598053b19e63ad6f7abb404aacec6b61edf Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 12 Nov 2024 08:28:34 +0100 Subject: [PATCH 06/10] add alerts for alloy-metrics (#1417) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add sensible alerts for alloy * wip - add ongoing alerts * add dashboard annotation * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml * Update prometheus.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging.rules.yaml * add missing tests * change based on ops-recipes * Clean up some rules a bit * Update CHANGELOG.md * Update helm-operations.rules.yml * Update systemd.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml * Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-operator.rules.yml * Update helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml * Update helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml * Update test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml * add alerts for alloy-metrics * improve monitoring agent down tests * improve monitoring agent shards not satisfied tests * Update test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml Co-authored-by: Hervé Nicol * Update test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml Co-authored-by: Hervé Nicol --------- Co-authored-by: Hervé Nicol --- CHANGELOG.md | 6 + .../atlas/alerting-rules/alloy.rules.yml | 102 ++++++++- .../monitoring-pipeline.rules.yml | 80 +++++++ 
.../atlas/alerting-rules/prometheus.rules.yml | 59 +----- .../atlas/alerting-rules/alloy.rules.test.yml | 195 ++++++++++++++++++ ...yml => monitoring-pipeline.rules.test.yml} | 60 ++++-- 6 files changed, 426 insertions(+), 76 deletions(-) create mode 100644 helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml rename test/tests/providers/global/platform/atlas/alerting-rules/{prometheus.rules.test.yml => monitoring-pipeline.rules.test.yml} (58%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1039fd1e..b2ccb835 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `LoggingAgentDown` to be alerted when the logging agent is down. - `LogForwardingErrors` to be alerted when the `loki.write` component is failing. - `LogReceivingErrors` to be alerted when the `loki.source.api` components of the gateway is failing. + - `MonitoringAgentDown` to be alerted when the monitoring agent is down. + - `MonitoringAgentShardsNotSatisfied` to be alerted when the monitoring agent is missing any number of desired shards. ### Changed @@ -23,6 +25,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `observability-gateway` - Move all `grafana-cloud` related alerts to their own file. - Move all alloy related alerts to the alloy alert file. +- Rename and move the following alerts as they are not specific to Prometheus: + - `PrometheusCriticalJobScrapingFailure` => `CriticalJobScrapingFailure` + - `PrometheusJobScrapingFailure` => `JobScrapingFailure` + - `PrometheusFailsToCommunicateWithRemoteStorageAPI` => `MetricForwardingErrors` ## [4.23.0] - 2024-10-30 diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index e2d8345f..fc364f28 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -1,5 +1,5 @@ # This files describe common alloy alerting rules -# For alerts regarding monitoring and logging agents, please go to the respective files (logging.rules.yml and monitoring.rules.yml). +# For alerts regarding the monitoring pipeline and the logging pipeline, please go to the respective files (logging-pipeline.rules.yml and monitoring-pipeline.rules.yml). apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -91,3 +91,103 @@ spec: cancel_if_cluster_status_updating: "true" cancel_if_node_unschedulable: "true" cancel_if_node_not_ready: "true" + - name: alloy.metrics + rules: + # This alert pages if monitoring-agent fails to send samples to its remote write endpoint. + - alert: MonitoringAgentDown + annotations: + description: '{{`Monitoring agent fails to send samples.`}}' + summary: Monitoring agent fails to send samples to remote write endpoint. 
+ opsrecipe: alloy/#monitoring-agent-down + dashboard: promRW001/prometheus-remote-write + expr: |- + count( + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) by (cluster_id, installation, pipeline, provider) > 0 + unless on (cluster_id) ( + count(up{job="alloy-metrics"} > 0) by (cluster_id) + ) + for: 20m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_has_no_workers: "true" + ## Same as MonitoringAgentDown, but triggers inhibition earlier and does not page. + - alert: InhibitionMonitoringAgentDown + annotations: + description: '{{`Monitoring agent fails to send samples.`}}' + summary: Monitoring agent fails to send samples to remote write endpoint. + opsrecipe: alloy/#monitoring-agent-down + dashboard: promRW001/prometheus-remote-write + expr: |- + count( + label_replace( + capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, + "cluster_id", + "$1", + "name", + "(.*)" + ) == 1 + ) by (cluster_id, installation, pipeline, provider) > 0 + unless on (cluster_id) ( + count(up{job="alloy-metrics"} > 0) by (cluster_id) + ) + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + ## This alert pages if any of the monitoring-agent shard is not running. + - alert: MonitoringAgentShardsNotSatisfied + annotations: + description: '{{`At least one of the monitoring agent shard is missing.`}}' + summary: Monitoring agent is missing shards. + opsrecipe: alloy/#monitoring-agent-down + expr: |- + kube_statefulset_status_replicas{statefulset="alloy-metrics"} + - kube_statefulset_status_replicas_ready{statefulset="alloy-metrics"} + > 0 + for: 40m + labels: + area: platform + severity: page + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + ## Same as MonitoringAgentShardsNotSatisfied but triggers inhibition earlier, and does not page. + - alert: InhibitionMonitoringAgentShardsNotSatisfied + annotations: + description: '{{`At least one of the monitoring agent shard is missing.`}}' + summary: Monitoring agent is missing shards. + opsrecipe: alloy/#monitoring-agent-down + expr: |- + kube_statefulset_status_replicas{statefulset="alloy-metrics"} + - kube_statefulset_status_replicas_ready{statefulset="alloy-metrics"} + > 0 + for: 2m + labels: + area: platform + severity: none + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml new file mode 100644 index 00000000..e666ea27 --- /dev/null +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml @@ -0,0 +1,80 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + {{- include "labels.common" . 
| nindent 4 }} + name: monitoring-pipeline.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: monitoring-pipeline + rules: + - alert: MetricForwardingErrors + annotations: + description: '{{`Monitoring agent can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' + opsrecipe: monitoring-pipeline/ + dashboard: promRW001/prometheus-remote-write + expr: |- + rate(prometheus_remote_storage_samples_failed_total[10m]) > 0.1 + or rate(prometheus_remote_storage_samples_total[10m]) == 0 + or rate(prometheus_remote_storage_metadata_retried_total[10m]) > 0 + for: 1h + labels: + area: platform + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability + - alert: JobScrapingFailure + annotations: + dashboard: servicemonitors-details/servicemonitors-details + description: '{{`Monitoring agents for cluster {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}}' + summary: Monitoring agent failed to scrape all targets in a job. + opsrecipe: monitoring-job-scraping-failure/ + expr: |- + ( + count(up == 0) by (job, installation, cluster_id, provider, pipeline) + / + count(up) by (job, installation, cluster_id, provider, pipeline) + ) >= 1 + for: 1d + labels: + area: platform + severity: notify + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + - alert: CriticalJobScrapingFailure + annotations: + dashboard: servicemonitors-details/servicemonitors-details + description: '{{`Monitoring agents for cluster {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}}' + summary: Monitoring agent failed to scrape all targets in a job. + opsrecipe: monitoring-job-scraping-failure/ + ## We ignore bastion hosts node exporters + expr: |- + ( + count( + ( + up{job=~".*(apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics).*"} + or + up{job="kubelet", metrics_path="/metrics"} + ) == 0 + ) by (job, installation, cluster_id, provider, pipeline) + / + count( + up{job=~".*(apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics).*"} + or + up{job="kubelet", metrics_path="/metrics"} + ) by (job, installation, cluster_id, provider, pipeline) + ) >= 1 + for: 3d + labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + cancel_if_cluster_is_not_running_monitoring_agent: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml index a0bd48fe..7b48759a 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus.rules.yml @@ -1,3 +1,4 @@ +# TODO(@giantswarm/team-atlas): revisit once vintage is gone apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -26,19 +27,6 @@ spec: severity: page team: atlas topic: observability - - alert: PrometheusFailsToCommunicateWithRemoteStorageAPI - annotations: - description: '{{`Prometheus can''t communicate with Remote Storage API at {{ $labels.url }}.`}}' - opsrecipe: prometheus-cant-communicate-with-remote-storage-api/ - dashboard: promRW001/prometheus-remote-write - expr: rate(prometheus_remote_storage_samples_failed_total[10m]) > 
0.1 or rate(prometheus_remote_storage_samples_total[10m]) == 0 or rate(prometheus_remote_storage_metadata_retried_total[10m]) > 0 - for: 1h - labels: - area: platform - cancel_if_outside_working_hours: "true" - severity: page - team: atlas - topic: observability - alert: PrometheusRuleFailures annotations: description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to evaluate rule(s) {{ printf "%.2f" $value }} time(s).`}} @@ -52,48 +40,3 @@ spec: team: atlas topic: observability cancel_if_outside_working_hours: "true" - - alert: PrometheusJobScrapingFailure - annotations: - description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}} - summary: Prometheus fails to scrape all targets in a job. - opsrecipe: prometheus-job-scraping-failure/ - expr: (count(up == 0) BY (job, installation, cluster_id, provider, pipeline) / count(up) BY (job, installation, cluster_id, provider, pipeline)) == 1 - for: 1d - labels: - area: platform - severity: notify - team: atlas - topic: observability - cancel_if_outside_working_hours: "true" - - alert: PrometheusCriticalJobScrapingFailure - annotations: - description: {{`Prometheus {{$labels.installation}}/{{$labels.cluster_id}} has failed to scrape all targets in {{$labels.job}} job.`}} - summary: Prometheus fails to scrape all targets in a job. - opsrecipe: prometheus-job-scraping-failure/ - ## We ignore bastion hosts node exporters - expr: |- - ( - count( - ( - up{job=~"apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics"} - or - up{job="kubelet", metrics_path="/metrics"} - ) == 0 - ) BY (job, installation, cluster_id, provider, pipeline) - / - count( - up{job=~"apiserver|kube-controller-manager|kube-scheduler|node-exporter|kube-state-metrics"} - or - up{job="kubelet", metrics_path="/metrics"} - ) BY (job, installation, cluster_id, provider, pipeline) - ) == 1 - for: 3d - labels: - area: platform - severity: page - team: atlas - topic: observability - cancel_if_outside_working_hours: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml index 2764cba7..98549b42 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/alloy.rules.test.yml @@ -225,3 +225,198 @@ tests: dashboard: "53c1ecddc3a1d5d4b8d6cd0c23676c31/alloy-logs-overview" description: "Scraping of all logging-agent pods to check if one failed every 30 minutes." 
opsrecipe: "alloy/" + + # Test MonitoringAgentDown + - interval: 1m + input_series: + - series: 'up{job="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "_x40 1+0x50 0+0x70" + - series: 'capi_cluster_status_condition{type="ControlPlaneReady", status="True", name="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "1x150" + alert_rule_test: + - alertname: MonitoringAgentDown + eval_time: 10m + - alertname: InhibitionMonitoringAgentDown + eval_time: 10m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." + - alertname: MonitoringAgentDown + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." + - alertname: InhibitionMonitoringAgentDown + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." + - alertname: MonitoringAgentDown + eval_time: 80m + - alertname: InhibitionMonitoringAgentDown + eval_time: 80m + - alertname: MonitoringAgentDown + eval_time: 140m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_has_no_workers: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." 
+ - alertname: InhibitionMonitoringAgentDown + eval_time: 140m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + inhibit_monitoring_agent_down: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + team: atlas + topic: observability + exp_annotations: + description: "Monitoring agent fails to send samples." + opsrecipe: "alloy/#monitoring-agent-down" + dashboard: "promRW001/prometheus-remote-write" + summary: "Monitoring agent fails to send samples to remote write endpoint." + + # Test MonitoringAgentShardsNotSatisfied + - interval: 1m + input_series: + - series: 'kube_statefulset_status_replicas{statefulset="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "3+0x10 3+0x90 3+0x50" + - series: 'kube_statefulset_status_replicas_ready{statefulset="alloy-metrics", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing"}' + values: "3+0x10 2+0x90 3+0x50" + alert_rule_test: + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 10m + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 30m + - alertname: InhibitionMonitoringAgentShardsNotSatisfied + eval_time: 30m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + statefulset: alloy-metrics + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + exp_annotations: + description: "At least one of the monitoring agent shard is missing." + summary: "Monitoring agent is missing shards." + opsrecipe: "alloy/#monitoring-agent-down" + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 60m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: page + statefulset: alloy-metrics + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + exp_annotations: + description: "At least one of the monitoring agent shard is missing." + summary: "Monitoring agent is missing shards." + opsrecipe: "alloy/#monitoring-agent-down" + - alertname: InhibitionMonitoringAgentShardsNotSatisfied + eval_time: 60m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + severity: none + statefulset: alloy-metrics + team: atlas + topic: observability + inhibit_monitoring_agent_down: "true" + exp_annotations: + description: "At least one of the monitoring agent shard is missing." + summary: "Monitoring agent is missing shards." 
+ opsrecipe: "alloy/#monitoring-agent-down" + - alertname: MonitoringAgentShardsNotSatisfied + eval_time: 130m + - alertname: InhibitionMonitoringAgentShardsNotSatisfied + eval_time: 130m diff --git a/test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml b/test/tests/providers/global/platform/atlas/alerting-rules/monitoring-pipeline.rules.test.yml similarity index 58% rename from test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml rename to test/tests/providers/global/platform/atlas/alerting-rules/monitoring-pipeline.rules.test.yml index 77cdd216..ad97acbb 100644 --- a/test/tests/providers/global/platform/atlas/alerting-rules/prometheus.rules.test.yml +++ b/test/tests/providers/global/platform/atlas/alerting-rules/monitoring-pipeline.rules.test.yml @@ -1,13 +1,13 @@ --- rule_files: - - prometheus.rules.yml + - monitoring-pipeline.rules.yml # Setting evaluation interval to 1h # to make it faster on long test duration. evaluation_interval: 1h tests: - # Test PrometheusJobScrapingFailure and PrometheusCriticalJobScrapingFailure + # Test JobScrapingFailure and CriticalJobScrapingFailure - interval: 1h input_series: - series: 'up{job="apiserver", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' @@ -30,14 +30,14 @@ tests: - series: 'up{job="app-exporter", installation="gauss", cluster_id="gauss", provider="aws", pipeline="testing"}' values: "1+0x120 0+0x120" alert_rule_test: - - alertname: PrometheusCriticalJobScrapingFailure + - alertname: CriticalJobScrapingFailure eval_time: 30m - - alertname: PrometheusJobScrapingFailure + - alertname: JobScrapingFailure eval_time: 1d - - alertname: PrometheusCriticalJobScrapingFailure + - alertname: CriticalJobScrapingFailure eval_time: 4d # This alert fires for both critical and non-critical targets - - alertname: PrometheusJobScrapingFailure + - alertname: JobScrapingFailure eval_time: 7d exp_alerts: - exp_labels: @@ -52,9 +52,10 @@ tests: pipeline: "testing" job: "kube-controller-manager" exp_annotations: - opsrecipe: "prometheus-job-scraping-failure/" - summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in kube-controller-manager job." + dashboard: servicemonitors-details/servicemonitors-details + opsrecipe: "monitoring-job-scraping-failure/" + summary: "Monitoring agent failed to scrape all targets in a job." + description: "Monitoring agents for cluster gauss/gauss has failed to scrape all targets in kube-controller-manager job." - exp_labels: area: platform severity: notify @@ -67,12 +68,13 @@ tests: pipeline: "testing" job: "app-exporter" exp_annotations: - opsrecipe: "prometheus-job-scraping-failure/" - summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in app-exporter job." - + dashboard: servicemonitors-details/servicemonitors-details + opsrecipe: "monitoring-job-scraping-failure/" + summary: "Monitoring agent failed to scrape all targets in a job." + description: "Monitoring agents for cluster gauss/gauss has failed to scrape all targets in app-exporter job." + # This fires only for critical target down. 
- - alertname: PrometheusCriticalJobScrapingFailure + - alertname: CriticalJobScrapingFailure eval_time: 9d exp_alerts: - exp_labels: @@ -90,6 +92,30 @@ tests: cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" exp_annotations: - opsrecipe: "prometheus-job-scraping-failure/" - summary: "Prometheus fails to scrape all targets in a job." - description: "Prometheus gauss/gauss has failed to scrape all targets in kube-controller-manager job." + dashboard: servicemonitors-details/servicemonitors-details + opsrecipe: "monitoring-job-scraping-failure/" + summary: "Monitoring agent failed to scrape all targets in a job." + description: "Monitoring agents for cluster gauss/gauss has failed to scrape all targets in kube-controller-manager job." + + + # Test MetricForwardingErrors + - interval: 1m + input_series: + # remote write has no failure for 1 hour and then fails for 2 hours + - series: 'prometheus_remote_storage_samples_failed_total{url="http://remote-storage_samples_failed_total"}' + values: "0+0x60 0+100x120" + alert_rule_test: + - alertname: MetricForwardingErrors + eval_time: 180m + exp_alerts: + - exp_labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_outside_working_hours: "true" + url: "http://remote-storage_samples_failed_total" + exp_annotations: + description: "Monitoring agent can't communicate with Remote Storage API at http://remote-storage_samples_failed_total." + opsrecipe: "monitoring-pipeline/" + dashboard: "promRW001/prometheus-remote-write" From bf0d4f5481599e4e2e03a989a6a5978b15fe1e1c Mon Sep 17 00:00:00 2001 From: Taylor Bot Date: Tue, 12 Nov 2024 08:54:16 +0100 Subject: [PATCH 07/10] Release v4.24.0 (#1421) --- CHANGELOG.md | 5 ++++- helm/prometheus-rules/Chart.yaml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2ccb835..ee1e594b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [4.24.0] - 2024-11-12 + ### Added - Add a set of sensible alerts to monitor alloy. 
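The scraping-failure tests above depend on one property of the expressions introduced in monitoring-pipeline.rules.yml: each alert divides the number of down targets by the total number of targets per job, so the ratio only reaches 1 when every target of that job is failing, and partial outages stay silent. A minimal PromQL sketch of that pattern, using the same grouping labels as the shipped rules:

    # Fraction of down targets per scrape job; 1 means the whole job is down.
    (
      count(up == 0) by (job, installation, cluster_id, provider, pipeline)
      /
      count(up)      by (job, installation, cluster_id, provider, pipeline)
    ) >= 1
    # The shipped rules keep the threshold at 1 and add a long `for:` (1d for
    # JobScrapingFailure, 3d for CriticalJobScrapingFailure) so short-lived
    # scrape hiccups do not page.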
@@ -3213,7 +3215,8 @@ Fix `PromtailRequestsErrors` alerts as promtail retries after some backoff so ac - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2 -[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v4.23.0...HEAD +[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v4.24.0...HEAD +[4.24.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.23.0...v4.24.0 [4.23.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.22.0...v4.23.0 [4.22.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.21.1...v4.22.0 [4.21.1]: https://github.com/giantswarm/prometheus-rules/compare/v4.21.0...v4.21.1 diff --git a/helm/prometheus-rules/Chart.yaml b/helm/prometheus-rules/Chart.yaml index 14a95830..472d9aec 100644 --- a/helm/prometheus-rules/Chart.yaml +++ b/helm/prometheus-rules/Chart.yaml @@ -5,7 +5,7 @@ home: https://github.com/giantswarm/prometheus-rules icon: https://s.giantswarm.io/app-icons/1/png/default-app-light.png name: prometheus-rules appVersion: '0.1.0' -version: '4.23.0' +version: '4.24.0' annotations: application.giantswarm.io/team: "atlas" config.giantswarm.io/version: 1.x.x From ef8a8d6ff8c97ebda93bc486e43ce138bbd621f1 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 12 Nov 2024 10:44:16 +0100 Subject: [PATCH 08/10] fix monitoring agent down alert (#1422) * fix monitoring agent down alert * add old tests back --- CHANGELOG.md | 4 + .../atlas/alerting-rules/alloy.rules.yml | 8 +- .../alerting-rules/prometheus-agent.rules.yml | 32 +---- .../prometheus-agent.rules.test.yml | 122 ------------------ 4 files changed, 10 insertions(+), 156 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee1e594b..5d16be15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- Fix `MonitoringAgentDown` to page when both prometheus-agent and alloy-metrics jobs are missing. 
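The fix recorded above widens the `up` selector in the `unless` clause so that a cluster only counts as unmonitored when neither alloy-metrics nor prometheus-agent reports any live target. The expression reads as: take every cluster whose CAPI control plane is ready, copying the CAPI `name` label into `cluster_id` with `label_replace` so the two sides can be joined, then drop every cluster that still has at least one agent target up. The same expression as shipped in this patch, with inline comments:

    count(
      label_replace(
        capi_cluster_status_condition{type="ControlPlaneReady", status="True"},
        "cluster_id", "$1", "name", "(.*)"   # expose the CAPI cluster name as cluster_id
      ) == 1
    ) by (cluster_id, installation, pipeline, provider) > 0
    unless on (cluster_id) (
      # the cluster is still considered monitored if either agent flavor is up
      count(up{job=~"alloy-metrics|prometheus-agent"} > 0) by (cluster_id)
    )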
+ ## [4.24.0] - 2024-11-12 ### Added diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml index fc364f28..80c5361a 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/alloy.rules.yml @@ -110,8 +110,8 @@ spec: "(.*)" ) == 1 ) by (cluster_id, installation, pipeline, provider) > 0 - unless on (cluster_id) ( - count(up{job="alloy-metrics"} > 0) by (cluster_id) + unless on (cluster_id) ( + count(up{job=~"alloy-metrics|prometheus-agent"} > 0) by (cluster_id) ) for: 20m labels: @@ -140,8 +140,8 @@ spec: "(.*)" ) == 1 ) by (cluster_id, installation, pipeline, provider) > 0 - unless on (cluster_id) ( - count(up{job="alloy-metrics"} > 0) by (cluster_id) + unless on (cluster_id) ( + count(up{job=~"alloy-metrics|prometheus-agent"} > 0) by (cluster_id) ) for: 2m labels: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml index b0c8e218..99a34f86 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml @@ -9,6 +9,7 @@ spec: groups: - name: prometheus-agent rules: + {{- if not .Values.mimir.enabled }} ## This alert pages if prometheus-agent fails to send samples to its remote write endpoint. - alert: PrometheusAgentFailing annotations: @@ -16,7 +17,6 @@ spec: summary: Prometheus agent fails to send samples to remote write endpoint. opsrecipe: prometheus-agent/ dashboard: promRW001/prometheus-remote-write - {{- if not .Values.mimir.enabled }} expr: |- max_over_time( sum by (cluster_type, cluster_id, installation, instance, service) @@ -26,20 +26,6 @@ spec: absent(up{instance="prometheus-agent"}) == 1 )[5m:] ) - {{- else }} - expr: |- - ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="prometheus-agent"} > 0) by (cluster_id) - ) - {{- end }} for: 20m labels: area: platform @@ -58,7 +44,6 @@ spec: summary: Prometheus agent fails to send samples to remote write endpoint. opsrecipe: prometheus-agent/ dashboard: promRW001/prometheus-remote-write - {{- if not .Values.mimir.enabled }} expr: |- max_over_time( sum by (cluster_type, cluster_id, installation, instance, service) @@ -68,20 +53,6 @@ spec: absent(up{instance="prometheus-agent"}) == 1 )[5m:] ) - {{- else }} - expr: |- - ( - label_replace( - capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, - "cluster_id", - "$1", - "name", - "(.*)" - ) == 1 - ) unless on (cluster_id) ( - count(up{job="prometheus-agent"} > 0) by (cluster_id) - ) - {{- end }} for: 2m labels: area: platform @@ -92,6 +63,7 @@ spec: cancel_if_cluster_is_not_running_monitoring_agent: "true" cancel_if_cluster_status_creating: "true" cancel_if_cluster_status_deleting: "true" + {{- end }} ## This alert pages if one of the prometheus-agent shard is not running. 
- alert: PrometheusAgentShardsMissing annotations: diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index 01aebe6c..bd05e856 100644 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -4,128 +4,6 @@ rule_files: - prometheus-agent.rules.yml tests: - # Tests for `PrometheusAgentFailing` alert - - interval: 1m - input_series: - - series: 'up{instance="prometheus-agent",cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", job="prometheus-agent"}' - values: "_x60 0+0x60 1+0x60" - - series: 'capi_cluster_status_condition{ cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", status="True", type="ControlPlaneReady", name="gauss"}' - values: "1+0x180" - alert_rule_test: - - alertname: PrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cluster_id: "gauss" - cluster_type: "workload_cluster" - customer: "giantswarm" - installation: "myinstall" - name: "gauss" - pipeline: "testing" - provider: "capa" - region: "eu-west-2" - status: "True" - type: "ControlPlaneReady" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: InhibitionPrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cluster_id: "gauss" - cluster_type: "workload_cluster" - customer: "giantswarm" - installation: "myinstall" - name: "gauss" - pipeline: "testing" - provider: "capa" - region: "eu-west-2" - status: "True" - type: "ControlPlaneReady" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." 
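The tests deleted here covered the Mimir-specific branch of `PrometheusAgentFailing`, which is removed above; on vintage installations the alert keeps its original expression, built around two failure modes of the `up{instance="prometheus-agent"}` series. A simplified sketch of that idea (not the exact rule text, which additionally sums by cluster labels):

    # Looks back over a 5m subquery so a briefly missing series still triggers.
    max_over_time(
      (
        up{instance="prometheus-agent"} == 0            # target scraped, but reporting down
        or
        absent(up{instance="prometheus-agent"}) == 1    # target not scraped at all
      )[5m:]
    )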
- - alertname: PrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - customer: "giantswarm" - name: "gauss" - pipeline: "testing" - provider: "capa" - region: "eu-west-2" - status: "True" - type: "ControlPlaneReady" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: InhibitionPrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - customer: "giantswarm" - name: "gauss" - pipeline: "testing" - provider: "capa" - region: "eu-west-2" - status: "True" - type: "ControlPlaneReady" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: PrometheusAgentFailing - eval_time: 150m - - alertname: InhibitionPrometheusAgentFailing - eval_time: 150m # Tests for `PrometheusAgentShardsMissing` alert - interval: 1m input_series: From f01631d1f485d5e2c0a868d2840d80b332c7cc2c Mon Sep 17 00:00:00 2001 From: Taylor Bot Date: Tue, 12 Nov 2024 10:46:04 +0100 Subject: [PATCH 09/10] Release v4.24.1 (#1423) --- CHANGELOG.md | 5 ++++- helm/prometheus-rules/Chart.yaml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d16be15..8c84e242 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [4.24.1] - 2024-11-12 + ### Fixed - Fix `MonitoringAgentDown` to page when both prometheus-agent and alloy-metrics jobs are missing. 
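Several of the alerts touched in these releases exist only to drive inhibition: the `Inhibition*` variants fire after 2m with `severity: none` and carry `inhibit_monitoring_agent_down: "true"`, while alerts that should go quiet when a cluster has no working monitoring agent opt in through `cancel_if_*` labels. The Alertmanager configuration that pairs the two lives outside this repository; a hypothetical sketch of how such a pairing is typically wired (the matcher names below are assumptions, not taken from this patch series):

    # alertmanager.yml (sketch only, matcher pairing assumed)
    inhibit_rules:
      - source_matchers:
          - inhibit_monitoring_agent_down="true"
        target_matchers:
          - cancel_if_cluster_is_not_running_monitoring_agent="true"
        equal: ["cluster_id"]   # only mute alerts coming from the same cluster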
@@ -3219,7 +3221,8 @@ Fix `PromtailRequestsErrors` alerts as promtail retries after some backoff so ac - Add existing rules from https://github.com/giantswarm/prometheus-meta-operator/pull/637/commits/bc6a26759eb955de92b41ed5eb33fa37980660f2 -[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v4.24.0...HEAD +[Unreleased]: https://github.com/giantswarm/prometheus-rules/compare/v4.24.1...HEAD +[4.24.1]: https://github.com/giantswarm/prometheus-rules/compare/v4.24.0...v4.24.1 [4.24.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.23.0...v4.24.0 [4.23.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.22.0...v4.23.0 [4.22.0]: https://github.com/giantswarm/prometheus-rules/compare/v4.21.1...v4.22.0 diff --git a/helm/prometheus-rules/Chart.yaml b/helm/prometheus-rules/Chart.yaml index 472d9aec..e56b51a3 100644 --- a/helm/prometheus-rules/Chart.yaml +++ b/helm/prometheus-rules/Chart.yaml @@ -5,7 +5,7 @@ home: https://github.com/giantswarm/prometheus-rules icon: https://s.giantswarm.io/app-icons/1/png/default-app-light.png name: prometheus-rules appVersion: '0.1.0' -version: '4.24.0' +version: '4.24.1' annotations: application.giantswarm.io/team: "atlas" config.giantswarm.io/version: 1.x.x From de44204245bb1e0790ef24400a8b116433b68c39 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 12 Nov 2024 13:22:28 +0100 Subject: [PATCH 10/10] replace mimir.enabled property with mc provider flavor as mimir is everywhere in capi (#1424) * replace mimir.enabled property with mc provider flavor as mimir is everywhere in capi * Update test/conf/providers --- CHANGELOG.md | 4 + README.md | 6 +- .../templates/alloy-rules-configmap.yaml | 2 +- .../templates/alloy-rules.yaml | 2 +- .../aws-load-balancer-controller.rules.yml | 4 +- .../aws.node.workload-cluster.rules.yml | 4 +- .../aws.workload-cluster.rules.yml | 2 +- .../capa.management-cluster.rules.yml | 4 +- .../phoenix/alerting-rules/irsa.rules.yml | 4 +- .../apiserver.management-cluster.rules.yml | 4 +- .../apiserver.workload-cluster.rules.yml | 4 +- .../capi.management-cluster.rules.yml | 4 +- .../certificate.management-cluster.rules.yml | 4 +- .../certificate.workload-cluster.rules.yml | 4 +- .../cluster-autoscaler.rules.yml | 4 +- .../etcd.management-cluster.rules.yml | 4 +- .../etcd.workload-cluster.rules.yml | 4 +- .../alerting-rules/etcdbackup.rules.yml | 4 +- .../alerting-rules/inhibit.nodes.rules.yml | 4 +- .../management-cluster.rules.yml | 4 +- .../node.management-cluster.rules.yml | 4 +- .../node.workload-cluster.rules.yml | 4 +- .../alerting-rules/pods.core.rules.yml | 2 +- .../storage.management-cluster.rules.yml | 4 +- .../storage.workload-cluster.rules.yml | 4 +- .../loki-ruler-datasource-configmap.yaml | 2 +- .../deployment.management-cluster.rules.yml | 4 +- .../deployment.workload-cluster.rules.yml | 4 +- .../alerting-rules/grafana-cloud.rules.yml | 6 +- .../atlas/alerting-rules/grafana.rules.yml | 2 +- .../kube-state-metrics.rules.yml | 22 +- .../atlas/alerting-rules/mimir.rules.yml | 2 +- .../alerting-rules/prometheus-agent.rules.yml | 2 +- .../prometheus-meta-operator.rules.yml | 2 +- .../alerting-rules/statefulset.rules.yml | 4 +- .../atlas/alerting-rules/storage.rules.yml | 4 +- .../recording-rules/grafana-cloud.rules.yml | 4 +- .../recording-rules/mimir-mixins.rules.yml | 2 +- ...oring.resource-usage-estimation.rules.yaml | 2 +- .../alerting-rules/external-dns.rules.yml | 4 +- .../honeybadger/alerting-rules/app.rules.yml | 4 +- .../shield/alerting-rules/dex.rules.yml | 6 +- 
.../shield/alerting-rules/falco.rules.yml | 4 +- helm/prometheus-rules/values.schema.json | 8 - helm/prometheus-rules/values.yaml | 3 - mimir/update.sh | 2 +- test/conf/providers | 1 - test/hack/bin/run-pint.sh | 6 +- test/hack/bin/template-chart.sh | 2 - .../prometheus-agent.rules.test.yml | 216 ---------- .../alerting-rules/zot.rules.test.yml | 54 --- .../capa.inhibition.rules.test.yml | 0 .../capi-cluster.rules.test.yml | 69 +++ .../capi-kubeadmcontrolplane.rules.test.yml | 52 +++ .../capi-machine.rules.test.yml | 49 +++ .../capi-machinedeployment.rules.test.yml | 47 +++ .../capi-machinepool.rules.test.yml | 47 +++ .../capi-machineset.rules.test.yml | 27 ++ .../alerting-rules/capi.rules.test.yml | 91 ++++ .../certificate.all.rules.test.yml | 94 +++++ .../node-exporter.rules.test.yml | 38 ++ .../alerting-rules/grafana-cloud.test.yml | 0 .../atlas/alerting-rules/mimir.rules.test.yml | 0 .../prometheus-agent.rules.test.yml | 92 ---- .../cert-manager.rules.test.yml | 46 ++ .../alerting-rules/teleport.rules.test.yml | 59 +++ .../alerting-rules/grafana-cloud.test.yml | 156 +++++++ .../atlas/alerting-rules/mimir.rules.test.yml | 392 ++++++++++++++++++ .../prometheus-agent.rules.test.yml | 92 ---- 69 files changed, 1261 insertions(+), 556 deletions(-) delete mode 100644 test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml delete mode 100644 test/tests/providers/capi/capa-mimir/platform/honeybadger/alerting-rules/zot.rules.test.yml rename test/tests/providers/capi/{capa-mimir => capa}/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml (100%) create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-cluster.rules.test.yml create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-kubeadmcontrolplane.rules.test.yml create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machine.rules.test.yml create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinedeployment.rules.test.yml create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinepool.rules.test.yml create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machineset.rules.test.yml create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi.rules.test.yml create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/certificate.all.rules.test.yml create mode 100644 test/tests/providers/capi/capa/kaas/turtles/alerting-rules/node-exporter.rules.test.yml rename test/tests/providers/capi/{capa-mimir => capa}/platform/atlas/alerting-rules/grafana-cloud.test.yml (100%) rename test/tests/providers/capi/{capa-mimir => capa}/platform/atlas/alerting-rules/mimir.rules.test.yml (100%) create mode 100644 test/tests/providers/capi/capa/platform/shield/alerting-rules/cert-manager.rules.test.yml create mode 100644 test/tests/providers/capi/capa/platform/shield/alerting-rules/teleport.rules.test.yml create mode 100644 test/tests/providers/capi/capz/platform/atlas/alerting-rules/grafana-cloud.test.yml create mode 100644 test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c84e242..485250ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Removed + +- Remove the `mimir.enabled` property to replace it with the MC flavor as 
all CAPI MCs now run Mimir. + ## [4.24.1] - 2024-11-12 ### Fixed diff --git a/README.md b/README.md index a704e0a9..c6e50b28 100644 --- a/README.md +++ b/README.md @@ -168,11 +168,11 @@ There are 2 kinds of tests on rules: ``` [...] ### Testing platform/atlas/alerting-rules/prometheus-operator.rules.yml - ### promtool check rules /home/marie/github-repo/prometheus-rules/test/hack/output/generated/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-operator.rules.yml + ### promtool check rules /home/marie/github-repo/prometheus-rules/test/hack/output/generated/capi/capa/platform/atlas/alerting-rules/prometheus-operator.rules.yml ### Skipping platform/atlas/alerting-rules/prometheus-operator.rules.yml: listed in test/conf/promtool_ignore ### Testing platform/atlas/alerting-rules/prometheus.rules.yml - ### promtool check rules /home/marie/github-repo/prometheus-rules/test/hack/output/generated/capi/capa-mimir/platform/atlas/alerting-rules/prometheus.rules.yml - ### promtool test rules prometheus.rules.test.yml - capi/capa-mimir + ### promtool check rules /home/marie/github-repo/prometheus-rules/test/hack/output/generated/capi/capa/platform/atlas/alerting-rules/prometheus.rules.yml + ### promtool test rules prometheus.rules.test.yml - capi/capa [...] 09:06:29 promtool: end (Elapsed time: 1s) Congratulations! Prometheus rules have been promtool checked and tested diff --git a/helm/prometheus-rules/templates/alloy-rules-configmap.yaml b/helm/prometheus-rules/templates/alloy-rules-configmap.yaml index 5bb93b84..54d8d51f 100644 --- a/helm/prometheus-rules/templates/alloy-rules-configmap.yaml +++ b/helm/prometheus-rules/templates/alloy-rules-configmap.yaml @@ -1,4 +1,4 @@ -{{- if .Values.mimir.enabled }} +{{- if eq .Values.managementCluster.provider.flavor "capi" }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/helm/prometheus-rules/templates/alloy-rules.yaml b/helm/prometheus-rules/templates/alloy-rules.yaml index ef23d191..0132c989 100644 --- a/helm/prometheus-rules/templates/alloy-rules.yaml +++ b/helm/prometheus-rules/templates/alloy-rules.yaml @@ -1,4 +1,4 @@ -{{- if .Values.mimir.enabled }} +{{- if eq .Values.managementCluster.provider.flavor "capi" }} apiVersion: application.giantswarm.io/v1alpha1 kind: App metadata: diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml index 2f5e080f..24863fe1 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml @@ -5,9 +5,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: aws-load-balancer-controller.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml index 104e1886..6d2ace5c 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: node.aws.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml index 1306de63..db06f9b0 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml @@ -5,7 +5,7 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" {{- end }} name: aws.workload-cluster.rules diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml index 1e9cdb2e..32d0848f 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml @@ -6,9 +6,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: capa.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml index 993ca2c0..e1fd083d 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml @@ -3,9 +3,9 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: irsa.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml index e0877f4f..d69bcdc1 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: apiserver.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml index d23245c8..f26e6481 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/apiserver.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: apiserver.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/capi.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/capi.management-cluster.rules.yml index aed92be3..aba6ac4d 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/capi.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/capi.management-cluster.rules.yml @@ -4,9 +4,9 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4}} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: capi.management-cluster.rules namespace: {{.Values.namespace}} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml index c113c46d..db0538d2 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: certificate.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml index 70def5ee..86027745 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/certificate.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: certificate.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml index c47475cb..c44e1e9e 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/cluster-autoscaler.rules.yml @@ -5,9 +5,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: cluster-autoscaler.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.management-cluster.rules.yml index b28bdece..790646a8 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: etcd.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.workload-cluster.rules.yml index 222edb37..44aa8e9f 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcd.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: etcd.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcdbackup.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcdbackup.rules.yml index 7dea38ee..4291a1a7 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcdbackup.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/etcdbackup.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: etcdbackup.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.nodes.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.nodes.rules.yml index 735a771d..984fa707 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.nodes.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/inhibit.nodes.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: inhibit.nodes.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml index 17865dc5..6f8fa87c 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml index d67f6427..5ab9ac30 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: node.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml index 0507246f..6a30a570 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/node.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: node.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/pods.core.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/pods.core.rules.yml index 61dced93..0bd99a50 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/pods.core.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/pods.core.rules.yml @@ -4,7 +4,7 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" {{- end }} name: pods.core.rules diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.management-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.management-cluster.rules.yml index 9f27fb3c..59151577 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: core.storage.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml index 72b7d6e0..a19a9035 100644 --- a/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/kaas/turtles/alerting-rules/storage.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: core.storage.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/loki-ruler-datasource-configmap.yaml b/helm/prometheus-rules/templates/loki-ruler-datasource-configmap.yaml index 988bce7e..60d9a16e 100644 --- a/helm/prometheus-rules/templates/loki-ruler-datasource-configmap.yaml +++ b/helm/prometheus-rules/templates/loki-ruler-datasource-configmap.yaml @@ -1,4 +1,4 @@ -{{- if .Values.mimir.enabled }} +{{- if eq .Values.managementCluster.provider.flavor "capi" }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml index 1f98fe45..be6a9f5a 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.management-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: deployment.management-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml index fa908733..ca7422b1 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end}} + {{- end }} name: deployment.workload-cluster.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml index 9560570e..2022f4fd 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana-cloud.rules.yml @@ -3,7 +3,7 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" {{- end }} name: grafana-cloud.rules @@ -17,7 +17,7 @@ spec: annotations: description: 'Prometheus is not sending data to Grafana Cloud.' 
opsrecipe: prometheus-grafanacloud/ - {{- if .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "capi" }} expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) {{- else }} expr: absent(prometheus_remote_storage_samples_total{remote_name="grafana-cloud"}) @@ -29,7 +29,7 @@ spec: severity: page team: atlas topic: observability - {{- if .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "capi" }} - name: mimir-to-grafana-cloud-exporter rules: - alert: MimirToGrafanaCloudExporterDown diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml index 97a10780..977840aa 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/grafana.rules.yml @@ -3,7 +3,7 @@ kind: PrometheusRule metadata: labels: {{- include "labels.common" . | nindent 4 }} - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" {{- end }} name: grafana.rules diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml index 83089fc3..7fa5beeb 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/kube-state-metrics.rules.yml @@ -14,7 +14,7 @@ spec: annotations: description: '{{`KubeStateMetrics is down.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: |- label_replace(up{job="kube-state-metrics",instance=~".*:8080"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 or absent(up{job="kube-state-metrics",instance=~".*:8080"} == 1) {{- else }} @@ -89,7 +89,7 @@ spec: annotations: description: '{{`kube_configmap_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_configmap_created{}) {{- else }} expr: |- @@ -117,7 +117,7 @@ spec: annotations: description: '{{`kube_daemonset_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_daemonset_created{}) {{- else }} expr: |- @@ -145,7 +145,7 @@ spec: annotations: description: '{{`kube_deployment_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_deployment_created{}) {{- else }} expr: |- @@ -173,7 +173,7 @@ spec: annotations: description: '{{`kube_endpoint_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not 
.Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_endpoint_created{}) {{- else }} expr: |- @@ -201,7 +201,7 @@ spec: annotations: description: '{{`kube_namespace_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_namespace_created{}) {{- else }} expr: |- @@ -229,7 +229,7 @@ spec: annotations: description: '{{`kube_node_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_node_created{}) {{- else }} expr: |- @@ -257,7 +257,7 @@ spec: annotations: description: '{{`kube_pod_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_pod_created{}) {{- else }} expr: |- @@ -285,7 +285,7 @@ spec: annotations: description: '{{`kube_replicaset_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_replicaset_created{}) {{- else }} expr: |- @@ -313,7 +313,7 @@ spec: annotations: description: '{{`kube_secret_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_secret_created{}) {{- else }} expr: |- @@ -341,7 +341,7 @@ spec: annotations: description: '{{`kube_service_created metric is missing for cluster {{ $labels.cluster_id }}.`}}' opsrecipe: kube-state-metrics-down/ - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} expr: absent(kube_service_created{}) {{- else }} expr: |- diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 6dc13788..294c5d15 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -1,4 +1,4 @@ -{{- if .Values.mimir.enabled }} +{{- if eq .Values.managementCluster.provider.flavor "capi" }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml index 99a34f86..73c749b4 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-agent.rules.yml @@ -9,7 +9,7 @@ spec: groups: - name: prometheus-agent rules: - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} ## This alert pages if prometheus-agent fails to send samples to its remote write endpoint. 
- alert: PrometheusAgentFailing annotations: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml index 98865562..ff81b5e4 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/prometheus-meta-operator.rules.yml @@ -9,7 +9,7 @@ spec: groups: - name: observability rules: - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} - alert: "Heartbeat" expr: up{job=~".*prometheus/prometheus.*",instance!="prometheus-agent"} == 1 labels: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml index 1c546f35..439a9642 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: statefulset.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml index 7b0798d5..8490e4a7 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: observability.storage.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml index 3ebe0897..20bee678 100644 --- a/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/recording-rules/grafana-cloud.rules.yml @@ -344,7 +344,7 @@ spec: rules: - expr: sum(ALERTS{alertstate="firing"}) by (alertname, cluster_id, cluster_type, customer, installation, pipeline, provider, region, area, severity, team, topic) record: aggregation:prometheus:alerts - {{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} # Metric container_memory_working_set_bytes comes from the cAdvisor component scraped on management clusters which is then scraped by the management cluster prometheus. # This means the cluster_id label on this metric will be the cluster_id of the management cluster for all the series, not the workload cluster id. # As we want to record the memory usage of the prometheis per cluster, we need to extract the cluster id from the prometheus pod name (i.e. pod=prometheus-xyz-ordinal => cluster_id=xyz). 
@@ -353,7 +353,7 @@ spec: - expr: sum(label_replace(container_memory_working_set_bytes{container='prometheus', namespace=~'.*-prometheus'}, "cluster_id", "$2", "pod", "(prometheus-)(.+)(-.+)")) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region) record: aggregation:prometheus:memory_usage {{- end }} - {{- if .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "capi" }} - name: mimir.grafana-cloud.recording rules: - expr: sum(container_memory_working_set_bytes{namespace='mimir', cluster_type="management_cluster", container=~'.+'}) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region) diff --git a/helm/prometheus-rules/templates/platform/atlas/recording-rules/mimir-mixins.rules.yml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/mimir-mixins.rules.yml index d41a406b..7d0247b6 100644 --- a/helm/prometheus-rules/templates/platform/atlas/recording-rules/mimir-mixins.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/recording-rules/mimir-mixins.rules.yml @@ -1,4 +1,4 @@ -{{- if .Values.mimir.enabled }} +{{- if eq .Values.managementCluster.provider.flavor "capi" }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/helm/prometheus-rules/templates/platform/atlas/recording-rules/monitoring.resource-usage-estimation.rules.yaml b/helm/prometheus-rules/templates/platform/atlas/recording-rules/monitoring.resource-usage-estimation.rules.yaml index 81a946f0..c402ff83 100644 --- a/helm/prometheus-rules/templates/platform/atlas/recording-rules/monitoring.resource-usage-estimation.rules.yaml +++ b/helm/prometheus-rules/templates/platform/atlas/recording-rules/monitoring.resource-usage-estimation.rules.yaml @@ -9,7 +9,7 @@ spec: groups: - name: monitoring.resource-usage-estimation.recording rules: - {{- if .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "capi" }} - expr: (sum(scrape_samples_post_metric_relabeling) by (cluster_id, job) / on(cluster_id) group_left sum(cortex_ingester_active_series{container="ingester"}) by (cluster_id)) * on(cluster_id) group_left sum(container_memory_usage_bytes{container="ingester", namespace="mimir"}) by (cluster_id) record: giantswarm:observability:monitoring:resource_usage_estimation:memory_usage_bytes - expr: (sum(scrape_samples_post_metric_relabeling) by (cluster_id, job) / on(cluster_id) group_left sum(cortex_ingester_active_series{container="ingester"}) by (cluster_id)) * on(cluster_id) group_left sum(container_memory_working_set_bytes{container="ingester", namespace="mimir"}) by (cluster_id) diff --git a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/external-dns.rules.yml b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/external-dns.rules.yml index 6ba5a7fa..d7557af5 100644 --- a/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/external-dns.rules.yml +++ b/helm/prometheus-rules/templates/platform/cabbage/alerting-rules/external-dns.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "workload_cluster" -{{- end }} + {{- end }} name: external-dns.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml index fac50490..fc7af2fc 100644 --- a/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml +++ b/helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/app.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: app.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.rules.yml b/helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.rules.yml index 45347804..2905ee3d 100644 --- a/helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.rules.yml +++ b/helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . | nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: dex.rules namespace: {{ .Values.namespace }} spec: @@ -41,7 +41,7 @@ spec: annotations: description: '{{`dex-operator did not register a dex-app in giantswarm namespace.`}}' opsrecipe: dex-operator/ - {{- if .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "capi" }} expr: absent(dex_operator_idp_secret_expiry_time{app_namespace="giantswarm", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) {{- else }} expr: absent(dex_operator_idp_secret_expiry_time{app_namespace="giantswarm", cluster_type="management_cluster"}) == 1 diff --git a/helm/prometheus-rules/templates/platform/shield/alerting-rules/falco.rules.yml b/helm/prometheus-rules/templates/platform/shield/alerting-rules/falco.rules.yml index 61cd126f..20349e01 100644 --- a/helm/prometheus-rules/templates/platform/shield/alerting-rules/falco.rules.yml +++ b/helm/prometheus-rules/templates/platform/shield/alerting-rules/falco.rules.yml @@ -4,9 +4,9 @@ metadata: creationTimestamp: null labels: {{- include "labels.common" . 
| nindent 4 }} -{{- if not .Values.mimir.enabled }} + {{- if eq .Values.managementCluster.provider.flavor "vintage" }} cluster_type: "management_cluster" -{{- end }} + {{- end }} name: falco.rules namespace: {{ .Values.namespace }} spec: diff --git a/helm/prometheus-rules/values.schema.json b/helm/prometheus-rules/values.schema.json index 780796c8..414afa24 100644 --- a/helm/prometheus-rules/values.schema.json +++ b/helm/prometheus-rules/values.schema.json @@ -30,14 +30,6 @@ } } }, - "mimir": { - "type": "object", - "properties": { - "enabled": { - "type": "boolean" - } - } - }, "name": { "type": "string" }, diff --git a/helm/prometheus-rules/values.yaml b/helm/prometheus-rules/values.yaml index 409130af..0388578c 100644 --- a/helm/prometheus-rules/values.yaml +++ b/helm/prometheus-rules/values.yaml @@ -10,9 +10,6 @@ managementCluster: flavor: "" region: "" -mimir: - enabled: false - Installation: V1: Guest: diff --git a/mimir/update.sh b/mimir/update.sh index 05ddd4bb..7980f939 100755 --- a/mimir/update.sh +++ b/mimir/update.sh @@ -36,7 +36,7 @@ spec:\ groups:' "$OUTPUT_FILE" # Add the mimir enabled helm conditional blocks -sed -i '1i{{- if .Values.mimir.enabled }}' "$OUTPUT_FILE" +sed -i '1i{{- if eq .Values.managementCluster.provider.flavor "capi" }}' "$OUTPUT_FILE" sed -i -e '$a{{- end }}' "$OUTPUT_FILE" sed -i 's/cluster_id,/cluster_id, installation, pipeline, provider,/g' "$OUTPUT_FILE" diff --git a/test/conf/providers b/test/conf/providers index c22316ae..5425cc44 100644 --- a/test/conf/providers +++ b/test/conf/providers @@ -1,4 +1,3 @@ vintage/aws capi/capz capi/capa -capi/capa-mimir diff --git a/test/hack/bin/run-pint.sh b/test/hack/bin/run-pint.sh index a5aa0150..84520c5e 100755 --- a/test/hack/bin/run-pint.sh +++ b/test/hack/bin/run-pint.sh @@ -15,9 +15,11 @@ main () { PINT_CONFIG="${1:-test/conf/pint/pint-config.hcl}" if [[ "${2:-}" != "" ]]; then - mapfile -t PINT_FILES_LIST < <(grep -lr "team:.*${PINT_TEAM_FILTER}" "test/hack/output/generated/capi/capa-mimir/" | grep -v ".test.yml") + mapfile -t PINT_FILES_LIST < <(grep -lr "team:.*${PINT_TEAM_FILTER}" "test/hack/output/generated/capi/capa/" | grep -v ".test.yml") + mapfile -t PINT_FILES_LIST < <(grep -lr "team:.*${PINT_TEAM_FILTER}" "test/hack/output/generated/capi/capz/" | grep -v ".test.yml") else - mapfile -t PINT_FILES_LIST < <(find test/hack/output/generated/capi/capa-mimir/ -name "*.rules.yml") + mapfile -t PINT_FILES_LIST < <(find test/hack/output/generated/capi/capa/ -name "*.rules.yml") + mapfile -t PINT_FILES_LIST < <(find test/hack/output/generated/capi/capz/ -name "*.rules.yml") fi test/hack/bin/pint -c "$PINT_CONFIG" lint "${PINT_FILES_LIST[@]}" diff --git a/test/hack/bin/template-chart.sh b/test/hack/bin/template-chart.sh index 57dd769d..5f9278a9 100755 --- a/test/hack/bin/template-chart.sh +++ b/test/hack/bin/template-chart.sh @@ -13,7 +13,6 @@ main() { echo "Templating chart for provider: $provider" [[ $provider =~ ([a-z]+)/([a-z]+)([-]*[a-z]*) ]] - [[ "${BASH_REMATCH[3]}" == "-mimir" ]] && mimir_enabled=true || mimir_enabled=false helm template \ "$GIT_WORKDIR"/helm/prometheus-rules \ @@ -21,7 +20,6 @@ main() { --set="managementCluster.provider.kind=${BASH_REMATCH[2]}" \ --set="managementCluster.name=myinstall" \ --set="managementCluster.pipeline=stable" \ - --set="mimir.enabled=$mimir_enabled" \ --output-dir "$GIT_WORKDIR"/test/hack/output/helm-chart/"$provider" # Remove useless files for tests diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml 
b/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml deleted file mode 100644 index bd05e856..00000000 --- a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ /dev/null @@ -1,216 +0,0 @@ ---- -# These tests differ between prometheus and mimir installations: the resulting labels are different -rule_files: -- prometheus-agent.rules.yml - -tests: - # Tests for `PrometheusAgentShardsMissing` alert - - interval: 1m - input_series: - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_operator_spec_shards{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' - values: '3+0x60 5+0x60 3+0x60' - - series: 'prometheus_operator_spec_replicas{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' - values: '1+0x180' - alert_rule_test: - - alertname: PrometheusAgentShardsMissing - eval_time: 40m - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 40m - - alertname: PrometheusAgentShardsMissing - eval_time: 120m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." 
- - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 100m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissing - eval_time: 125m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 125m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." 
- - alertname: PrometheusAgentShardsMissing - eval_time: 130m - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 130m - # Tests for `PrometheusAgentShardsMissing` alert with missing `prometheus_operator_spec_shards` metric - - interval: 1m - input_series: - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-1-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_remote_storage_metadata_total{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus", instance="prometheus-agent", job="prometheus-agent", pod="prometheus-prometheus-agent-shard-2-0", remote_name="806b63", service="prometheus-agent", team="atlas", url="https://myinstall/mycluster/api/v1/write"}' - values: "10000+0x180" - - series: 'prometheus_operator_spec_replicas{cluster_id="test01", installation="myinstall", provider="aws", pipeline="testing", container="prometheus-operator-app", controller="prometheus", instance="prometheus-operator-app", job="prometheus-operator-app-operator", name="prometheus-agent", pod="prometheus-operator-app-operator-76b5899558-nz8h5", service="prometheus-operator-app-operator", team="atlas"}' - values: '3+0x60 5+0x60 3+0x60' - alert_rule_test: - - alertname: PrometheusAgentShardsMissing - eval_time: 40m - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 40m - - alertname: PrometheusAgentShardsMissing - eval_time: 120m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 100m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." 
- - alertname: PrometheusAgentShardsMissing - eval_time: 125m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 125m - exp_alerts: - - exp_labels: - area: platform - cluster_id: test01 - installation: myinstall - provider: aws - pipeline: testing - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_outside_working_hours: "true" - exp_annotations: - description: "Prometheus agent is missing shards." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent is missing shards." - - alertname: PrometheusAgentShardsMissing - eval_time: 130m - - alertname: InhibitionPrometheusAgentShardsMissing - eval_time: 130m diff --git a/test/tests/providers/capi/capa-mimir/platform/honeybadger/alerting-rules/zot.rules.test.yml b/test/tests/providers/capi/capa-mimir/platform/honeybadger/alerting-rules/zot.rules.test.yml deleted file mode 100644 index 6b130ff8..00000000 --- a/test/tests/providers/capi/capa-mimir/platform/honeybadger/alerting-rules/zot.rules.test.yml +++ /dev/null @@ -1,54 +0,0 @@ ---- -rule_files: - - zot.rules.yml - -tests: - - interval: 1m - input_series: - - series: 'kube_deployment_status_replicas_unavailable{cluster_type="management_cluster",namespace="zot",deployment="zot-zot"}' - values: '_x5 0x10 1x45' - alert_rule_test: - - alertname: ZotDeploymentNotSatisfied - eval_time: 46m - exp_alerts: - - exp_labels: - alertname: "ZotDeploymentNotSatisfied" - area: "platform" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - cluster_type: "management_cluster" - deployment: "zot-zot" - namespace: "zot" - severity: "page" - team: "honeybadger" - topic: "managementcluster" - exp_annotations: - description: "Zot deployment zot/zot-zot is not satisfied." - opsrecipe: "zot/" - - interval: 1m - input_series: - - series: 'kubelet_volume_stats_available_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"}' - values: '50x30 20x30 15x30 5x60' - - series: 'kubelet_volume_stats_capacity_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"}' - values: '100x150' - alert_rule_test: - - alertname: ZotPersistentVolumeFillingUp - eval_time: 150m - exp_alerts: - - exp_labels: - alertname: "ZotPersistentVolumeFillingUp" - area: "platform" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - cancel_if_cluster_status_updating: "true" - cancel_if_outside_working_hours: "true" - namespace: "zot" - persistentvolumeclaim: "zot-zot-pvc" - severity: "page" - team: "honeybadger" - topic: "managementcluster" - exp_annotations: - description: "The Zot PersistentVolume claimed by zot-zot-pvc in namespace zot is at least 80% full and projected to fill up soon." 
- opsrecipe: "zot/" diff --git a/test/tests/providers/capi/capa-mimir/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml b/test/tests/providers/capi/capa/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml similarity index 100% rename from test/tests/providers/capi/capa-mimir/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml rename to test/tests/providers/capi/capa/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-cluster.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-cluster.rules.test.yml new file mode 100644 index 00000000..71be1d4c --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-cluster.rules.test.yml @@ -0,0 +1,69 @@ +rule_files: + - capi-cluster.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_cluster_status_phase{name="clippaxy", exported_namespace="giantswarm", phase="Provisioned"}' + values: "1+0x75" + - series: 'capi_cluster_status_phase{name="clippaxy", exported_namespace="giantswarm", phase="Pending"}' + values: "1+0x75" + - series: 'capi_cluster_status_condition{name="grumpy", exported_namespace="giantswarm", status="False", type="Ready"}' + values: "0+0x10 0+1x65" + - series: 'capi_cluster_status_condition{name="grumpy", exported_namespace="giantswarm", status="True", type="Ready"}' + values: "0+1x10 0+0x65" + - series: 'capi_cluster_annotation_paused{name="grumpy", exported_namespace="giantswarm", paused_value="true"}' + values: "0+1x75" + alert_rule_test: + - alertname: ClusterUnhealthyPhase + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: phoenix + topic: managementcluster + name: clippaxy + exported_namespace: giantswarm + phase: Pending + exp_annotations: + description: "Cluster giantswarm/clippaxy stuck in Pending phase." + opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: ClusterStatusNotReady + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + name: grumpy + exported_namespace: giantswarm + status: "False" + type: Ready + exp_annotations: + description: "Cluster giantswarm/grumpy is not ready." + opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: ClusterPaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + name: grumpy + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "The cluster giantswarm/grumpy is paused." 
+ opsrecipe: capi-cluster/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-kubeadmcontrolplane.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-kubeadmcontrolplane.rules.test.yml new file mode 100644 index 00000000..2bcb3c23 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-kubeadmcontrolplane.rules.test.yml @@ -0,0 +1,52 @@ +rule_files: + - capi-kubeadmcontrolplane.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_kubeadmcontrolplane_spec_replicas{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_status_replicas_ready{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_spec_replicas{cluster_name="clippaxy", name="clippaxy-72jzy", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_status_replicas_ready{cluster_name="clippaxy", name="clippaxy-72jzy", exported_namespace="giantswarm"}' + values: "0+2x100" + - series: 'capi_kubeadmcontrolplane_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-72r5c", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: KubeadmControlPlaneReplicasMismatch + eval_time: 100m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-72jzy + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy kubeadmcontrolplane giantswarm/clippaxy-72jzy does not match the expected number of replicas for longer than 90 minutes." + opsrecipe: capi-kubeadmcontrolplane/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: KubeadmControlPlanePaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-72r5c + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "The clusters grumpy kubeadmcontrolplane giantswarm/grumpy-72r5c is paused." 
+ opsrecipe: capi-kubeadmcontrolplane/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machine.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machine.rules.test.yml new file mode 100644 index 00000000..e8560612 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machine.rules.test.yml @@ -0,0 +1,49 @@ +rule_files: + - capi-machine.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machine_status_phase{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Running"}' + values: "1+0x10 0+0x35" + - series: 'capi_machine_status_phase{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Failed"}' + values: "0+0x10 1+0x35" + - series: 'capi_machine_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-72r5c", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: MachineUnhealthyPhase + eval_time: 45m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-72jq5 + exported_namespace: giantswarm + phase: Failed + exp_annotations: + description: "Machine giantswarm/clippaxy-72jq5 stuck in phase Failed for more than 30 minutes." + opsrecipe: capi-machine/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: MachinePaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-72r5c + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "Machine giantswarm/grumpy-72r5c is paused." + opsrecipe: capi-machine/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinedeployment.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinedeployment.rules.test.yml new file mode 100644 index 00000000..9d9c1d91 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinedeployment.rules.test.yml @@ -0,0 +1,47 @@ +rule_files: + - capi-machinedeployment.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machinedeployment_status_phase{phase="Failed", cluster_name="clippaxy", name="clippaxy-def00", exported_namespace="giantswarm"}' + values: "0+3x75" + - series: 'capi_machinedeployment_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-def99", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: MachineDeploymentIsNotHealthy + eval_time: 25m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + phase: Failed + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-def00 + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy machinedeployment giantswarm/clippaxy-def00 is not healthy." 
+ opsrecipe: capi-machinedeployment/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: MachineDeploymentPaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-def99 + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "The clusters grumpy machinedeployment giantswarm/grumpy-def99 is paused." + opsrecipe: capi-machinedeployment/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinepool.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinepool.rules.test.yml new file mode 100644 index 00000000..70f51908 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machinepool.rules.test.yml @@ -0,0 +1,47 @@ +rule_files: + - capi-machinepool.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machinepool_status_phase{phase="Failed", cluster_name="clippaxy", name="clippaxy-def00", exported_namespace="giantswarm"}' + values: "0+3x75" + - series: 'capi_machinepool_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-72r5c", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: MachinePoolIsNotHealthy + eval_time: 25m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: page + phase: Failed + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-def00 + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy machinepool giantswarm/clippaxy-def00 is not healthy." + opsrecipe: capi-machinepool/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers + - alertname: MachinePoolPaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-72r5c + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "The clusters grumpy machinepool giantswarm/grumpy-72r5c is paused." 
+ opsrecipe: capi-machinepool/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machineset.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machineset.rules.test.yml new file mode 100644 index 00000000..d41639d8 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-machineset.rules.test.yml @@ -0,0 +1,27 @@ +rule_files: + - capi-machineset.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machineset_annotation_paused{paused_value="true",cluster_name="grumpy", name="grumpy-def99", exported_namespace="giantswarm"}' + values: "0+1x75" + alert_rule_test: + - alertname: MachineSetPaused + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_monitoring_agent_down: "true" + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: grumpy + name: grumpy-def99 + exported_namespace: giantswarm + paused_value: "true" + exp_annotations: + description: "Machineset giantswarm/grumpy-def99 is paused." + opsrecipe: capi-machineset/ + dashboard: bdi7iswg81czkcasd/capi-agregated-error-logs-for-capi-controllers diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi.rules.test.yml new file mode 100644 index 00000000..c07f91b5 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi.rules.test.yml @@ -0,0 +1,91 @@ +rule_files: + - capi.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'capi_machine_status_phase{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Running"}' + values: "1+0x10 0+0x35" + - series: 'capi_machine_status_phase{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Failed"}' + values: "0+0x10 1+0x35" + alert_rule_test: + - alertname: MachineUnhealthyPhase + eval_time: 45m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-72jq5 + exported_namespace: giantswarm + phase: Failed + exp_annotations: + description: "Machine giantswarm/clippaxy-72jq5 stuck in phase Failed for more than 30 minutes." + - interval: 1m + input_series: + - series: 'capi_machinepool_status_phase{phase="Failed", cluster_name="clippaxy", name="clippaxy-def00", exported_namespace="giantswarm"}' + values: "0+3x75" + alert_rule_test: + - alertname: MachinePoolIsNotHealthy + eval_time: 25m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-def00 + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy machinepool giantswarm/clippaxy-def00 is not healthy." 
+ - interval: 1m + input_series: + - series: 'capi_kubeadmcontrolplane_spec_replicas{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_status_replicas_ready{cluster_name="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_spec_replicas{cluster_name="clippaxy", name="clippaxy-72jzy", exported_namespace="giantswarm"}' + values: "0+3x100" + - series: 'capi_kubeadmcontrolplane_status_replicas_ready{cluster_name="clippaxy", name="clippaxy-72jzy", exported_namespace="giantswarm"}' + values: "0+2x100" + alert_rule_test: + - alertname: KubeadmControlPlaneReplicasMismatch + eval_time: 100m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + cluster_name: clippaxy + name: clippaxy-72jzy + exported_namespace: giantswarm + exp_annotations: + description: "The clusters clippaxy kubeadmcontrolplane giantswarm/clippaxy-72jzy does not match the expected number of replicas for longer than 90 minutes." + - interval: 1m + input_series: + - series: 'capi_cluster_status_phase{name="clippaxy", exported_namespace="giantswarm", phase="Provisioned"}' + values: "1+0x75" + - series: 'capi_cluster_status_phase{name="clippaxy", exported_namespace="giantswarm", phase="Pending"}' + values: "1+0x75" + alert_rule_test: + - alertname: ClusterUnhealthyPhase + eval_time: 75m + exp_alerts: + - exp_labels: + area: kaas + cancel_if_outside_working_hours: "true" + severity: notify + team: phoenix + topic: managementcluster + name: clippaxy + exported_namespace: giantswarm + phase: Pending + exp_annotations: + description: "Cluster giantswarm/clippaxy is in a non healthy phase." 
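The unit tests added in this patch use the promtool rule-testing format (rule_files, input_series with expanding notation such as "0+3x75", alert_rule_test with exp_alerts), so a single test file can also be checked locally once the chart has been templated. A minimal sketch, assuming promtool is on the PATH and that the templated rules land under test/hack/output/generated/capi/capa/ (the directory run-pint.sh lints); the copy step below is illustrative and not the repo's own test harness:

# Locate the templated rule file, place the new test file next to it, and run promtool.
# promtool ships with Prometheus; "test rules" evaluates the input_series against the alerts.
RULES_DIR=$(dirname "$(find test/hack/output/generated/capi/capa -name capi-cluster.rules.yml)")
cp test/tests/providers/capi/capa/kaas/turtles/alerting-rules/capi-cluster.rules.test.yml "$RULES_DIR"/
(cd "$RULES_DIR" && promtool test rules capi-cluster.rules.test.yml)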
diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/certificate.all.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/certificate.all.rules.test.yml new file mode 100644 index 00000000..078f75d7 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/certificate.all.rules.test.yml @@ -0,0 +1,94 @@ +--- +rule_files: + - certificate.all.rules.yml + +tests: + # CertificateSecretWillExpireInLessThanTwoWeeks within 2 weeks of expiration + - interval: 1d + input_series: + - series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="aws", secretkey="tls.crt", service_priority="highest"}' + values: "2678400x60" + alert_rule_test: + - alertname: CertificateSecretWillExpireInLessThanTwoWeeks + eval_time: 20d + exp_alerts: + - exp_labels: + alertname: CertificateSecretWillExpireInLessThanTwoWeeks + app: cert-exporter-deployment + area: kaas + cancel_if_outside_working_hours: "true" + cluster_id: gollem + cluster_type: management_cluster + container: cert-exporter + customer: giantswarm + exported_namespace: giantswarm + instance: 10.0.0.0:1234 + job: gollem-prometheus/workload-gollem/0 + namespace: giantswarm + node: 10.0.0.0 + organization: giantswarm + pod: cert-exporter-deployment-5c47b4c55c-49wt9 + provider: aws + name: athena-certs-secret + installation: gollem + service_priority: highest + severity: page + secretkey: tls.crt + team: phoenix + topic: cert-manager + exp_annotations: + description: "Certificate stored in Secret giantswarm/athena-certs-secret on gollem will expire in less than two weeks." 
+ opsrecipe: "managed-app-cert-manager/certificate-secret-will-expire-in-less-than-two-weeks/" + # CertificateSecretWillExpireInLessThanTwoWeeks not within 2 weeks of expiration + - interval: 1d + input_series: + - series: 'cert_exporter_secret_not_after{app="cert-exporter-deployment", cluster_id="gollem", cluster_type="management_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", job="gollem-prometheus/workload-gollem/0", name="athena-certs-secret", namespace="giantswarm", exported_namespace="giantswarm", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-5c47b4c55c-49wt9", provider="aws", secretkey="tls.crt", service_priority="highest"}' + values: "2678400x60" + alert_rule_test: + - alertname: CertificateSecretWillExpireInLessThanTwoWeeks + eval_time: 10d + # GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks within 2 weeks of expiration + - interval: 1d + input_series: + - series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", exported_namespace="kube-system", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="aws", service_priority="highest"}' + values: "2678400x60" + alert_rule_test: + - alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks + eval_time: 20d + exp_alerts: + - exp_labels: + alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks + app: cert-exporter-deployment + area: kaas + cancel_if_outside_working_hours: "true" + cluster_id: 12345 + cluster_type: workload_cluster + container: cert-exporter + customer: giantswarm + exported_namespace: kube-system + instance: 10.0.0.0:1234 + job: 12345-prometheus/workload-12345/0 + namespace: kube-system + node: 10.0.0.0 + organization: giantswarm + pod: cert-exporter-deployment-57bbbfd856-8r8dr + provider: aws + name: kiam-agent + installation: gollem + service_priority: highest + severity: page + team: phoenix + topic: cert-manager + issuer_ref: kiam-ca-issuer + managed_issuer: "true" + exp_annotations: + description: "Certificate CR kube-system/kiam-agent on 12345 will expire in less than two weeks." 
+ opsrecipe: "managed-app-cert-manager/certificate-secret-will-expire-in-less-than-two-weeks/" + # GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks not within 2 weeks of expiration + - interval: 1d + input_series: + - series: 'cert_exporter_certificate_cr_not_after{app="cert-exporter-deployment", cluster_id="12345", cluster_type="workload_cluster", container="cert-exporter", customer="giantswarm", installation="gollem", instance="10.0.0.0:1234", issuer_ref="kiam-ca-issuer", job="12345-prometheus/workload-12345/0", managed_issuer="true", name="kiam-agent", namespace="kube-system", exported_namespace="kube-system", node="10.0.0.0", organization="giantswarm", pod="cert-exporter-deployment-57bbbfd856-8r8dr", provider="aws", service_priority="highest"}' + values: "2678400x60" + alert_rule_test: + - alertname: GiantswarmManagedCertificateCRWillExpireInLessThanTwoWeeks + eval_time: 10d diff --git a/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/node-exporter.rules.test.yml b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/node-exporter.rules.test.yml new file mode 100644 index 00000000..786acc10 --- /dev/null +++ b/test/tests/providers/capi/capa/kaas/turtles/alerting-rules/node-exporter.rules.test.yml @@ -0,0 +1,38 @@ +--- +rule_files: + - node-exporter.rules.yml + +tests: + # NodeExporterCollectorFailed tests + - interval: 1m + input_series: + # No data for 20 minutes, then all good, then cpu collector fails, then bonding collector fails + - series: 'node_scrape_collector_success{app="node-exporter", collector="cpu", instance="10.0.5.111:10300"}' + values: "_x20 1+0x20 0+0x20 1+0x20" + - series: 'node_scrape_collector_success{app="node-exporter", collector="bonding", instance="10.0.5.111:10300"}' + values: "_x20 1+0x20 1+0x20 0+0x20" + alert_rule_test: + - alertname: NodeExporterCollectorFailed + eval_time: 10m + - alertname: NodeExporterCollectorFailed + eval_time: 30m + - alertname: NodeExporterCollectorFailed + eval_time: 50m + exp_alerts: + - exp_labels: + alertname: NodeExporterCollectorFailed + app: "node-exporter" + area: "kaas" + cancel_if_outside_working_hours: "true" + collector: "cpu" + instance: "10.0.5.111:10300" + severity: "page" + team: "phoenix" + topic: "observability" + exp_annotations: + description: "NodeExporter Collector cpu on 10.0.5.111:10300 is failed." 
+ opsrecipe: "node-exporter-device-error/" + - alertname: NodeExporterCollectorFailed + eval_time: 70m + + diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/grafana-cloud.test.yml similarity index 100% rename from test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/grafana-cloud.test.yml rename to test/tests/providers/capi/capa/platform/atlas/alerting-rules/grafana-cloud.test.yml diff --git a/test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml similarity index 100% rename from test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml rename to test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml diff --git a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index f539b234..bd05e856 100644 --- a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -4,98 +4,6 @@ rule_files: - prometheus-agent.rules.yml tests: - # Tests for `PrometheusAgentFailing` alert - - interval: 1m - input_series: - - series: 'up{instance="prometheus-agent",cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", job="prometheus-agent"}' - values: "_x60 0+0x60 1+0x60" - - series: 'capi_cluster_status_condition{cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", status="True", type="ControlPlaneReady", name="gauss"}' - values: "1+0x180" - alert_rule_test: - - alertname: PrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - instance: prometheus-agent - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: InhibitionPrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - instance: prometheus-agent - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." 
- - alertname: PrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - instance: prometheus-agent - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: InhibitionPrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - instance: prometheus-agent - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: PrometheusAgentFailing - eval_time: 150m - - alertname: InhibitionPrometheusAgentFailing - eval_time: 150m # Tests for `PrometheusAgentShardsMissing` alert - interval: 1m input_series: diff --git a/test/tests/providers/capi/capa/platform/shield/alerting-rules/cert-manager.rules.test.yml b/test/tests/providers/capi/capa/platform/shield/alerting-rules/cert-manager.rules.test.yml new file mode 100644 index 00000000..00167d08 --- /dev/null +++ b/test/tests/providers/capi/capa/platform/shield/alerting-rules/cert-manager.rules.test.yml @@ -0,0 +1,46 @@ +--- +rule_files: + - cert-manager.rules.yml + +tests: + - interval: 1m + input_series: + - series: 'up{cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="golem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="capa", service_priority="highest"}' + values: "0+0x60" + alert_rule_test: + - alertname: CertManagerDown + eval_time: 15m + exp_alerts: + - exp_labels: + alertname: CertManagerDown + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_kubelet_down: "true" + cancel_if_outside_working_hours: "true" + cluster_id: 12345 + cluster_type: workload_cluster + container: cert-manager + customer: giantswarm + instance: 10.0.0.0:1234 + ip: 10.0.0.0 + job: 12345-prometheus/workload-12345/0 + namespace: kube-system + organization: giantswarm + pod: cert-manager-controller-7fcc585578-gnprd + provider: capa + installation: golem + service_priority: highest + severity: page + team: shield + topic: cert-manager + exp_annotations: + description: "cert-manager in namespace kube-system is down." 
+ opsrecipe: "cert-manager-down/" + - interval: 1m + input_series: + - series: 'up{cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="golem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="capa", service_priority="highest"}' + values: "1+0x60" + alert_rule_test: + - alertname: CertManagerDown + eval_time: 15m diff --git a/test/tests/providers/capi/capa/platform/shield/alerting-rules/teleport.rules.test.yml b/test/tests/providers/capi/capa/platform/shield/alerting-rules/teleport.rules.test.yml new file mode 100644 index 00000000..2ab1f7c2 --- /dev/null +++ b/test/tests/providers/capi/capa/platform/shield/alerting-rules/teleport.rules.test.yml @@ -0,0 +1,59 @@ +--- +rule_files: + - 'teleport.rules.yml' + +tests: + - interval: 1m + input_series: + - series: 'kube_secret_created{cluster_id="my-cluster", installation="golem", secret="my-cluster-teleport-join-token"}' + values: "1+0x150" + - series: 'capi_cluster_status_phase{cluster_id="my-cluster", installation="golem", phase="Provisioned"}' + values: "2+0x150" + alert_rule_test: + - alertname: TeleportJoinTokenSecretMismatch + eval_time: 30m + exp_alerts: [] + - alertname: TeleportJoinTokenSecretMismatch + eval_time: 140m + exp_alerts: + - exp_labels: + alertname: TeleportJoinTokenSecretMismatch + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + cluster_id: my-cluster + installation: golem + severity: notify + team: shield + topic: teleport + exp_annotations: + description: "Mismatch in number of teleport-join-token secrets and clusters" + - interval: 1m + input_series: + - series: 'kube_configmap_info{cluster_id="my-cluster", installation="grizzly", configmap="my-cluster-teleport-kube-agent-config"}' + values: "1+0x150" + - series: 'capi_cluster_status_phase{cluster_id="my-cluster", installation="grizzly", phase="Provisioned"}' + values: "2+0x150" + alert_rule_test: + - alertname: TeleportKubeAgentConfigMapMismatch + eval_time: 30m + exp_alerts: [] + - alertname: TeleportKubeAgentConfigMapMismatch + eval_time: 140m + exp_alerts: + - exp_labels: + alertname: TeleportKubeAgentConfigMapMismatch + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_outside_working_hours: "true" + cluster_id: my-cluster + installation: grizzly + severity: notify + team: shield + topic: teleport + exp_annotations: + description: "Mismatch in number of teleport-kube-agent-config secrets and clusters" diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/grafana-cloud.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/grafana-cloud.test.yml new file mode 100644 index 00000000..79c5aa0f --- /dev/null +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/grafana-cloud.test.yml @@ -0,0 +1,156 @@ +--- +rule_files: +- grafana-cloud.rules.yml + +tests: + # Tests for `MimirToGrafanaCloudExporterDown` alert + - interval: 1m + input_series: + - series: 'up{job="mimir/mimir-to-grafana-cloud", cluster_id="myinstall", cluster_type="management_cluster", installation="myinstall", namespace="mimir", customer="giantswarm", pipeline="stable", provider="capa", region="eu-west-2"}' + values: 
"_x60 1+0x60 0+0x60 1+0x60" + alert_rule_test: + - alertname: MimirToGrafanaCloudExporterDown + eval_time: 50m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: myinstall + cluster_type: management_cluster + installation: myinstall + job: mimir/mimir-to-grafana-cloud + pipeline: stable + provider: capa + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "iWowmlSmk/prometheus?var-cluster=mimir-to-grafana-cloud" + description: "Prometheus Mimir to Grafana-Cloud is down." + opsrecipe: "mimir-grafana-cloud-exporter-failing/" + - alertname: MimirToGrafanaCloudExporterDown + eval_time: 70m + - alertname: MimirToGrafanaCloudExporterDown + eval_time: 160m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: myinstall + cluster_type: management_cluster + customer: giantswarm + installation: myinstall + job: mimir/mimir-to-grafana-cloud + namespace: mimir + pipeline: stable + provider: capa + region: eu-west-2 + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: "iWowmlSmk/prometheus?var-cluster=mimir-to-grafana-cloud" + description: "Prometheus Mimir to Grafana-Cloud is down." + opsrecipe: "mimir-grafana-cloud-exporter-failing/" + - alertname: MimirToGrafanaCloudExporterDown + eval_time: 200m + # Tests for `MimirToGrafanaCloudExporterFailures` alert + - interval: 1m + input_series: + # remote read is working for 2 hours and then fails for 1 hour + - series: 'prometheus_remote_storage_read_queries_total{code="200", job="mimir/mimir-to-grafana-cloud", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x60 0+10x60 0+0x60 0+10x180" + # remote write has no failure for 4 hours and then fails for 2 hours + - series: 'prometheus_remote_storage_samples_failed_total{job="mimir/mimir-to-grafana-cloud", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "_x60 0+0x180 0+10x120" + alert_rule_test: + - alertname: MimirToGrafanaCloudExporterFailures + eval_time: 70m + - alertname: MimirToGrafanaCloudExporterFailures + eval_time: 160m + exp_alerts: + - exp_labels: + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: "myinstall" + installation: "myinstall" + pipeline: "testing" + provider: "capa" + exp_annotations: + dashboard: "promRW001/prometheus-remote-write" + description: "Prometheus Mimir to Grafana-Cloud is failing to read or write data." 
+              opsrecipe: "mimir-grafana-cloud-exporter-failing/"
+      - alertname: MimirToGrafanaCloudExporterFailures
+        eval_time: 200m
+      - alertname: MimirToGrafanaCloudExporterFailures
+        eval_time: 280m
+        exp_alerts:
+          - exp_labels:
+              area: platform
+              severity: page
+              team: atlas
+              topic: observability
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "true"
+              cluster_id: "myinstall"
+              installation: "myinstall"
+              pipeline: "testing"
+              provider: "capa"
+            exp_annotations:
+              dashboard: "promRW001/prometheus-remote-write"
+              description: "Prometheus Mimir to Grafana-Cloud is failing to read or write data."
+              opsrecipe: "mimir-grafana-cloud-exporter-failing/"
+  # Tests for `MimirToGrafanaCloudExporterTooManyRestarts` alert
+  - interval: 1m
+    input_series:
+      # one pod uid is ready for an hour, then the pod is recreated every 2 minutes (new uids) before settling, simulating restarts
+      - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea5", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "_x60 1+0x60 _x80"
+      - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea6", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "_x122 1+0x2 _x78"
+      - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea7", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "_x124 1+0x2 _x76"
+      - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea8", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "_x126 1+0x2 _x74"
+      - series: 'kube_pod_status_ready{condition="true", uid="0bb4e0cc-12df-4085-8d39-8e08b9c64ea9", pod="prometheus-mimir-to-grafana-cloud-0", cluster_id="myinstall", customer="giantswarm", installation="myinstall", namespace="mimir", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "_x128 1+0x72"
+    alert_rule_test:
+      - alertname: MimirToGrafanaCloudExporterTooManyRestarts
+        eval_time: 70m
+      - alertname: MimirToGrafanaCloudExporterTooManyRestarts
+        eval_time: 140m
+        exp_alerts:
+          - exp_labels:
+              area: platform
+              severity: page
+              team: atlas
+              topic: observability
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "true"
+              pod: "prometheus-mimir-to-grafana-cloud-0"
+              cluster_id: "myinstall"
+              installation: "myinstall"
+              pipeline: "testing"
+              provider: "capa"
+            exp_annotations:
+              dashboard: "promRW001/prometheus-remote-write"
+              description: "Prometheus Mimir to Grafana-Cloud is restarting too much."
+ opsrecipe: "mimir-grafana-cloud-exporter-failing/" + - alertname: MimirToGrafanaCloudExporterTooManyRestarts + eval_time: 180m diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml new file mode 100644 index 00000000..6bdfeaea --- /dev/null +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -0,0 +1,392 @@ +--- +rule_files: + - mimir.rules.yml + +tests: + - interval: 1m + input_series: + # For the first 60min: test with 1 pod: up, none, up, down, up + - series: 'up{job="mimir/ingester", container="ingester"}' + values: "1+0x60 _x30 1+0x30 0+0x30 1+0x30" + alert_rule_test: + - alertname: Heartbeat + eval_time: 20m + exp_alerts: + - exp_labels: + area: platform + job: mimir/ingester + container: ingester + installation: myinstall + team: atlas + topic: observability + type: mimir-heartbeat + exp_annotations: + description: "This alert is used to ensure the entire alerting pipeline is functional." + opsrecipe: "mimir/" + - alertname: Heartbeat + eval_time: 70m + - alertname: Heartbeat + eval_time: 95m + exp_alerts: + - exp_labels: + area: platform + job: mimir/ingester + container: ingester + installation: myinstall + team: atlas + topic: observability + type: mimir-heartbeat + exp_annotations: + description: "This alert is used to ensure the entire alerting pipeline is functional." + opsrecipe: "mimir/" + - alertname: Heartbeat + eval_time: 140m + - alertname: Heartbeat + eval_time: 165m + exp_alerts: + - exp_labels: + area: platform + job: mimir/ingester + container: ingester + installation: myinstall + team: atlas + topic: observability + type: mimir-heartbeat + exp_annotations: + description: "This alert is used to ensure the entire alerting pipeline is functional." + opsrecipe: "mimir/" + - interval: 1m + input_series: + # For the first 60min: test with 1 pod: none, up, down + - series: 'up{job="mimir/ingester", container="ingester", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="aws", pipeline="testing", service="mimir-ingester"}' + values: "_x20 1+0x20 0+0x20" + alert_rule_test: + - alertname: MimirComponentDown + eval_time: 10m + - alertname: MimirComponentDown + eval_time: 30m + - alertname: MimirComponentDown + eval_time: 50m + exp_alerts: + - exp_labels: + service: mimir-ingester + area: platform + severity: page + team: atlas + topic: observability + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: gauss + installation: gauss + provider: aws + pipeline: testing + exp_annotations: + dashboard: ffcd83628d7d4b5a03d1cafd159e6c9c/mimir-overview + description: "Mimir component : mimir-ingester is down." 
+ opsrecipe: "mimir/" + - interval: 1m + input_series: + # test: none, rate > 0, rate = 0 + - series: 'mimir_rules_events_failed_total{cluster_type="management_cluster", cluster_id="golem", installation="golem", namespace="mimir"}' + values: "_x20 1+1x80 0+0x70" + alert_rule_test: + - alertname: MimirRulerEventsFailed + eval_time: 40m + - alertname: MimirRulerEventsFailed + eval_time: 95m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + cluster_type: management_cluster + installation: golem + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: 631e15d5d85afb2ca8e35d62984eeaa0/mimir-ruler + description: "Mimir ruler is failing to process PrometheusRules." + opsrecipe: "mimir/" + - alertname: MimirRulerEventsFailed + eval_time: 160m + - interval: 1m + input_series: + - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="mimir-ingester"}' + values: "0+0x20 0+5x20 100+0x140" # 0 restarts after 20 minutes then we restart 5 times per minute for 20 minutes then we stop restarting for 140 minutes + - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="prometheus"}' + values: "0+5x180" # prometheus container restarts 5 times per minute for 180 minutes + alert_rule_test: + - alertname: MimirRestartingTooOften + eval_time: 15m # should be OK after 15 minutes + - alertname: MimirRestartingTooOften + eval_time: 85m # After 85 minutes, should fire an alert for the t+85 error + exp_alerts: + - exp_labels: + all_pipelines: "true" + area: platform + cancel_if_outside_working_hours: "true" + cluster_type: management_cluster + container: mimir-ingester + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: ffcd83628d7d4b5a03d1cafd159e6c9c/mimir-overview + description: Mimir containers are restarting too often. + opsrecipe: "mimir/" + - alertname: MimirRestartingTooOften + eval_time: 140m # After 140m minutes, all should be back to normal + # Test for MimirIngesterNeedsToBeScaledUp alert + - interval: 1m + input_series: + # mimir-ingester real memory usage gradually increases until it goes beyond 90% of the memory requests. + - series: 'container_memory_working_set_bytes{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "8+0x20 11+0x70 8+0x140 11+0x70 8+0x60" + - series: 'container_memory_working_set_bytes{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "8+0x20 11+0x70 8+0x140 11+0x70 8+0x60" + # mimir-ingester memory requests stay the same for the entire duration of the test. 
+ - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "12+0x400" + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "12+0x400" + # mimir-ingester real cpu usage gradually increases until it goes beyond 90% of the cpu requests. + - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "0+60x100 6000+110x70 10400+60x60 14000+110x70 18400+60x60" + - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "0+60x400" + # mimir-ingester cpu requests stay the same for the entire duration of the test. + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "1.5+0x400" + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "1.5+0x400" + alert_rule_test: + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 15m + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 85m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + description: Mimir ingester is consuming too much resources and needs to be scaled up. + opsrecipe: "mimir-ingester/" + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 130m + - alertname: MimirIngesterNeedsToBeScaledUp + eval_time: 170m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + description: Mimir ingester is consuming too much resources and needs to be scaled up. 
+              opsrecipe: "mimir-ingester/"
+      - alertname: MimirIngesterNeedsToBeScaledUp
+        eval_time: 210m
+      - alertname: MimirIngesterNeedsToBeScaledUp
+        eval_time: 295m
+        exp_alerts:
+          - exp_labels:
+              area: platform
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "true"
+              cluster_id: golem
+              installation: "golem"
+              pipeline: "testing"
+              provider: "capa"
+              namespace: mimir
+              severity: page
+              team: atlas
+              topic: observability
+            exp_annotations:
+              description: Mimir ingester is consuming too much resources and needs to be scaled up.
+              opsrecipe: "mimir-ingester/"
+      - alertname: MimirIngesterNeedsToBeScaledUp
+        eval_time: 350m
+  # Test for MimirIngesterNeedsToBeScaledDown alert
+  - interval: 1m
+    input_series:
+      # mimir-ingester real memory usage gradually decreases until it goes below 30% of the memory requests.
+      - series: 'container_memory_working_set_bytes{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "8+0x20 2+0x40 8+0x140 2+0x40 8+0x60"
+      - series: 'container_memory_working_set_bytes{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "8+0x20 2+0x40 8+0x140 2+0x40 8+0x60"
+      # mimir-ingester memory requests stay the same for the entire duration of the test.
+      - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "12+0x300"
+      - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}'
+        values: "12+0x300"
+      # mimir-ingester real cpu usage gradually decreases until it goes below 30% of the cpu requests.
+ - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "0+60x100 6000+10x40 6400+60x60 10000+10x40 10400+60x60" + - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "0+30x300" + # mimir-ingester cpu requests stay the same for the entire duration of the test + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "1.5+0x300" + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + values: "1.5+0x300" + alert_rule_test: + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 15m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 55m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 100m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 135m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 180m + - alertname: MimirIngesterNeedsToBeScaledDown + eval_time: 240m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + description: Mimir ingester is consuming very few resources and needs to be scaled down. 
+              opsrecipe: "mimir-ingester/"
+      - alertname: MimirIngesterNeedsToBeScaledDown
+        eval_time: 280m
+  # Test for MimirHPAReachedMaxReplicas alert
+  - interval: 1m
+    input_series:
+      # HPA max replicas = 3 for the whole test
+      # HPA target metric = 90% for the whole test
+      # Cases:
+      # desired_replicas < max_replicas AND current_utilization < target_utilization does not fire
+      # desired_replicas < max_replicas AND current_utilization = target_utilization does not fire
+      # desired_replicas < max_replicas AND current_utilization > target_utilization does not fire
+      # desired_replicas = max_replicas AND current_utilization < target_utilization does not fire
+      # desired_replicas = max_replicas AND current_utilization = target_utilization does not fire
+      # desired_replicas = max_replicas AND current_utilization > target_utilization does fire
+      # desired_replicas > max_replicas AND current_utilization < target_utilization does not fire
+      # desired_replicas > max_replicas AND current_utilization = target_utilization does not fire
+      # desired_replicas > max_replicas AND current_utilization > target_utilization does fire
+      - series: 'kube_horizontalpodautoscaler_spec_max_replicas{horizontalpodautoscaler="mimir-distributor", namespace="mimir"}'
+        values: '3+0x360'
+      - series: 'kube_horizontalpodautoscaler_status_desired_replicas{horizontalpodautoscaler="mimir-distributor", namespace="mimir"}'
+        values: '2+0x120 3+0x120 4+0x120'
+      - series: 'kube_horizontalpodautoscaler_spec_target_metric{horizontalpodautoscaler="mimir-distributor", namespace="mimir", metric_name="cpu", metric_target_type="utilization"}'
+        values: '90+0x360'
+      # HPA current utilization cycles through 80%, 90% and 100%, each value held for 40 minutes
+      - series: 'kube_horizontalpodautoscaler_status_target_metric{horizontalpodautoscaler="mimir-distributor", namespace="mimir", metric_name="cpu", metric_target_type="utilization"}'
+        values: '80+0x40 90+0x40 100+0x40 80+0x40 90+0x40 100+0x40 80+0x40 90+0x40 100+0x40'
+    alert_rule_test:
+      - alertname: MimirHPAReachedMaxReplicas
+        eval_time: 234m
+      - alertname: MimirHPAReachedMaxReplicas
+        eval_time: 235m
+        exp_alerts:
+          - exp_labels:
+              area: platform
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "true"
+              severity: page
+              team: atlas
+              topic: observability
+              horizontalpodautoscaler: mimir-distributor
+              namespace: mimir
+            exp_annotations:
+              description: "Mimir mimir-distributor HPA has reached maximum replicas and consume too much resources, it needs to be scaled up."
+              opsrecipe: "mimir-hpa/"
+      - alertname: MimirHPAReachedMaxReplicas
+        eval_time: 246m
+      - alertname: MimirHPAReachedMaxReplicas
+        eval_time: 360m
+        exp_alerts:
+          - exp_labels:
+              area: platform
+              cancel_if_cluster_status_creating: "true"
+              cancel_if_cluster_status_deleting: "true"
+              cancel_if_cluster_status_updating: "true"
+              cancel_if_outside_working_hours: "true"
+              severity: page
+              team: atlas
+              topic: observability
+              horizontalpodautoscaler: mimir-distributor
+              namespace: mimir
+            exp_annotations:
+              description: "Mimir mimir-distributor HPA has reached maximum replicas and consume too much resources, it needs to be scaled up."
+ opsrecipe: "mimir-hpa/" + # Test for MimirCompactorFailedCompaction alert + - interval: 1m + input_series: + - series: 'cortex_compactor_runs_failed_total{reason="error", installation="golem", cluster_id="golem", namespace="mimir", pipeline="testing", provider="capa"}' + values: "8+0x20 1+0x40 0+0x20 4+0x130 0+0x190" + alert_rule_test: + - alertname: MimirCompactorFailedCompaction + eval_time: 15m + - alertname: MimirCompactorFailedCompaction + eval_time: 55m + - alertname: MimirCompactorFailedCompaction + eval_time: 120m + exp_alerts: + - exp_labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + cluster_id: golem + installation: "golem" + pipeline: "testing" + provider: "capa" + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: 09a5c49e9cdb2f2b24c6d184574a07fd/mimir-compactor-resources + description: Mimir compactor has been failing its compactions for 2 hours. + opsrecipe: "mimir#mimircompactorfailedcompaction" + - alertname: MimirCompactorFailedCompaction + eval_time: 205m + - alertname: MimirCompactorFailedCompaction + eval_time: 350m diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml index e8ec8134..bd05e856 100644 --- a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/prometheus-agent.rules.test.yml @@ -4,98 +4,6 @@ rule_files: - prometheus-agent.rules.yml tests: - # Tests for `PrometheusAgentFailing` alert - - interval: 1m - input_series: - - series: 'up{instance="prometheus-agent",cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", job="prometheus-agent"}' - values: "_x60 0+0x60 1+0x60" - - series: 'capi_cluster_status_condition{ cluster_id="gauss", cluster_type="workload_cluster", installation="myinstall", customer="giantswarm", pipeline="testing", provider="capa", region="eu-west-2", status="True", type="ControlPlaneReady", name="gauss"}' - values: "1+0x180" - alert_rule_test: - - alertname: PrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - instance: prometheus-agent - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: InhibitionPrometheusAgentFailing - eval_time: 30m - exp_alerts: - - exp_labels: - area: platform - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - instance: prometheus-agent - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." 
- opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: PrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: page - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - instance: prometheus-agent - cancel_if_cluster_has_no_workers: "true" - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: InhibitionPrometheusAgentFailing - eval_time: 90m - exp_alerts: - - exp_labels: - area: platform - cluster_id: gauss - cluster_type: workload_cluster - severity: none - team: atlas - topic: observability - inhibit_monitoring_agent_down: "true" - installation: myinstall - instance: prometheus-agent - cancel_if_cluster_is_not_running_monitoring_agent: "true" - cancel_if_cluster_status_creating: "true" - cancel_if_cluster_status_deleting: "true" - exp_annotations: - dashboard: "promRW001/prometheus-remote-write" - description: "Prometheus agent remote write is failing." - opsrecipe: "prometheus-agent/" - summary: "Prometheus agent fails to send samples to remote write endpoint." - - alertname: PrometheusAgentFailing - eval_time: 150m - - alertname: InhibitionPrometheusAgentFailing - eval_time: 150m # Tests for `PrometheusAgentShardsMissing` alert - interval: 1m input_series: