From ea2f7c1403733dff5925227f4ba80983c503a84e Mon Sep 17 00:00:00 2001
From: Quentin Bisson <quentin@giantswarm.io>
Date: Tue, 2 Apr 2024 11:42:12 +0200
Subject: [PATCH] Add heartbeat alert for mimir (#1094)

* Add heartbeat alert for mimir

* Add mimir.enabled flag
---
 CHANGELOG.md                                  |  1 +
 .../alerting-rules/inhibit.all.rules.yml      |  4 +-
 .../templates/alerting-rules/mimir.rules.yml  | 14 ++++++
 .../recording-rules/grafana-cloud.rules.yml   |  2 +-
 .../recording-rules/mimir-mixins.rules.yml    |  2 +
 test/hack/bin/check-opsrecipes.sh             |  2 +-
 .../providers/global/mimir.rules.test.yml     | 49 +++++++++++++++++++
 7 files changed, 70 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 92e66a423..2af5ec125 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Add Heartbeat alert for mimir.
 - Add missing alert about loki containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).
 - Add missing alert about mimir containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).
 - Add recording rule for ingresses using the baseDomain.
diff --git a/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml
index 318ae2c44..6d26e61e8 100644
--- a/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/inhibit.all.rules.yml
@@ -28,7 +28,7 @@ spec:
         topic: kubernetes
       annotations:
         description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}'
-    # TODO: fix with real expr 
+    # TODO(@team-turtles): fix with real expr
     - alert: ScrapeTimeout
       annotations:
         description: '{{`Never fires (dummy alert).`}}'
@@ -38,7 +38,7 @@ spec:
         scrape_timeout: "true"
         team: phoenix
         topic: monitoring
-    # TODO: fix with real expr
+    # TODO(@team-turtles): fix with real expr
     - alert: ApiServerDown
       annotations:
         description: '{{`Never fires (dummy alert).`}}'
diff --git a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
index 9b1575dd1..15d5b7198 100644
--- a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
@@ -1,3 +1,4 @@
+{{- if .Values.mimir.enabled }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -9,6 +10,18 @@ spec:
   groups:
   - name: mimir
     rules:
+    # Heartbeat: fires for every `up{app="mimir"}` series above 0, to ensure the entire alerting pipeline is functional.
+    - alert: "Heartbeat"
+      annotations:
+        description: This alert is used to ensure the entire alerting pipeline is functional.
+      expr: up{app="mimir"} > 0
+      labels:
+        area: "empowerment"
+        installation: "{{ .Values.managementCluster.name }}"
+        # TODO(@team-atlas): We need this label as long as we have the old and new heartbeats. Let's remove it once the legacy monitoring is gone.
+        type: "mimir-heartbeat"
+        team: "atlas"
+        topic: "observability"
     # Coming from https://github.com/giantswarm/giantswarm/issues/30124
     # This alert ensures Mimir containers are not restarting too often (flappiness).
     # If it is not the the case, this can incur high costs by cloud providers (s3 api calls are quite expensive).
@@ -71,3 +84,4 @@ spec:
         severity: page
         team: atlas
         topic: observability
+{{- end }}
diff --git a/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml b/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml
index 0d521c096..a192c2ad7 100644
--- a/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml
+++ b/helm/prometheus-rules/templates/recording-rules/grafana-cloud.rules.yml
@@ -383,7 +383,7 @@ spec:
     # Dex operator metrics for expiry time of identity provider oauth app secrets 
     - expr: dex_operator_idp_secret_expiry_time
       record: aggregation:dex_operator_idp_secret_expiry_time
-    # Requests to the deprecated k8s authenticator. TODO: Get rid of this recording rule when the component is no longer used.
+    # Requests to the deprecated k8s authenticator. TODO(@team-bigmac): Get rid of this recording rule when the component is no longer used.
     - expr: nginx_ingress_controller_requests{ingress="dex-k8s-authenticator"}
       record: aggregation:dex_k8s_authenticator_requests
   - name: grafana.grafana-cloud.recording
diff --git a/helm/prometheus-rules/templates/recording-rules/mimir-mixins.rules.yml b/helm/prometheus-rules/templates/recording-rules/mimir-mixins.rules.yml
index e616271dc..b84038a46 100644
--- a/helm/prometheus-rules/templates/recording-rules/mimir-mixins.rules.yml
+++ b/helm/prometheus-rules/templates/recording-rules/mimir-mixins.rules.yml
@@ -1,3 +1,4 @@
+{{- if .Values.mimir.enabled }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
@@ -577,3 +578,4 @@ spec:
     - expr: |
         sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))
       record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m
+{{- end }}
diff --git a/test/hack/bin/check-opsrecipes.sh b/test/hack/bin/check-opsrecipes.sh
index 1b72d1de7..af7384688 100755
--- a/test/hack/bin/check-opsrecipes.sh
+++ b/test/hack/bin/check-opsrecipes.sh
@@ -2,7 +2,7 @@
 set -euo pipefail
 
 # List of generated rules
-RULES_FILES=(./test/hack/output/*/prometheus-rules/templates/alerting-rules/*)
+RULES_FILES=(./test/hack/output/*/*/prometheus-rules/templates/alerting-rules/*)
 #RULES_FILES=(./test/hack/output/*/prometheus-rules/templates/alerting-rules/up*)
 
 DEBUG_MODE=false
diff --git a/test/tests/providers/global/mimir.rules.test.yml b/test/tests/providers/global/mimir.rules.test.yml
index a5799b3a8..d67e708c4 100644
--- a/test/tests/providers/global/mimir.rules.test.yml
+++ b/test/tests/providers/global/mimir.rules.test.yml
@@ -3,6 +3,55 @@ rule_files:
   - mimir.rules.yml
 
 tests:
+  - interval: 1m
+    input_series:
+      # Single pod timeline: up (0-60m), absent (61-90m), up (91-121m), down (122-152m), up (153-183m).
+      - series: 'up{app="mimir", namespace="monitoring"}'
+        values: "1+0x60 _x30 1+0x30 0+0x30 1+0x30"
+    alert_rule_test:
+      - alertname: Heartbeat
+        eval_time: 20m  # target is up: the heartbeat fires
+        exp_alerts:
+          - exp_labels:
+              app: mimir
+              area: empowerment
+              installation: myinstall
+              namespace: monitoring
+              team: atlas
+              topic: observability
+              type: mimir-heartbeat  # must match the `type` label set by the rule
+            exp_annotations:
+              description: "This alert is used to ensure the entire alerting pipeline is functional."
+      - alertname: Heartbeat
+        eval_time: 70m  # series is absent: no alert expected
+      - alertname: Heartbeat
+        eval_time: 95m  # target is back up: the heartbeat fires again
+        exp_alerts:
+          - exp_labels:
+              app: mimir
+              area: empowerment
+              installation: myinstall
+              namespace: monitoring
+              team: atlas
+              topic: observability
+              type: mimir-heartbeat
+            exp_annotations:
+              description: "This alert is used to ensure the entire alerting pipeline is functional."
+      - alertname: Heartbeat
+        eval_time: 140m  # target reports 0 (down): no alert expected
+      - alertname: Heartbeat
+        eval_time: 165m  # target is up again: the heartbeat fires
+        exp_alerts:
+          - exp_labels:
+              app: mimir
+              area: empowerment
+              installation: myinstall
+              namespace: monitoring
+              team: atlas
+              topic: observability
+              type: mimir-heartbeat
+            exp_annotations:
+              description: "This alert is used to ensure the entire alerting pipeline is functional."
   - interval: 1m
     input_series:
       # For the first 60min: test with 1 pod: none, up, down