From 893f0b806a43133118967b890a8db3a64c0830a9 Mon Sep 17 00:00:00 2001
From: Quentin Bisson
Date: Thu, 28 Mar 2024 17:00:19 +0100
Subject: [PATCH 1/2] =?UTF-8?q?Alert=20when=20mimir=20components=20are=20r?=
 =?UTF-8?q?estarting=20too=20often=20across=20all=20pipe=E2=80=A6=20(#109?=
 =?UTF-8?q?3)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Alert when mimir components are restarting too often across all pipelines to avoid high storage cost

* Ignore the prometheus-buddy

* Improve tests
---
 CHANGELOG.md                                  |  4 +++
 .../templates/alerting-rules/mimir.rules.yml  | 24 +++++++++++---
 .../providers/global/mimir.rules.test.yml     | 32 ++++++++++++++++---
 3 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3cf111ede..203098375 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- Add missing alert about mimir containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).
+
 ## [3.5.0] - 2024-03-27
 
 ### Changed
diff --git a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
index 9d0c4f8b0..9b1575dd1 100644
--- a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
@@ -9,6 +9,26 @@ spec:
   groups:
   - name: mimir
     rules:
+    # Coming from https://github.com/giantswarm/giantswarm/issues/30124
+    # This alert ensures Mimir containers are not restarting too often (flappiness).
+    # If that is not the case, it can incur high costs from cloud providers (S3 API calls are quite expensive).
+    # This alert will not page for the prometheus-buddy.
+    - alert: MimirRestartingTooOften
+      annotations:
+        description: '{{`Mimir containers are restarting too often.`}}'
+      expr: |
+        increase(
+          kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container!="prometheus"}[1h]
+        ) > 5
+      for: 5m
+      labels:
+        area: managedservices
+        # This label is used to ensure the alert goes through even for non-stable installations
+        all_pipelines: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: atlas
+        topic: observability
     - alert: MimirComponentDown
       annotations:
         description: '{{`Mimir component : {{ $labels.service }} is down.`}}'
@@ -16,11 +36,9 @@
       for: 5m
       labels:
         area: managedservices
-        cancel_if_apiserver_down: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
-        cancel_if_scrape_timeout: "true"
         cancel_if_outside_working_hours: "true"
         severity: page
         team: atlas
@@ -32,7 +50,6 @@
       for: 1h
       labels:
         area: managedservices
-        cancel_if_apiserver_down: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
@@ -47,7 +64,6 @@
       for: 1h
       labels:
         area: managedservices
-        cancel_if_apiserver_down: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
diff --git a/test/tests/providers/global/mimir.rules.test.yml b/test/tests/providers/global/mimir.rules.test.yml
index f46a2b21d..a5799b3a8 100644
--- a/test/tests/providers/global/mimir.rules.test.yml
+++ b/test/tests/providers/global/mimir.rules.test.yml
@@ -22,11 +22,9 @@ tests:
               severity: page
               team: atlas
               topic: observability
-              cancel_if_apiserver_down: "true"
               cancel_if_cluster_status_creating: "true"
               cancel_if_cluster_status_deleting: "true"
               cancel_if_cluster_status_updating: "true"
-              cancel_if_scrape_timeout: "true"
               cancel_if_outside_working_hours: "true"
               cluster_id: gauss
             exp_annotations:
@@ -46,7 +44,6 @@
         exp_alerts:
           - exp_labels:
               area: managedservices
-              cancel_if_apiserver_down: "true"
               cancel_if_outside_working_hours: "true"
               cancel_if_cluster_status_creating: "true"
               cancel_if_cluster_status_deleting: "true"
@@ -70,7 +67,6 @@
         exp_alerts:
           - exp_labels:
               area: managedservices
-              cancel_if_apiserver_down: "true"
               cancel_if_outside_working_hours: "true"
               cancel_if_cluster_status_creating: "true"
               cancel_if_cluster_status_deleting: "true"
@@ -86,3 +82,31 @@
               description: "Mimir ruler is failing to process PrometheusRules."
       - alertname: MimirRulerEventsFailed
         eval_time: 160m
+  - interval: 1m
+    input_series:
+      - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="mimir-ingester"}'
+        values: "0+0x20 0+5x20 100+0x140" # no restarts for the first 20 minutes, then 5 restarts per minute for 20 minutes, then no further restarts for 140 minutes
+      - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="prometheus"}'
+        values: "0+5x180" # the prometheus container restarts 5 times per minute for 180 minutes; it is excluded from the alert
+    alert_rule_test:
+      - alertname: MimirRestartingTooOften
+        eval_time: 15m # no alert expected after 15 minutes, there have been no restarts yet
+        exp_alerts:
+      - alertname: MimirRestartingTooOften
+        eval_time: 85m # after 85 minutes the alert should fire: the restart burst is still within the 1h lookback window
+        exp_alerts:
+          - exp_labels:
+              all_pipelines: true
+              area: managedservices
+              cancel_if_outside_working_hours: "true"
+              cluster_type: management_cluster
+              container: mimir-ingester
+              namespace: mimir
+              severity: page
+              team: atlas
+              topic: observability
+            exp_annotations:
+              description: Mimir containers are restarting too often.
+      - alertname: MimirRestartingTooOften
+        eval_time: 140m # after 140 minutes everything should be back to normal, no restarts in the last hour
+        exp_alerts:

From 7c98e67e2aa7f909431253ef1cfcde3af27883ea Mon Sep 17 00:00:00 2001
From: Quentin Bisson
Date: Thu, 28 Mar 2024 17:02:29 +0100
Subject: [PATCH 2/2] Add loki not running alert to avoid high cloud costs (#1090)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add loki not running alert to avoid high cloud costs

* Use pod container restarts instead of statefulset available

* Update test/tests/providers/global/loki.rules.test.yml

Co-authored-by: Hervé Nicol

* Update CHANGELOG.md

* Update test/tests/providers/global/loki.rules.test.yml

---------

Co-authored-by: Hervé Nicol
---
 CHANGELOG.md                                   |  4 ++-
 README.md                                      |  4 +++
 .../{loki.all.rules.yml => loki.rules.yml}     | 22 ++++++++++++++-
 ...all.rules.test.yml => loki.rules.test.yml}  | 28 ++++++++++++++++++-
 4 files changed, 55 insertions(+), 3 deletions(-)
 rename helm/prometheus-rules/templates/alerting-rules/{loki.all.rules.yml => loki.rules.yml} (74%)
 rename test/tests/providers/global/{loki.all.rules.test.yml => loki.rules.test.yml} (83%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 203098375..d62f33b5a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,8 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-### Changed
+### Added
+
+- Add missing alert about loki containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).
 
 - Add missing alert about mimir containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).
 
 ## [3.5.0] - 2024-03-27
diff --git a/README.md b/README.md
index a016b43e1..d485a7718 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,10 @@ Any Alert includes:
 
 - `cancel_if_.*`
 
+### Specific alert labels
+
+- `all_pipelines: true`: When adding this label to an alert, you ensure the alert is sent to Opsgenie, even if the installation is not a stable installation.
+
 #### Routing
 
 Alertmanager does the routing based on the labels mentioned above.
diff --git a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/loki.rules.yml
similarity index 74%
rename from helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml
rename to helm/prometheus-rules/templates/alerting-rules/loki.rules.yml
index 9eb724803..9f87870bf 100644
--- a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/loki.rules.yml
@@ -3,12 +3,32 @@ kind: PrometheusRule
 metadata:
   labels:
     {{- include "labels.common" . | nindent 4 }}
-  name: grafana.all.rules
+  name: loki.rules
   namespace: {{ .Values.namespace }}
 spec:
   groups:
   - name: loki
     rules:
+    # Coming from https://github.com/giantswarm/giantswarm/issues/30124
+    # This alert ensures Loki containers are not restarting too often (flappiness).
+    # If that is not the case, it can incur high costs from cloud providers (S3 API calls are quite expensive).
+    - alert: LokiRestartingTooOften
+      annotations:
+        description: '{{`Loki containers are restarting too often.`}}'
+        opsrecipe: loki/
+      expr: |
+        increase(
+          kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="loki"}[1h]
+        ) > 5
+      for: 5m
+      labels:
+        area: managedservices
+        # This label is used to ensure the alert goes through even for non-stable installations
+        all_pipelines: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: atlas
+        topic: observability
     # Rules inspired from loki-mixins - https://github.com/grafana/loki/blob/main/production/loki-mixin-compiled/alerts.yaml
     - alert: LokiRequestErrors
       annotations:
diff --git a/test/tests/providers/global/loki.all.rules.test.yml b/test/tests/providers/global/loki.rules.test.yml
similarity index 83%
rename from test/tests/providers/global/loki.all.rules.test.yml
rename to test/tests/providers/global/loki.rules.test.yml
index 03bb95fe6..d3cfab8b9 100644
--- a/test/tests/providers/global/loki.all.rules.test.yml
+++ b/test/tests/providers/global/loki.rules.test.yml
@@ -1,6 +1,6 @@
 ---
 rule_files:
-  - loki.all.rules.yml
+  - loki.rules.yml
 
 tests:
   - interval: 1m
@@ -98,3 +98,29 @@
             exp_annotations:
               description: "Loki pod loki-compactor-676b8c897b-rq298 (namespace loki) sees 1 unhealthy ring members"
              opsrecipe: "loki/"
+  - interval: 1m
+    input_series:
+      - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="loki"}'
+        values: "0+0x20 0+5x20 100+0x140" # no restarts for the first 20 minutes, then 5 restarts per minute for 20 minutes, then no further restarts for 140 minutes
+    alert_rule_test:
+      - alertname: LokiRestartingTooOften
+        eval_time: 15m # no alert expected after 15 minutes, there have been no restarts yet
+        exp_alerts:
+      - alertname: LokiRestartingTooOften
+        eval_time: 85m # after 85 minutes the alert should fire: the restart burst is still within the 1h lookback window
+        exp_alerts:
+          - exp_labels:
+              all_pipelines: true
+              area: managedservices
+              cancel_if_outside_working_hours: "true"
+              cluster_type: management_cluster
+              namespace: loki
+              severity: page
+              team: atlas
+              topic: observability
+            exp_annotations:
+              description: Loki containers are restarting too often.
+              opsrecipe: loki/
+      - alertname: LokiRestartingTooOften
+        eval_time: 140m # after 140 minutes everything should be back to normal, no restarts in the last hour
+        exp_alerts:
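Both new alerts are covered by the promtool unit tests added above. As a rough sketch of how they can be exercised locally (assuming the Helm-templated rule files have already been rendered to plain YAML next to the test files; in practice the repository's own Makefile or CI target should be preferred over calling promtool by hand):

    promtool test rules test/tests/providers/global/mimir.rules.test.yml
    promtool test rules test/tests/providers/global/loki.rules.test.yml

promtool replays the input_series samples, evaluates the alerting rules at each eval_time, and fails when the fired alerts do not match exp_alerts, which is exactly what the 15m/85m/140m cases above assert.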