Merge branch 'master' into add-mimir-heartbeart-alert
QuentinBisson authored Mar 28, 2024
2 parents 897e3a6 + 7c98e67 commit 31301ab
Showing 6 changed files with 102 additions and 10 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- Add heartbeat alert for Mimir.
- Add missing alert about loki containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).
- Add missing alert about mimir containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).

## [3.5.0] - 2024-03-27

4 changes: 4 additions & 0 deletions README.md
@@ -60,6 +60,10 @@ Any Alert includes:
- `cancel_if_.*`


### Specific alert labels

- `all_pipelines: true`: adding this label to an alert ensures it is sent to Opsgenie even if the installation is not a stable installation.

#### Routing

Alertmanager does the routing based on the labels mentioned above.
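For illustration only, here is a minimal sketch of an Alertmanager route that would honor the `all_pipelines` label described above; the receiver names and the `pipeline` matcher are assumptions, not this repository's actual configuration:

```yaml
# Hypothetical routing sketch: alerts carrying all_pipelines="true" reach Opsgenie
# even on non-stable installations; everything else only pages on stable pipelines.
route:
  receiver: blackhole            # assumed default receiver for non-stable installations
  routes:
    - matchers:
        - all_pipelines="true"   # label set on the alert rule itself
      receiver: opsgenie         # assumed Opsgenie receiver name
    - matchers:
        - pipeline="stable"      # hypothetical label identifying stable installations
      receiver: opsgenie
```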
@@ -3,12 +3,32 @@ kind: PrometheusRule
metadata:
labels:
{{- include "labels.common" . | nindent 4 }}
name: grafana.all.rules
name: loki.rules
namespace: {{ .Values.namespace }}
spec:
groups:
- name: loki
rules:
# Coming from https://github.com/giantswarm/giantswarm/issues/30124
# This alert ensures Loki containers are not restarting too often (flappiness).
# If that is not the case, it can incur high costs from cloud providers (S3 API calls are quite expensive).
- alert: LokiRestartingTooOften
annotations:
description: '{{`Loki containers are restarting too often.`}}'
opsrecipe: loki/
expr: |
increase(
kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="loki"}[1h]
) > 5
for: 5m
labels:
area: managedservices
# This label is used to ensure the alert goes through even for non-stable installations
all_pipelines: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
# Rules inspired from loki-mixins - https://github.com/grafana/loki/blob/main/production/loki-mixin-compiled/alerts.yaml
- alert: LokiRequestErrors
annotations:
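Purely as an illustration (not part of this change), the same metric can be queried ad hoc to see which Loki workloads would currently trip the threshold used by `LokiRestartingTooOften`; the selectors are copied from the rule above:

```promql
# Loki containers with more than 5 restarts over the last hour, grouped by pod and container
sum by (pod, container) (
  increase(kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="loki"}[1h])
) > 5
```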
24 changes: 20 additions & 4 deletions helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
@@ -24,18 +24,36 @@ spec:
type: "heartbeat"
# TODO(@team-atlas): remove once we use mimir alertmanager
namespace: "monitoring" # Needed due to https://github.com/prometheus-operator/prometheus-operator/issues/3737
# Coming from https://github.com/giantswarm/giantswarm/issues/30124
# This alert ensures Mimir containers are not restarting too often (flappiness).
# If that is not the case, it can incur high costs from cloud providers (S3 API calls are quite expensive).
# This alert will not page for the prometheus-buddy.
- alert: MimirRestartingTooOften
annotations:
description: '{{`Mimir containers are restarting too often.`}}'
expr: |
increase(
kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container!="prometheus"}[1h]
) > 5
for: 5m
labels:
area: managedservices
# This label is used to ensure the alert goes through even for non-stable installations
all_pipelines: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- alert: MimirComponentDown
annotations:
description: '{{`Mimir component : {{ $labels.service }} is down.`}}'
expr: count(up{app="mimir"} == 0) by (cluster_id, service) > 0
for: 5m
labels:
area: managedservices
cancel_if_apiserver_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
@@ -47,7 +65,6 @@ spec:
for: 1h
labels:
area: managedservices
cancel_if_apiserver_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -62,7 +79,6 @@ spec:
for: 1h
labels:
area: managedservices
cancel_if_apiserver_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
@@ -1,6 +1,6 @@
---
rule_files:
- loki.all.rules.yml
- loki.rules.yml

tests:
- interval: 1m
@@ -98,3 +98,29 @@ tests:
exp_annotations:
description: "Loki pod loki-compactor-676b8c897b-rq298 (namespace loki) sees 1 unhealthy ring members"
opsrecipe: "loki/"
- interval: 1m
input_series:
- series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="loki"}'
values: "0+0x20 0+5x20 100+0x140" # 0 restarts after 20 minutes then we restart 5 times per minute for 20 minutes then we stop restarting for 140 minutes
alert_rule_test:
- alertname: LokiRestartingTooOften
eval_time: 15m # should be OK after 15 minutes
exp_alerts:
- alertname: LokiRestartingTooOften
eval_time: 85m # After 85 minutes, the alert should be firing
exp_alerts:
- exp_labels:
all_pipelines: true
area: managedservices
cancel_if_outside_working_hours: "true"
cluster_type: management_cluster
namespace: loki
severity: page
team: atlas
topic: observability
exp_annotations:
description: Loki containers are restarting too often.
opsrecipe: loki/
- alertname: LokiRestartingTooOften
eval_time: 140m # After 140 minutes, everything should be back to normal
exp_alerts:
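A note for readers unfamiliar with promtool's expanding notation (these unit tests are run with `promtool test rules <test-file>`): a value of the form `a+bxn` expands to n+1 samples starting at `a` and growing by `b` per step, so the input series above works out roughly as follows:

```yaml
# "0+0x20"    -> 0, 0, ..., 0         no restarts for the first 20 minutes
# "0+5x20"    -> 0, 5, 10, ..., 100   5 new restarts per minute for 20 minutes
# "100+0x140" -> 100, 100, ..., 100   no further restarts for 140 minutes
```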
32 changes: 28 additions & 4 deletions test/tests/providers/global/mimir.rules.test.yml
@@ -71,11 +71,9 @@ tests:
severity: page
team: atlas
topic: observability
cancel_if_apiserver_down: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_scrape_timeout: "true"
cancel_if_outside_working_hours: "true"
cluster_id: gauss
exp_annotations:
@@ -95,7 +93,6 @@ tests:
exp_alerts:
- exp_labels:
area: managedservices
cancel_if_apiserver_down: "true"
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -119,7 +116,6 @@ tests:
exp_alerts:
- exp_labels:
area: managedservices
cancel_if_apiserver_down: "true"
cancel_if_outside_working_hours: "true"
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
@@ -135,3 +131,31 @@ tests:
description: "Mimir ruler is failing to process PrometheusRules."
- alertname: MimirRulerEventsFailed
eval_time: 160m
- interval: 1m
input_series:
- series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="mimir-ingester"}'
values: "0+0x20 0+5x20 100+0x140" # 0 restarts after 20 minutes then we restart 5 times per minute for 20 minutes then we stop restarting for 140 minutes
- series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="prometheus"}'
values: "0+5x180" # prometheus container restarts 5 times per minute for 180 minutes
alert_rule_test:
- alertname: MimirRestartingTooOften
eval_time: 15m # should be OK after 15 minutes
exp_alerts:
- alertname: MimirRestartingTooOften
eval_time: 85m # After 85 minutes, the alert should be firing
exp_alerts:
- exp_labels:
all_pipelines: true
area: managedservices
cancel_if_outside_working_hours: "true"
cluster_type: management_cluster
container: mimir-ingester
namespace: mimir
severity: page
team: atlas
topic: observability
exp_annotations:
description: Mimir containers are restarting too often.
- alertname: MimirRestartingTooOften
eval_time: 140m # After 140 minutes, everything should be back to normal
exp_alerts:
