From 893f0b806a43133118967b890a8db3a64c0830a9 Mon Sep 17 00:00:00 2001
From: Quentin Bisson
Date: Thu, 28 Mar 2024 17:00:19 +0100
Subject: [PATCH 1/2] =?UTF-8?q?Alert=20when=20mimir=20components=20are=20r?=
 =?UTF-8?q?estarting=20too=20often=20across=20all=20pipe=E2=80=A6=20(#109?=
 =?UTF-8?q?3)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Alert when mimir components are restarting too often across all pipelines to avoid high storage cost

* Ignore the prometheus-buddy

* Improve tests
---
 CHANGELOG.md                                  |  4 +++
 .../templates/alerting-rules/mimir.rules.yml  | 24 +++++++++++---
 .../providers/global/mimir.rules.test.yml     | 32 ++++++++++++++++---
 3 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3cf111ede..203098375 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- Add missing alert about mimir containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).
+
 ## [3.5.0] - 2024-03-27
 
 ### Changed
diff --git a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
index 9d0c4f8b0..9b1575dd1 100644
--- a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
@@ -9,6 +9,26 @@ spec:
   groups:
   - name: mimir
     rules:
+    # Coming from https://github.com/giantswarm/giantswarm/issues/30124
+    # This alert ensures Mimir containers are not restarting too often (flappiness).
+    # If that is not the case, it can incur high costs from cloud providers (S3 API calls are quite expensive).
+    # This alert will not page for the prometheus-buddy.
+    - alert: MimirRestartingTooOften
+      annotations:
+        description: '{{`Mimir containers are restarting too often.`}}'
+      expr: |
+        increase(
+          kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container!="prometheus"}[1h]
+        ) > 5
+      for: 5m
+      labels:
+        area: managedservices
+        # This label is used to ensure the alert goes through even for non-stable installations
+        all_pipelines: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: atlas
+        topic: observability
     - alert: MimirComponentDown
       annotations:
         description: '{{`Mimir component : {{ $labels.service }} is down.`}}'
@@ -16,11 +36,9 @@
       for: 5m
       labels:
         area: managedservices
-        cancel_if_apiserver_down: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
-        cancel_if_scrape_timeout: "true"
         cancel_if_outside_working_hours: "true"
         severity: page
         team: atlas
@@ -32,7 +50,6 @@
       for: 1h
       labels:
         area: managedservices
-        cancel_if_apiserver_down: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
@@ -47,7 +64,6 @@
       for: 1h
       labels:
         area: managedservices
-        cancel_if_apiserver_down: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
diff --git a/test/tests/providers/global/mimir.rules.test.yml b/test/tests/providers/global/mimir.rules.test.yml
index f46a2b21d..a5799b3a8 100644
--- a/test/tests/providers/global/mimir.rules.test.yml
+++ b/test/tests/providers/global/mimir.rules.test.yml
@@ -22,11 +22,9 @@ tests:
               severity: page
               team: atlas
               topic: observability
-              cancel_if_apiserver_down: "true"
               cancel_if_cluster_status_creating: "true"
               cancel_if_cluster_status_deleting: "true"
               cancel_if_cluster_status_updating: "true"
-              cancel_if_scrape_timeout: "true"
               cancel_if_outside_working_hours: "true"
               cluster_id: gauss
             exp_annotations:
@@ -46,7 +44,6 @@
         exp_alerts:
           - exp_labels:
               area: managedservices
-              cancel_if_apiserver_down: "true"
               cancel_if_outside_working_hours: "true"
               cancel_if_cluster_status_creating: "true"
               cancel_if_cluster_status_deleting: "true"
@@ -70,7 +67,6 @@
         exp_alerts:
           - exp_labels:
               area: managedservices
-              cancel_if_apiserver_down: "true"
               cancel_if_outside_working_hours: "true"
               cancel_if_cluster_status_creating: "true"
               cancel_if_cluster_status_deleting: "true"
@@ -86,3 +82,31 @@
               description: "Mimir ruler is failing to process PrometheusRules."
       - alertname: MimirRulerEventsFailed
         eval_time: 160m
+  - interval: 1m
+    input_series:
+      - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="mimir-ingester"}'
+        values: "0+0x20 0+5x20 100+0x140" # no restarts for the first 20 minutes, then 5 restarts per minute for 20 minutes, then no further restarts for 140 minutes
+      - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="prometheus"}'
+        values: "0+5x180" # the prometheus container restarts 5 times per minute for 180 minutes; it is excluded from the alert
+    alert_rule_test:
+      - alertname: MimirRestartingTooOften
+        eval_time: 15m # no alert expected after 15 minutes, there have been no restarts yet
+        exp_alerts:
+      - alertname: MimirRestartingTooOften
+        eval_time: 85m # after 85 minutes the alert should fire: the restart burst is still within the 1h lookback window
+        exp_alerts:
+          - exp_labels:
+              all_pipelines: true
+              area: managedservices
+              cancel_if_outside_working_hours: "true"
+              cluster_type: management_cluster
+              container: mimir-ingester
+              namespace: mimir
+              severity: page
+              team: atlas
+              topic: observability
+            exp_annotations:
+              description: Mimir containers are restarting too often.
+      - alertname: MimirRestartingTooOften
+        eval_time: 140m # after 140 minutes everything should be back to normal, no restarts in the last hour
+        exp_alerts:

From 7c98e67e2aa7f909431253ef1cfcde3af27883ea Mon Sep 17 00:00:00 2001
From: Quentin Bisson
Date: Thu, 28 Mar 2024 17:02:29 +0100
Subject: [PATCH 2/2] Add loki not running alert to avoid high cloud costs (#1090)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add loki not running alert to avoid high cloud costs

* Use pod container restarts instead of statefulset available

* Update test/tests/providers/global/loki.rules.test.yml

Co-authored-by: Hervé Nicol

* Update CHANGELOG.md

* Update test/tests/providers/global/loki.rules.test.yml

---------

Co-authored-by: Hervé Nicol
---
 CHANGELOG.md                                   |  4 ++-
 README.md                                      |  4 +++
 .../{loki.all.rules.yml => loki.rules.yml}     | 22 ++++++++++++++-
 ...all.rules.test.yml => loki.rules.test.yml}  | 28 ++++++++++++++++++-
 4 files changed, 55 insertions(+), 3 deletions(-)
 rename helm/prometheus-rules/templates/alerting-rules/{loki.all.rules.yml => loki.rules.yml} (74%)
 rename test/tests/providers/global/{loki.all.rules.test.yml => loki.rules.test.yml} (83%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 203098375..d62f33b5a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,8 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-### Changed
+### Added
+
+- Add missing alert about loki containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).
 
 - Add missing alert about mimir containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).
 
 ## [3.5.0] - 2024-03-27
diff --git a/README.md b/README.md
index a016b43e1..d485a7718 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,10 @@ Any Alert includes:
 
 - `cancel_if_.*`
 
+### Specific alert labels
+
+- `all_pipelines: true`: When adding this label to an alert, you ensure the alert is sent to Opsgenie, even if the installation is not a stable installation.
+
 #### Routing
 
 Alertmanager does the routing based on the labels mentioned above.
diff --git a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/loki.rules.yml
similarity index 74%
rename from helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml
rename to helm/prometheus-rules/templates/alerting-rules/loki.rules.yml
index 9eb724803..9f87870bf 100644
--- a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/loki.rules.yml
@@ -3,12 +3,32 @@ kind: PrometheusRule
 metadata:
   labels:
     {{- include "labels.common" . | nindent 4 }}
-  name: grafana.all.rules
+  name: loki.rules
   namespace: {{ .Values.namespace }}
 spec:
   groups:
   - name: loki
     rules:
+    # Coming from https://github.com/giantswarm/giantswarm/issues/30124
+    # This alert ensures Loki containers are not restarting too often (flappiness).
+    # If that is not the case, it can incur high costs from cloud providers (S3 API calls are quite expensive).
+    - alert: LokiRestartingTooOften
+      annotations:
+        description: '{{`Loki containers are restarting too often.`}}'
+        opsrecipe: loki/
+      expr: |
+        increase(
+          kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="loki"}[1h]
+        ) > 5
+      for: 5m
+      labels:
+        area: managedservices
+        # This label is used to ensure the alert goes through even for non-stable installations
+        all_pipelines: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: atlas
+        topic: observability
     # Rules inspired from loki-mixins - https://github.com/grafana/loki/blob/main/production/loki-mixin-compiled/alerts.yaml
     - alert: LokiRequestErrors
       annotations:
diff --git a/test/tests/providers/global/loki.all.rules.test.yml b/test/tests/providers/global/loki.rules.test.yml
similarity index 83%
rename from test/tests/providers/global/loki.all.rules.test.yml
rename to test/tests/providers/global/loki.rules.test.yml
index 03bb95fe6..d3cfab8b9 100644
--- a/test/tests/providers/global/loki.all.rules.test.yml
+++ b/test/tests/providers/global/loki.rules.test.yml
@@ -1,6 +1,6 @@
 ---
 rule_files:
-  - loki.all.rules.yml
+  - loki.rules.yml
 
 tests:
   - interval: 1m
@@ -98,3 +98,29 @@
             exp_annotations:
               description: "Loki pod loki-compactor-676b8c897b-rq298 (namespace loki) sees 1 unhealthy ring members"
              opsrecipe: "loki/"
+  - interval: 1m
+    input_series:
+      - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="loki"}'
+        values: "0+0x20 0+5x20 100+0x140" # no restarts for the first 20 minutes, then 5 restarts per minute for 20 minutes, then no further restarts for 140 minutes
+    alert_rule_test:
+      - alertname: LokiRestartingTooOften
+        eval_time: 15m # no alert expected after 15 minutes, there have been no restarts yet
+        exp_alerts:
+      - alertname: LokiRestartingTooOften
+        eval_time: 85m # after 85 minutes the alert should fire: the restart burst is still within the 1h lookback window
+        exp_alerts:
+          - exp_labels:
+              all_pipelines: true
+              area: managedservices
+              cancel_if_outside_working_hours: "true"
+              cluster_type: management_cluster
+              namespace: loki
+              severity: page
+              team: atlas
+              topic: observability
+            exp_annotations:
+              description: Loki containers are restarting too often.
+              opsrecipe: loki/
+      - alertname: LokiRestartingTooOften
+        eval_time: 140m # after 140 minutes everything should be back to normal, no restarts in the last hour
+        exp_alerts:
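Both new alerts are covered by the promtool unit tests added above. As a rough sketch of how they can be exercised locally (assuming the Helm-templated rule files have already been rendered to plain YAML next to the test files; in practice the repository's own Makefile or CI target should be preferred over calling promtool by hand):

    promtool test rules test/tests/providers/global/mimir.rules.test.yml
    promtool test rules test/tests/providers/global/loki.rules.test.yml

promtool replays the input_series samples, evaluates the alerting rules at each eval_time, and fails when the fired alerts do not match exp_alerts, which is exactly what the 15m/85m/140m cases above assert.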