diff --git a/CHANGELOG.md b/CHANGELOG.md
index 187a4a3c5..1e76dde01 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
 - Add Heartbeat alert for mimir.
+- Add missing alert about Loki containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).
+- Add missing alert about Mimir containers not running to ensure we do not suffer from [extra cloud cost](https://github.com/giantswarm/giantswarm/issues/30124).
 
 ## [3.5.0] - 2024-03-27
diff --git a/README.md b/README.md
index a016b43e1..d485a7718 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,10 @@ Any Alert includes:
 
 - `cancel_if_.*`
 
+### Specific alert labels
+
+- `all_pipelines: true`: Adding this label to an alert makes sure the alert is sent to Opsgenie even if the installation is not a stable installation.
+
 #### Routing
 
 Alertmanager does the routing based on the labels mentioned above.
diff --git a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml b/helm/prometheus-rules/templates/alerting-rules/loki.rules.yml
similarity index 74%
rename from helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml
rename to helm/prometheus-rules/templates/alerting-rules/loki.rules.yml
index 9eb724803..9f87870bf 100644
--- a/helm/prometheus-rules/templates/alerting-rules/loki.all.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/loki.rules.yml
@@ -3,12 +3,32 @@ kind: PrometheusRule
 metadata:
   labels:
     {{- include "labels.common" . | nindent 4 }}
-  name: grafana.all.rules
+  name: loki.rules
   namespace: {{ .Values.namespace }}
 spec:
   groups:
   - name: loki
     rules:
+    # Coming from https://github.com/giantswarm/giantswarm/issues/30124
+    # This alert ensures Loki containers are not restarting too often (flappiness).
+    # If that is not the case, it can incur high costs from cloud providers (S3 API calls are quite expensive).
+    - alert: LokiRestartingTooOften
+      annotations:
+        description: '{{`Loki containers are restarting too often.`}}'
+        opsrecipe: loki/
+      expr: |
+        increase(
+          kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="loki"}[1h]
+        ) > 5
+      for: 5m
+      labels:
+        area: managedservices
+        # This label is used to ensure the alert goes through even for non-stable installations
+        all_pipelines: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: atlas
+        topic: observability
     # Rules inspired from loki-mixins - https://github.com/grafana/loki/blob/main/production/loki-mixin-compiled/alerts.yaml
     - alert: LokiRequestErrors
       annotations:
diff --git a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
index f19b9ca05..281e7479f 100644
--- a/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
+++ b/helm/prometheus-rules/templates/alerting-rules/mimir.rules.yml
@@ -24,6 +24,26 @@ spec:
         type: "heartbeat" # TODO(@team-atlas): remove once we use mimir alertmanager
         namespace: "monitoring" # Needed due to https://github.com/prometheus-operator/prometheus-operator/issues/3737
 
+    # Coming from https://github.com/giantswarm/giantswarm/issues/30124
+    # This alert ensures Mimir containers are not restarting too often (flappiness).
+    # If that is not the case, it can incur high costs from cloud providers (S3 API calls are quite expensive).
+    # This alert will not page for the prometheus-buddy.
+    - alert: MimirRestartingTooOften
+      annotations:
+        description: '{{`Mimir containers are restarting too often.`}}'
+      expr: |
+        increase(
+          kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container!="prometheus"}[1h]
+        ) > 5
+      for: 5m
+      labels:
+        area: managedservices
+        # This label is used to ensure the alert goes through even for non-stable installations
+        all_pipelines: "true"
+        cancel_if_outside_working_hours: "true"
+        severity: page
+        team: atlas
+        topic: observability
     - alert: MimirComponentDown
       annotations:
         description: '{{`Mimir component : {{ $labels.service }} is down.`}}'
@@ -31,11 +51,9 @@ spec:
       for: 5m
       labels:
         area: managedservices
-        cancel_if_apiserver_down: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
-        cancel_if_scrape_timeout: "true"
         cancel_if_outside_working_hours: "true"
         severity: page
         team: atlas
@@ -47,7 +65,6 @@ spec:
       for: 1h
       labels:
         area: managedservices
-        cancel_if_apiserver_down: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
@@ -62,7 +79,6 @@ spec:
       for: 1h
       labels:
         area: managedservices
-        cancel_if_apiserver_down: "true"
         cancel_if_cluster_status_creating: "true"
         cancel_if_cluster_status_deleting: "true"
         cancel_if_cluster_status_updating: "true"
diff --git a/test/tests/providers/global/loki.all.rules.test.yml b/test/tests/providers/global/loki.rules.test.yml
similarity index 83%
rename from test/tests/providers/global/loki.all.rules.test.yml
rename to test/tests/providers/global/loki.rules.test.yml
index 03bb95fe6..d3cfab8b9 100644
--- a/test/tests/providers/global/loki.all.rules.test.yml
+++ b/test/tests/providers/global/loki.rules.test.yml
@@ -1,6 +1,6 @@
 ---
 rule_files:
-  - loki.all.rules.yml
+  - loki.rules.yml
 
 tests:
   - interval: 1m
@@ -98,3 +98,29 @@
             exp_annotations:
               description: "Loki pod loki-compactor-676b8c897b-rq298 (namespace loki) sees 1 unhealthy ring members"
               opsrecipe: "loki/"
+  - interval: 1m
+    input_series:
+      - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="loki"}'
+        values: "0+0x20 0+5x20 100+0x140" # 0 restarts for the first 20 minutes, then 5 restarts per minute for 20 minutes, then no further restarts for 140 minutes
+    alert_rule_test:
+      - alertname: LokiRestartingTooOften
+        eval_time: 15m # no alert expected at 15 minutes
+        exp_alerts:
+      - alertname: LokiRestartingTooOften
+        eval_time: 85m # at 85 minutes, the alert should be firing
+        exp_alerts:
+          - exp_labels:
+              all_pipelines: "true"
+              area: managedservices
+              cancel_if_outside_working_hours: "true"
+              cluster_type: management_cluster
+              namespace: loki
+              severity: page
+              team: atlas
+              topic: observability
+            exp_annotations:
+              description: Loki containers are restarting too often.
+              opsrecipe: loki/
+      - alertname: LokiRestartingTooOften
+        eval_time: 140m # at 140 minutes, everything should be back to normal
+        exp_alerts:
diff --git a/test/tests/providers/global/mimir.rules.test.yml b/test/tests/providers/global/mimir.rules.test.yml
index ce66772fc..d67e708c4 100644
--- a/test/tests/providers/global/mimir.rules.test.yml
+++ b/test/tests/providers/global/mimir.rules.test.yml
@@ -71,11 +71,9 @@ tests:
               severity: page
               team: atlas
               topic: observability
-              cancel_if_apiserver_down: "true"
               cancel_if_cluster_status_creating: "true"
               cancel_if_cluster_status_deleting: "true"
               cancel_if_cluster_status_updating: "true"
-              cancel_if_scrape_timeout: "true"
               cancel_if_outside_working_hours: "true"
               cluster_id: gauss
             exp_annotations:
@@ -95,7 +93,6 @@
         exp_alerts:
           - exp_labels:
               area: managedservices
-              cancel_if_apiserver_down: "true"
               cancel_if_outside_working_hours: "true"
               cancel_if_cluster_status_creating: "true"
               cancel_if_cluster_status_deleting: "true"
@@ -119,7 +116,6 @@
         exp_alerts:
           - exp_labels:
               area: managedservices
-              cancel_if_apiserver_down: "true"
               cancel_if_outside_working_hours: "true"
               cancel_if_cluster_status_creating: "true"
               cancel_if_cluster_status_deleting: "true"
@@ -135,3 +131,31 @@
               description: "Mimir ruler is failing to process PrometheusRules."
       - alertname: MimirRulerEventsFailed
         eval_time: 160m
+  - interval: 1m
+    input_series:
+      - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="mimir-ingester"}'
+        values: "0+0x20 0+5x20 100+0x140" # 0 restarts for the first 20 minutes, then 5 restarts per minute for 20 minutes, then no further restarts for 140 minutes
+      - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="mimir", container="prometheus"}'
+        values: "0+5x180" # prometheus container restarts 5 times per minute for 180 minutes
+    alert_rule_test:
+      - alertname: MimirRestartingTooOften
+        eval_time: 15m # no alert expected at 15 minutes
+        exp_alerts:
+      - alertname: MimirRestartingTooOften
+        eval_time: 85m # at 85 minutes, the alert should be firing
+        exp_alerts:
+          - exp_labels:
+              all_pipelines: "true"
+              area: managedservices
+              cancel_if_outside_working_hours: "true"
+              cluster_type: management_cluster
+              container: mimir-ingester
+              namespace: mimir
+              severity: page
+              team: atlas
+              topic: observability
+            exp_annotations:
+              description: Mimir containers are restarting too often.
+      - alertname: MimirRestartingTooOften
+        eval_time: 140m # at 140 minutes, everything should be back to normal
+        exp_alerts:
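
The unit tests above use promtool's rule-test format (rule_files, input_series, alert_rule_test). A minimal sketch of how they could be run locally, assuming the Helm templates have already been rendered into plain rule files placed next to the test files; that rendering step and the exact paths are assumptions about the repository's test harness, not something this diff defines:

    # Hypothetical invocation; promtool cannot read the un-rendered Helm templates directly.
    promtool test rules test/tests/providers/global/loki.rules.test.yml
    promtool test rules test/tests/providers/global/mimir.rules.test.yml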